diff --git a/CMakeLists.txt b/CMakeLists.txt index c0ec99c7..d8c37bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,7 +105,7 @@ file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c") list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") # Add also all the strategies -file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c") +file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") # ToDo: do something with encode_coding_tree-avx2, currently not converted to VVC list(REMOVE_ITEM LIB_SOURCES_STRATEGIES "src/strategies/avx2/encode_coding_tree-avx2.c") @@ -340,6 +340,9 @@ if(NOT DEFINED MSVC) if(NOT "test_external_symbols" IN_LIST XFAIL) add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) endif() + if(NOT "test_mtt" IN_LIST XFAIL) + add_test( NAME test_mtt COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mtt.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() if(NOT "test_intra" IN_LIST XFAIL) add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) endif() diff --git a/src/cabac.h b/src/cabac.h index be249ba2..f38030a9 100644 --- a/src/cabac.h +++ b/src/cabac.h @@ -77,6 +77,8 @@ typedef struct cabac_ctx_t mts_idx_model[4]; cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models + cabac_ctx_t mtt_vertical_model[5]; //!< \brief mtt split direction context models + cabac_ctx_t mtt_binary_model[4]; //!< \brief mtt binary/ternary split context models cabac_ctx_t intra_luma_mpm_flag_model; //!< \brief intra mode context models cabac_ctx_t intra_subpart_model[2]; //!< \brief intra sub part context models cabac_ctx_t chroma_pred_model; diff --git a/src/cfg.c b/src/cfg.c index cafadcb2..bf9e1307 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -80,7 +80,6 @@ int uvg_config_init(uvg_config *cfg) cfg->trskip_max_size = 2; //Default to 4x4 cfg->mts = 0; cfg->mts_implicit = 0; - cfg->tr_depth_intra = 0; cfg->ime_algorithm = 0; /* hexbs */ cfg->fme_level = 4; cfg->source_scan_type = 0; /* progressive */ @@ -207,6 +206,8 @@ int uvg_config_init(uvg_config *cfg) cfg->lfnst = false; + cfg->isp = false; + parse_qp_map(cfg, 0); cfg->jccr = 0; @@ -221,10 +222,27 @@ int uvg_config_init(uvg_config *cfg) cfg->cabac_debug_file_name = NULL; cfg->dual_tree = 0; + + cfg->min_qt_size[0] = 4; + cfg->min_qt_size[1] = 4; + cfg->min_qt_size[2] = 4; + + cfg->max_btt_depth[0] = 0; + cfg->max_btt_depth[1] = 0; + cfg->max_btt_depth[2] = 0; + + cfg->max_tt_size[0] = 64; + cfg->max_bt_size[0] = 64; + cfg->max_tt_size[1] = 64; + cfg->max_bt_size[1] = 64; + cfg->max_tt_size[2] = 64; + cfg->max_bt_size[2] = 64; + cfg->intra_rough_search_levels = 2; cfg->ibc = 0; + cfg->dep_quant = 0; return 1; } @@ -333,7 +351,7 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil return 1; } -/* + static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) { char *tail; @@ -349,7 +367,7 @@ static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) return 1; } } -*/ + static int parse_int8(const char *numstr,int8_t* number,int min, int max) { char *tail; @@ -365,7 +383,7 @@ static int parse_int8(const char *numstr,int8_t* number,int min, int max) return 1; } } -/* + static int parse_array(const char *array, uint8_t *coeff_key, int size, int min, int max) { @@ -389,15 +407,15 @@
static int parse_array(const char *array, uint8_t *coeff_key, int size, free(key); return 0; } - else if (imts = mts_type; cfg->mts_implicit = (mts_type == UVG_MTS_IMPLICIT); } - else if OPT("tr-depth-intra") - cfg->tr_depth_intra = atoi(value); else if OPT("me") { int8_t ime_algorithm = 0; if (!parse_enum(value, me_names, &ime_algorithm)) return 0; @@ -1454,6 +1470,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("lfnst") { cfg->lfnst = atobool(value); } + else if OPT("isp") { + cfg->isp = atobool(value); + } else if OPT("jccr") { cfg->jccr = (bool)atobool(value); } @@ -1479,6 +1498,49 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("dual-tree") { cfg->dual_tree = atobool(value); } + else if OPT("mtt-depth-intra") { + cfg->max_btt_depth[0] = atoi(value); + } + else if OPT("mtt-depth-intra-chroma") { + cfg->max_btt_depth[2] = atoi(value); + } + else if OPT("mtt-depth-inter") { + cfg->max_btt_depth[1] = atoi(value); + } + else if OPT("max-bt-size") { + uint8_t sizes[3]; + const int got = parse_array(value, sizes, 3, 0, 128); + if (got == 1) { + cfg->max_bt_size[0] = sizes[0]; + cfg->max_bt_size[1] = sizes[0]; + cfg->max_bt_size[2] = sizes[0]; + } + else if (got == 3) { + cfg->max_bt_size[0] = sizes[0]; + cfg->max_bt_size[1] = sizes[1]; + cfg->max_bt_size[2] = sizes[2]; + } else { + fprintf(stderr, "Incorrect amount of values provided for max-bt-size\n"); + return 0; + } + } + else if OPT("max-tt-size") { + uint8_t sizes[3]; + const int got = parse_array(value, sizes, 3, 0, 128); + if (got == 1) { + cfg->max_tt_size[0] = sizes[0]; + cfg->max_tt_size[1] = sizes[0]; + cfg->max_tt_size[2] = sizes[0]; + } + else if (got == 3) { + cfg->max_tt_size[0] = sizes[0]; + cfg->max_tt_size[1] = sizes[1]; + cfg->max_tt_size[2] = sizes[2]; + } else { + fprintf(stderr, "Incorrect amount of values provided for max-tt-size\n"); + return 0; + } + } else if OPT("intra-rough-granularity") { cfg->intra_rough_search_levels = atoi(value); } @@ -1489,7 +1551,11 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) return 0; } cfg->ibc = (uint8_t)ibc_value; - } else { + } + else if OPT("dep-quant") { + cfg->dep_quant = (bool)atobool(value); + } + else { return 0; } #undef OPT @@ -1681,12 +1747,6 @@ int uvg_config_validate(const uvg_config *const cfg) error = 1; } - if (cfg->tr_depth_intra < 0 || cfg->tr_depth_intra > 4) { - // range is 0 .. 
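A note on the convention behind the new three-element config arrays (min_qt_size, max_btt_depth, max_bt_size, max_tt_size): index 0 holds the limit for intra slices, index 1 for inter slices, and index 2 for the chroma tree of intra slices when dual tree is in use; the same indexing reappears in uvg_get_possible_splits in src/cu.c. A usage sketch for the one-or-three-value options parsed above (binary name and unrelated flags are illustrative only):

// Shared cap for every slice type:
//   uvg266 -i in.yuv --input-res 1920x1080 --max-bt-size 64 -o out.vvc
// Separate caps for intra / inter / intra-chroma:
//   uvg266 -i in.yuv --input-res 1920x1080 --max-bt-size 64,64,32 --mtt-depth-intra 2 -o out.vvc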
CtbLog2SizeY - Log2MinTrafoSize - fprintf(stderr, "Input error: --tr-depth-intra is out of range [0..4]\n"); - error = 1; - } - if (cfg->fme_level != 0 && cfg->fme_level > 4) { fprintf(stderr, "Input error: invalid --subme parameter (must be in range 0-4)\n"); error = 1; diff --git a/src/cli.c b/src/cli.c index fa6ee6df..6e66f77e 100644 --- a/src/cli.c +++ b/src/cli.c @@ -76,7 +76,6 @@ static const struct option long_options[] = { { "tr-skip-max-size", required_argument, NULL, 0 }, { "mts", required_argument, NULL, 0 }, { "no-mts", no_argument, NULL, 0 }, - { "tr-depth-intra", required_argument, NULL, 0 }, { "me", required_argument, NULL, 0 }, { "subme", required_argument, NULL, 0 }, { "source-scan-type", required_argument, NULL, 0 }, @@ -178,6 +177,8 @@ static const struct option long_options[] = { { "no-mip", no_argument, NULL, 0 }, { "lfnst", no_argument, NULL, 0 }, { "no-lfnst", no_argument, NULL, 0 }, + { "isp", no_argument, NULL, 0 }, + { "no-isp", no_argument, NULL, 0 }, { "jccr", no_argument, NULL, 0 }, { "no-jccr", no_argument, NULL, 0 }, { "amvr", no_argument, NULL, 0 }, @@ -191,8 +192,15 @@ static const struct option long_options[] = { { "dual-tree", no_argument, NULL, 0 }, { "no-dual-tree", no_argument, NULL, 0 }, { "cabac-debug-file", required_argument, NULL, 0 }, + { "mtt-depth-intra", required_argument, NULL, 0 }, + { "mtt-depth-inter", required_argument, NULL, 0 }, + { "mtt-depth-intra-chroma", required_argument, NULL, 0 }, + { "max-bt-size", required_argument, NULL, 0 }, + { "max-tt-size", required_argument, NULL, 0 }, { "intra-rough-granularity",required_argument, NULL, 0 }, { "ibc", required_argument, NULL, 0 }, + { "dep-quant", no_argument, NULL, 0 }, + { "no-dep-quant", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -571,6 +579,7 @@ void print_help(void) " - full: Full ALF\n" " --(no-)rdoq : Rate-distortion optimized quantization [enabled]\n" " --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled]\n" + " --(no-)dep-quant : Use dependent quantization. [disabled]\n" " --(no-)signhide : Sign hiding [disabled]\n" " --rd : Intra mode search complexity [0]\n" " - 0: Skip intra if inter is good enough.\n" @@ -602,14 +611,14 @@ void print_help(void) " - 2: + 1/2-pixel diagonal\n" " - 3: + 1/4-pixel horizontal and vertical\n" " - 4: + 1/4-pixel diagonal\n" - " --pu-depth-inter - : Inter prediction units sizes [0-3]\n" - " - 0, 1, 2, 3: from 64x64 to 8x8\n" + " --pu-depth-inter - : Maximum and minimum split depths where\n" + " inter search is performed 0..8. [0-3]\n" " - Accepts a list of values separated by ','\n" " for setting separate depths per GOP layer\n" " (values can be omitted to use the first\n" " value for the respective layer).\n" - " --pu-depth-intra - : Intra prediction units sizes [1-4]\n" - " - 0, 1, 2, 3, 4: from 64x64 to 4x4\n" + " --pu-depth-intra - : Maximum and minimum split depths where\n" + " intra search is performed 0..8. [1-4]\n" " - Accepts a list of values separated by ','\n" " for setting separate depths per GOP layer\n" " (values can be omitted to use the first\n" @@ -617,6 +626,22 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. 
[disabled]\n" + " --mtt-depth-intra : Depth of MTT for intra slices 0..3. [0]\n" + " --mtt-depth-intra-chroma : Depth of MTT for the chroma dual tree in\n" + " intra slices 0..3. [0]\n" + " --mtt-depth-inter : Depth of MTT for inter slices 0..3. [0]\n" + " All MTT modes are currently experimental and\n" + " require disabling some avx2 optimizations.\n" + " --max-bt-size : Maximum size for a CU resulting from\n" + " a BT split. A single value shared by all,\n" + " or a list of three values for the different\n" + " slice types (intra, inter, intra-chroma),\n" + " can be provided. [64, 64, 32]\n" + " --max-tt-size : Maximum size for a CU resulting from\n" + " a TT split. A single value shared by all,\n" + " or a list of three values for the different\n" + " slice types (intra, inter, intra-chroma),\n" + " can be provided. [64, 64, 32]\n" " --intra-rough-granularity : How many levels are used for the\n" " logarithmic intra rough search. 0..4\n" " With 0 all of the modes are checked \n" @@ -634,7 +659,6 @@ void print_help(void) " This is mostly for debugging and is not\n" " guaranteed to produce sensible bitstream or\n" " work at all. [disabled]\n" - " --tr-depth-intra : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination : CU split search termination [zero]\n" " - off: Don't terminate early.\n" @@ -671,6 +695,9 @@ void print_help(void) " --(no-)mip : Enable matrix weighted intra prediction.\n" " --(no-)lfnst : Enable low frequency non-separable transform.\n" " [disabled]\n" + " --(no-)isp : Enable intra sub-partitions. [disabled]\n" + " Experimental, requires disabling some avx2\n" + " optimizations.\n" " --mts : Multiple Transform Selection [off].\n" " (Currently only implemented for intra\n" " and has effect only when rd >= 2)\n" diff --git a/src/context.c b/src/context.c index 83bd5502..30861849 100644 --- a/src/context.c +++ b/src/context.c @@ -50,6 +50,21 @@ static const uint8_t INIT_QT_SPLIT_FLAG[4][6] = { { 0, 8, 8, 12, 12, 8, }, }; + +static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = { + { 43, 42, 37, 42, 44, }, + { 43, 35, 37, 34, 52, }, + { 43, 42, 29, 27, 44, }, + { 9, 8, 9, 8, 5, }, +}; + +static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = { + { 28, 29, 28, 29, }, + { 43, 37, 21, 22, }, + { 36, 45, 36, 45, }, + { 12, 13, 12, 13, }, + }; + static const uint8_t INIT_SKIP_FLAG[4][3] = { { 57, 60, 46, }, { 57, 59, 45, }, @@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice) uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]); uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]); uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]); + uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]); + } + + for (i = 0; i < 5; i++) { + uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]); } for (i = 0; i < 6; i++) { @@ -618,13 +638,14 @@ void uvg_context_copy(encoder_state_t * const target_state, const encoder_state_ uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, - int32_t width) + int32_t width, + int32_t height) { uint32_t uiRight = 0; uint32_t uiLower = 0; uint32_t position = pos_y * width + pos_x; if (pos_x + 1 < (uint32_t)width) uiRight = sig_coeff_group_flag[position + 1]; - if (pos_y +
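/* Reviewer note on the fix completed on the next line: with MTT the 4x4 coefficient-group grid can be rectangular, so the lower-neighbour test must bound pos_y by the grid height. Worked example: an 8x32 TU has a 2x8 group grid; for the group at pos_y == 3, the old test pos_y + 1 < width (4 < 2) wrongly rejected the lower neighbour, while square TUs never exposed the bug because width == height there. */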
1 < (uint32_t)width) uiLower = sig_coeff_group_flag[position + width]; + if (pos_y + 1 < (uint32_t)height) uiLower = sig_coeff_group_flag[position + width]; return uiRight || uiLower; } @@ -656,7 +677,7 @@ uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, * \returns context index for current scan position */ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, int8_t type, + uint32_t width, uint32_t height, int8_t color, int32_t* temp_diag, int32_t* temp_sum) { const coeff_t* data = coeff + pos_x + pos_y * width; @@ -686,7 +707,7 @@ uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, u } #undef UPDATE int ctx_ofs = MIN((sum_abs+1)>>1, 3) + (diag < 2 ? 4 : 0); - if (type == 0 /* Luma */) + if (color == COLOR_Y) { ctx_ofs += diag < 5 ? 4 : 0; } @@ -814,7 +835,7 @@ unsigned uvg_lrg1_ctx_id_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos * \returns context go rice parameter */ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel) + uint32_t width, uint32_t height, uint32_t baselevel) { #define UPDATE(x) sum+=abs(x)/*-(x?1:0)*/ @@ -856,8 +877,8 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, * \returns context go rice parameter */ uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel) + uint32_t width, uint32_t height, uint32_t baselevel) { - uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, height, width, baselevel); + uint32_t check = uvg_abs_sum(coeff, pos_x, pos_y, width, height, baselevel); return g_go_rice_pars[check]; } \ No newline at end of file diff --git a/src/context.h b/src/context.h index 366a438a..f083e44c 100644 --- a/src/context.h +++ b/src/context.h @@ -49,10 +49,10 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice); void uvg_context_copy(encoder_state_t * target_state, const encoder_state_t * source_state); -uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width); +uint32_t uvg_context_get_sig_coeff_group( uint32_t *sig_coeff_group_flag,uint32_t pos_x, uint32_t pos_y,int32_t width, int32_t height); uint32_t uvg_context_get_sig_coeff_group_ts(uint32_t* sig_coeff_group_flag, uint32_t pos_x, uint32_t pos_y, int32_t width); uint32_t uvg_context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, int8_t type, + uint32_t width, uint32_t height, int8_t type, int32_t* temp_diag, int32_t* temp_sum); uint32_t uvg_context_get_sig_ctx_idx_abs_ts(const coeff_t* coeff, int32_t pos_x, int32_t pos_y, @@ -66,7 +66,7 @@ uint32_t uvg_abs_sum(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, uint32_t height, uint32_t width, uint32_t baselevel); uint32_t uvg_go_rice_par_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, - uint32_t height, uint32_t width, uint32_t baselevel); + uint32_t width, uint32_t height, uint32_t baselevel); #define CNU 35 #define DWS 8 diff --git a/src/cu.c b/src/cu.c index 40fce65e..d7c37108 100644 --- a/src/cu.c +++ b/src/cu.c @@ -34,6 +34,9 @@ #include #include "cu.h" + +#include "alf.h" +#include "encoderstate.h" #include "threads.h" @@ -97,6 +100,42 @@ cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px) } +void uvg_get_isp_cu_arr_coords(int *x, int *y, int dim) +{ + // Do 
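/* Context for this helper: cu_info_t entries live on a 4x4 grid, but ISP can create 1x16, 16x1, 2x8 and 8x2 transform blocks whose origins are not 4-aligned, so their data must be folded into unique 4x4 cells. Note that the early-out below tests the coordinates, not the block dimensions. One example of the mapping, as I read the code: a 16x1 partition at y % 4 == 1 is stored at (x + 4, y - 1). */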
nothing if dimensions are divisible by 4 + if (*y % 4 == 0 && *x % 4 == 0) return; + const int remainder_y = *y % 4; + const int remainder_x = *x % 4; + + if (remainder_y != 0) { + // Horizontal ISP split + if (remainder_y % 2 == 0 && dim == 8) { + // 8x2 block + *y -= 2; + *x += 4; + } + else { + // 16x1 block + *y -= remainder_y; + *x += remainder_y * 4; + } + } + else { + // Vertical ISP split + if (*x % 2 == 0 && dim == 8) { + // 2x8 block + *y += 4; + *x -= 2; + } + else { + // 1x16 block + *y += remainder_x * 4; + *x -= remainder_x; + } + } +} + + const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px) { assert(x_px < cua->width); @@ -237,10 +276,10 @@ cu_array_t * uvg_cu_array_copy_ref(cu_array_t* cua) * \param dst_y y-coordinate of the top edge of the copied area in dst * \param src source lcu */ -void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type tree_type) +void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src) { const int dst_stride = dst->stride >> 2; - const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C; + const int width = LCU_WIDTH; for (int y = 0; y < width; y += SCU_WIDTH) { for (int x = 0; x < width; x += SCU_WIDTH) { const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y); @@ -251,3 +290,215 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu } } } + +/* + * \brief Constructs cu_loc_t based on given parameters. Calculates chroma dimensions automatically. + * + * \param loc Destination cu_loc. + * \param x Block top left x coordinate. + * \param y Block top left y coordinate. + * \param width Block width. + * \param height Block height. +*/ +void uvg_cu_loc_ctor(cu_loc_t* loc, int x, int y, int width, int height) +{ + assert(x >= 0 && y >= 0 && width >= 0 && height >= 0 && "Cannot give negative coordinates or block dimensions."); + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Luma CU dimension exceeds maximum (dim > LCU_WIDTH)."); + // This check is no longer valid. With non-square blocks and ISP enabled, even 1x16 and 16x1 (ISP needs at least 16 samples) blocks are valid + //assert(!(width < 4 || height < 4) && "Luma CU dimension smaller than 4."); + + loc->x = x; + loc->y = y; + loc->local_x = x % LCU_WIDTH; + loc->local_y = y % LCU_WIDTH; + loc->width = width; + loc->height = height; + // TODO: when MTT is implemented, chroma dimensions can be minimum 2. + // Chroma width is half of luma width, when not at maximum depth. 
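/* The chroma fields assume 4:2:0 subsampling, so both dimensions are simply halved; e.g. uvg_cu_loc_ctor(&loc, 0, 0, 32, 16) yields chroma_width == 16 and chroma_height == 8. local_x/local_y are the offsets inside the containing LCU (coordinates modulo LCU_WIDTH). */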
+ loc->chroma_width = width >> 1; + loc->chroma_height = height >> 1; +} + + +int uvg_get_split_locs( + const cu_loc_t* const origin, + enum split_type split, + cu_loc_t out[4], + uint8_t* separate_chroma) +{ + const int half_width = origin->width >> 1; + const int half_height = origin->height >> 1; + const int quarter_width = origin->width >> 2; + const int quarter_height = origin->height >> 2; + if (origin->width == 4 && separate_chroma) *separate_chroma = 1; + + switch (split) { + case NO_SPLIT: + assert(0 && "trying to get split from no split"); + break; + case QT_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, half_height); + uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, half_height); + uvg_cu_loc_ctor(&out[2], origin->x, origin->y + half_height, half_width, half_height); + uvg_cu_loc_ctor(&out[3], origin->x + half_width, origin->y + half_height, half_width, half_height); + if (half_height == 4 && separate_chroma) *separate_chroma = 1; + return 4; + case BT_HOR_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, half_height); + uvg_cu_loc_ctor(&out[1], origin->x, origin->y + half_height, origin->width, half_height); + if (half_height * origin->width < 64 && separate_chroma) *separate_chroma = 1; + return 2; + case BT_VER_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, half_width, origin->height); + uvg_cu_loc_ctor(&out[1], origin->x + half_width, origin->y, half_width, origin->height); + if ((half_width == 4 || half_width * origin->height < 64) && separate_chroma) *separate_chroma = 1; + return 2; + case TT_HOR_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, origin->width, quarter_height); + uvg_cu_loc_ctor(&out[1], origin->x, origin->y + quarter_height, origin->width, half_height); + uvg_cu_loc_ctor(&out[2], origin->x, origin->y + quarter_height + half_height, origin->width, quarter_height); + if (quarter_height * origin->width < 64 && separate_chroma) *separate_chroma = 1; + return 3; + case TT_VER_SPLIT: + uvg_cu_loc_ctor(&out[0], origin->x, origin->y, quarter_width, origin->height); + uvg_cu_loc_ctor(&out[1], origin->x + quarter_width, origin->y, half_width, origin->height); + uvg_cu_loc_ctor(&out[2], origin->x + quarter_width + half_width, origin->y, quarter_width, origin->height); + if ((quarter_width == 4 || quarter_width * origin->height < 64) && separate_chroma) *separate_chroma = 1; + return 3; + } + return 0; +} + + +int uvg_get_implicit_split( + const encoder_state_t* const state, + const cu_loc_t* const cu_loc, + uint8_t max_mtt_depth) +{ + bool right_ok = (state->tile->frame->width) >= cu_loc->x + cu_loc->width; + bool bottom_ok = (state->tile->frame->height) >= cu_loc->y + cu_loc->height; + + if (right_ok && bottom_ok) return NO_SPLIT; + if (right_ok && max_mtt_depth != 0) return BT_HOR_SPLIT; + if (bottom_ok && max_mtt_depth != 0) return BT_VER_SPLIT; + return QT_SPLIT; +} + + +int uvg_get_possible_splits(const encoder_state_t * const state, + const cu_loc_t * const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]) +{ + const unsigned width = cu_loc->width; + const unsigned height = cu_loc->height; + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 
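A quick geometry check of uvg_get_split_locs above; a minimal sketch (assert usage illustrative only):

cu_loc_t cu, parts[4];
uvg_cu_loc_ctor(&cu, 0, 0, 32, 32);
// TT_VER on 32x32 -> 8x32 | 16x32 | 8x32 (quarter, half, quarter)
int n = uvg_get_split_locs(&cu, TT_VER_SPLIT, parts, NULL);
assert(n == 3 && parts[0].width == 8 && parts[1].width == 16 && parts[2].width == 8);
// BT_HOR on 32x32 -> two 32x16 halves
n = uvg_get_split_locs(&cu, BT_HOR_SPLIT, parts, NULL);
assert(n == 2 && parts[0].height == 16 && parts[1].y == 16);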
2 : 0) : 1; + + const unsigned max_btd = + state->encoder_control->cfg.max_btt_depth[slice_type] + split_tree.implicit_mtt_depth; + const unsigned max_bt_size = state->encoder_control->cfg.max_bt_size[slice_type]; + const unsigned min_bt_size = 1 << MIN_SIZE; + const unsigned max_tt_size = state->encoder_control->cfg.max_tt_size[slice_type]; + const unsigned min_tt_size = 1 << MIN_SIZE; + const unsigned min_qt_size = state->encoder_control->cfg.min_qt_size[slice_type]; + + const enum split_type implicitSplit = uvg_get_implicit_split(state, cu_loc, max_btd); + + splits[NO_SPLIT] = splits[QT_SPLIT] = splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = true; + bool can_btt = split_tree.mtt_depth < max_btd; + + const enum split_type last_split = GET_SPLITDATA(&split_tree, split_tree.current_depth - 1); + const enum split_type parl_split = last_split == TT_HOR_SPLIT ? BT_HOR_SPLIT : BT_VER_SPLIT; + + // don't allow QT-splitting below a BT split + if (split_tree.current_depth != 0 && last_split != QT_SPLIT /* && !(width > 64 || height > 64)*/) splits[QT_SPLIT] = false; + if (width <= min_qt_size) splits[QT_SPLIT] = false; + + if (tree_type == UVG_CHROMA_T && width <= 8) splits[QT_SPLIT] = false; + + if (implicitSplit != NO_SPLIT) + { + splits[NO_SPLIT] = splits[TT_HOR_SPLIT] = splits[TT_VER_SPLIT] = false; + + splits[BT_HOR_SPLIT] = implicitSplit == BT_HOR_SPLIT && height <= max_bt_size; + splits[BT_VER_SPLIT] = implicitSplit == BT_VER_SPLIT && width <= max_bt_size; + if (tree_type == UVG_CHROMA_T && width <= 8) splits[BT_VER_SPLIT] = false; + if (!splits[BT_HOR_SPLIT] && !splits[BT_VER_SPLIT] && !splits[QT_SPLIT]) splits[QT_SPLIT] = true; + return 1; + } + + if ((last_split == TT_HOR_SPLIT || last_split == TT_VER_SPLIT) && split_tree.part_index == 1) + { + splits[BT_HOR_SPLIT] = parl_split != BT_HOR_SPLIT; + splits[BT_VER_SPLIT] = parl_split != BT_VER_SPLIT; + } + + if (can_btt && (width <= min_bt_size && height <= min_bt_size) + && ((width <= min_tt_size && height <= min_tt_size))) + { + can_btt = false; + } + if (can_btt && (width > max_bt_size || height > max_bt_size) + && ((width > max_tt_size || height > max_tt_size))) + { + can_btt = false; + } + + if (!can_btt) + { + splits[BT_HOR_SPLIT] = splits[TT_HOR_SPLIT] = splits[BT_VER_SPLIT] = splits[TT_VER_SPLIT] = false; + + return 0; + } + + if (width > max_bt_size || height > max_bt_size) + { + splits[BT_HOR_SPLIT] = splits[BT_VER_SPLIT] = false; + } + + // specific check for BT splits + if (height <= min_bt_size) splits[BT_HOR_SPLIT] = false; + if (width > 64 && height <= 64) splits[BT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 64) splits[BT_HOR_SPLIT] = false; + + if (width <= min_bt_size) splits[BT_VER_SPLIT] = false; + if (width <= 64 && height > 64) splits[BT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && (width * height <= 64 || width <= 8)) splits[BT_VER_SPLIT] = false; + + //if (modeType == MODE_TYPE_INTER && width * height == 32) splits[BT_VER_SPLIT] = splits[BT_HOR_SPLIT] = false; + + if (height <= 2 * min_tt_size || height > max_tt_size || width > max_tt_size) + splits[TT_HOR_SPLIT] = false; + if (width > 64 || height > 64) splits[TT_HOR_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && width * height <= 64 * 2) splits[TT_HOR_SPLIT] = false; + + if (width <= 2 * min_tt_size || width > max_tt_size || height > max_tt_size) + splits[TT_VER_SPLIT] = false; + if (width > 64 || height > 64) splits[TT_VER_SPLIT] = false; + if (tree_type == UVG_CHROMA_T && 
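/* Rationale for the parl_split rule above, mirroring the VTM restriction: a binary split of the middle ternary partition in the same direction would recreate a partitioning that is already reachable another way. E.g. TT_HOR on 32x32 gives 32x8 / 32x16 / 32x8; BT_HOR of the middle 32x16 lands on the same 8/8/8/8 row boundaries as BT_HOR followed by BT_HOR in both halves, so the redundant signalling path is forbidden. */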
(width * height <= 64 * 2 || width <= 16)) splits[TT_VER_SPLIT] = false; + + //if (modeType == MODE_TYPE_INTER && width * height == 64) splits[TT_VER_SPLIT] = splits[TT_HOR_SPLIT] = false; + return 0; +} + + +int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left) +{ + if ((left && cu_loc->x == 0) || (!left && cu_loc->y == 0)) { + return 0; + } + if (left && cu_loc->local_x == 0) return (LCU_WIDTH - cu_loc->local_y) / 4; + if (!left && cu_loc->local_y == 0) return (cu_loc->width) / 2; + + int amount = left ? cu_loc->height & ~3 : cu_loc->width & ~3; + if(left) { + const cu_info_t* cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); + if (cu_loc->local_y == 0 && cu_loc->local_x == 32 && cu->log2_height == 6 && cu->log2_width == 6) return 8; + while (cu_loc->local_y + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x - TR_MIN_WIDTH, cu_loc->local_y + amount)->type != CU_NOTSET) { + amount += TR_MIN_WIDTH; + } + return MAX(amount / TR_MIN_WIDTH, cu_loc->height / TR_MIN_WIDTH); + } + while (cu_loc->local_x + amount < LCU_WIDTH && LCU_GET_CU_AT_PX(lcu, cu_loc->local_x + amount, cu_loc->local_y - TR_MIN_WIDTH)->type != CU_NOTSET) { + amount += TR_MIN_WIDTH; + } + return MAX(amount / TR_MIN_WIDTH, cu_loc->width / TR_MIN_WIDTH); +} diff --git a/src/cu.h b/src/cu.h index ddddaf55..8f3ec8bf 100644 --- a/src/cu.h +++ b/src/cu.h @@ -77,55 +77,6 @@ typedef enum { MTS_TR_NUM = 6, } mts_idx; -extern const uint8_t uvg_part_mode_num_parts[]; -extern const uint8_t uvg_part_mode_offsets[][4][2]; -extern const uint8_t uvg_part_mode_sizes[][4][2]; - -/** - * \brief Get the x coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_x x coordinate of the containing CU - * \param i number of the PU - * \return location of the left edge of the PU - */ -#define PU_GET_X(part_mode, cu_width, cu_x, i) \ - ((cu_x) + uvg_part_mode_offsets[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the y coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_y y coordinate of the containing CU - * \param i number of the PU - * \return location of the top edge of the PU - */ -#define PU_GET_Y(part_mode, cu_width, cu_y, i) \ - ((cu_y) + uvg_part_mode_offsets[(part_mode)][(i)][1] * (cu_width) / 4) - -/** - * \brief Get the width of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return width of the PU - */ -#define PU_GET_W(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the height of a PU. 
- * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return height of the PU - */ -#define PU_GET_H(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][1] * (cu_width) / 4) ////////////////////////////////////////////////////////////////////////// // TYPES @@ -142,24 +93,53 @@ enum uvg_tree_type { UVG_CHROMA_T = 2 }; +enum split_type { + NO_SPLIT = 0, + QT_SPLIT = 1, + BT_HOR_SPLIT = 2, + BT_VER_SPLIT = 3, + TT_HOR_SPLIT = 4, + TT_VER_SPLIT = 5, +}; + +typedef struct { + uint32_t split_tree; + uint8_t current_depth; + uint8_t mtt_depth; + uint8_t implicit_mtt_depth; + uint8_t part_index; +} split_tree_t; + + +// Each depth level takes three bits of the form xxy: if either x bit is set it is an MTT split, +// and once any MTT split is present a QT split is no longer allowed +#define CAN_QT_SPLIT(x) (((x) & 0x6DB6DB6) == 0) + /** * \brief Struct for CU info */ typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values - uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values - uint8_t tr_depth : 3; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index uint8_t tr_skip : 3; //!< \brief transform skip flag uint8_t tr_idx : 3; //!< \brief transform index - uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding + uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding + + uint8_t log2_width : 3; + uint8_t log2_height : 3; + + uint8_t log2_chroma_width : 3; + uint8_t log2_chroma_height : 3; uint16_t cbf; + uint8_t root_cbf; + + uint32_t split_tree : 3 * 9; + /** * \brief QP used for the CU. * @@ -172,12 +152,15 @@ typedef struct uint8_t violates_mts_coeff_constraint : 1; uint8_t mts_last_scan_pos : 1; - uint8_t violates_lfnst_constrained_luma : 1; // Two types, luma and chroma. Luma index is 0. - uint8_t violates_lfnst_constrained_chroma : 1; // Two types, luma and chroma. Luma index is 0. + uint8_t violates_lfnst_constrained_luma : 1; + uint8_t violates_lfnst_constrained_chroma : 1; uint8_t lfnst_last_scan_pos : 1; uint8_t lfnst_idx : 2; uint8_t cr_lfnst_idx : 2; + uint8_t luma_deblocking : 2; + uint8_t chroma_deblocking : 2; + union { struct { int8_t mode; @@ -185,6 +168,9 @@ typedef struct uint8_t multi_ref_idx; int8_t mip_flag; int8_t mip_is_transposed; + int8_t isp_mode; + uint8_t isp_cbfs : 4; + uint8_t isp_index : 2; } intra; struct { mv_t mv[2][2]; // \brief Motion vectors for L0 and L1 @@ -200,12 +186,25 @@ typedef struct { int16_t x; int16_t y; + uint8_t local_x; + uint8_t local_y; int8_t width; int8_t height; int8_t chroma_width; int8_t chroma_height; } cu_loc_t; +void uvg_cu_loc_ctor(cu_loc_t *loc, int x, int y, int width, int height); +typedef struct encoder_state_t encoder_state_t; + +int uvg_get_split_locs( + const cu_loc_t* const origin, + enum split_type split, + cu_loc_t out[4], + uint8_t* separate_chroma); +int uvg_get_possible_splits(const encoder_state_t* const state, + const cu_loc_t* const cu_loc, split_tree_t split_tree, enum uvg_tree_type tree_type, bool splits[6]); + #define CU_GET_MV_CAND(cu_info_ptr, reflist) \ (((reflist) == 0) ?
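To make the packed layout concrete: split_tree stores one 3-bit split_type per depth, nine levels in 27 bits, and GET_SPLITDATA further down reads one level back out. A hypothetical helper (not part of the patch) showing the encoding:

// Record the split chosen at 'depth' (0..8) in a packed tree.
static uint32_t split_tree_push(uint32_t tree, int depth, enum split_type split)
{
  return tree | ((uint32_t)split << (depth * 3)); // 3 bits per level
}

The mask 0x6DB6DB6 is binary 110 repeated nine times: it selects the two high bits of every 3-bit slot, which are zero only for NO_SPLIT (0) and QT_SPLIT (1), so CAN_QT_SPLIT(x) turns false as soon as any BT/TT split is recorded.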
(cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1) @@ -219,7 +218,7 @@ typedef struct { } \ } while (0) -#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \ +#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d part_size=%d coded=%d " \ "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \ "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \ "intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \ @@ -227,7 +226,7 @@ typedef struct { "intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \ "inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d inter.mvd[1]=%d " \ "inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \ - , (cu).type, (cu).depth, (cu).part_size, (cu).tr_depth, (cu).coded, \ + , (cu).type, (cu).part_size, (cu).coded, \ (cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \ (cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \ (cu).intra[1].cost, (cu).intra[1].bitcost, (cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \ @@ -246,6 +245,7 @@ typedef struct cu_array_t { } cu_array_t; cu_info_t* uvg_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px); +void uvg_get_isp_cu_arr_coords(int* x, int* y, int dim); const cu_info_t* uvg_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px); cu_array_t * uvg_cu_array_alloc(const int width, const int height); @@ -382,8 +382,9 @@ typedef struct { cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1]; } lcu_t; -void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src, enum uvg_tree_type - tree_type); +void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src); + +int uvg_count_available_edge_cus(const cu_loc_t* const cu_loc, const lcu_t* const lcu, bool left); /** * \brief Return pointer to the top right reference CU. @@ -412,9 +413,11 @@ void uvg_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu */ static INLINE void copy_coeffs(const coeff_t *__restrict src, coeff_t *__restrict dest, - size_t width) + size_t width, size_t height, const int lcu_width) { - memcpy(dest, src, width * width * sizeof(coeff_t)); + for (int j = 0; j < height; ++j) { + memcpy(dest + j * lcu_width, src + j * lcu_width, width * sizeof(coeff_t)); + } } @@ -554,56 +557,52 @@ static INLINE unsigned xy_to_zorder(unsigned width, unsigned x, unsigned y) } while(0) -#define NUM_CBF_DEPTHS 5 -static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 }; - /** * Check if CBF in a given level >= depth is true. */ -static INLINE int cbf_is_set(uint16_t cbf, int depth, color_t plane) +static INLINE int cbf_is_set(uint16_t cbf, color_t plane) { - return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0; + return (cbf & (1 << (plane))) != 0; } /** * Check if CBF in a given level >= depth is true. */ -static INLINE int cbf_is_set_any(uint16_t cbf, int depth) +static INLINE int cbf_is_set_any(uint16_t cbf) { - return cbf_is_set(cbf, depth, COLOR_Y) || - cbf_is_set(cbf, depth, COLOR_U) || - cbf_is_set(cbf, depth, COLOR_V); + return cbf_is_set(cbf, COLOR_Y) || + cbf_is_set(cbf, COLOR_U) || + cbf_is_set(cbf, COLOR_V); } /** * Set CBF in a level to true. 
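With the transform-tree depth gone, the cbf word shrinks from five bits per plane to one bit per colour plane (COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2), as the rewritten helpers below show. A minimal sketch of the new layout:

uint16_t cbf = 0;
cbf_set(&cbf, COLOR_U);            // cbf == 0b010
assert( cbf_is_set(cbf, COLOR_U)); // plane bits are independent
assert(!cbf_is_set(cbf, COLOR_Y));
cbf_clear(&cbf, COLOR_U);          // cbf == 0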
*/ -static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane) +static INLINE void cbf_set(uint16_t *cbf, color_t plane) { // Return value of the bit corresponding to the level. - *cbf |= (0x10 >> depth) << (NUM_CBF_DEPTHS * plane); + *cbf |= (1) << (plane); } /** * Set CBF in a level to true if it is set at a lower level in any of * the child_cbfs. */ -static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane) +static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], color_t plane) { - bool child_cbf_set = cbf_is_set(child_cbfs[0], depth + 1, plane) || - cbf_is_set(child_cbfs[1], depth + 1, plane) || - cbf_is_set(child_cbfs[2], depth + 1, plane); + bool child_cbf_set = cbf_is_set(child_cbfs[0], plane) || + cbf_is_set(child_cbfs[1], plane) || + cbf_is_set(child_cbfs[2], plane); if (child_cbf_set) { - cbf_set(cbf, depth, plane); + cbf_set(cbf, plane); } } /** - * Set CBF in a levels <= depth to false. */ -static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane) +static INLINE void cbf_clear(uint16_t *cbf, color_t plane) { - *cbf &= ~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane)); + *cbf &= ~(1 << (plane)); } /** @@ -611,11 +610,11 @@ static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane) */ static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane) { - cbf_clear(cbf, 0, plane); - *cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane)); + cbf_clear(cbf, plane); + *cbf |= src & (1 << plane); } -#define GET_SPLITDATA(CU,curDepth) ((CU)->depth > curDepth) -#define SET_SPLITDATA(CU,flag) { (CU)->split=(flag); } +#define GET_SPLITDATA(CU,curDepth) ((CU)->split_tree >> ((MAX((curDepth), 0) * 3)) & 7) +#define PU_IS_TU(cu) ((cu)->log2_width <= TR_MAX_LOG2_SIZE && (cu)->log2_height <= TR_MAX_LOG2_SIZE) #endif diff --git a/src/dep_quant.c b/src/dep_quant.c new file mode 100644 index 00000000..16591390 --- /dev/null +++ b/src/dep_quant.c @@ -0,0 +1,1139 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#include "dep_quant.h" + +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "transform.h" +#include "uvg_math.h" +#include "generic/quant-generic.h" + +#include "strategies-depquant.h" +static const int32_t g_goRiceBits[4][RICEMAX] = { + { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + + +int uvg_init_nb_info(encoder_control_t * encoder) { + memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray)); + memset(encoder->m_scanId2NbInfoOutArray, 0, sizeof(encoder->m_scanId2NbInfoOutArray)); + memset(encoder->scan_info, 0, sizeof(encoder->scan_info)); + for (int hd = 0; hd <= 6; hd++) + { + + uint32_t raster2id[64 * 64] = {0}; + + for (int vd = 0; vd <= 6; vd++) + { + if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) + { + continue; + } + const uint32_t blockWidth = (1 << hd); + const uint32_t blockHeight = (1 << vd); + const uint32_t log2CGWidth = g_log2_sbb_size[hd][vd][0]; + const uint32_t log2CGHeight = g_log2_sbb_size[hd][vd][1]; + const uint32_t groupWidth = 1 << log2CGWidth; + const uint32_t groupHeight = 1 << log2CGHeight; + const uint32_t groupSize = groupWidth * groupHeight; + const int scanType = SCAN_DIAG; + const uint32_t blkWidthIdx = hd; + const uint32_t blkHeightIdx = vd; + const uint32_t* scanId2RP = uvg_get_scan_order_table(SCAN_GROUP_4X4, scanType, blkWidthIdx, blkHeightIdx); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, 0, hd, vd); + NbInfoSbb** sId2NbSbb = &encoder->m_scanId2NbInfoSbbArray[hd][vd]; + NbInfoOut** sId2NbOut = &encoder->m_scanId2NbInfoOutArray[hd][vd]; + // consider only non-zero-out region + const uint32_t blkWidthNZOut =
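/* The MIN(32, ...) clamp completing this line reflects VVC's high-frequency zero-out: transform blocks larger than 32 in either dimension can only carry non-zero coefficients in their top-left 32x32 region, so neighbour tables are needed for at most 32*32 = 1024 scan positions even for a 64x64 block. */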
MIN(32, blockWidth); + const uint32_t blkHeightNZOut = MIN(32, blockHeight); + const uint32_t totalValues = blkWidthNZOut * blkHeightNZOut; + + *sId2NbSbb = MALLOC(NbInfoSbb, totalValues); + if (*sId2NbSbb == NULL) { + return 0; + } + *sId2NbOut = MALLOC(NbInfoOut, totalValues); + if (*sId2NbOut == NULL) { + return 0; + } + encoder->scan_info[hd][vd] = MALLOC(struct dep_quant_scan_info, totalValues); + if (encoder->scan_info[hd][vd] == NULL) { + return 0; + } + + + for (uint32_t scanId = 0; scanId < totalValues; scanId++) + { + raster2id[scanId2RP[scanId]] = scanId; + } + const uint32_t height_in_sbb = MAX(blockHeight >> 2, 1); + const uint32_t width_in_sbb = MAX(blockWidth >> 2, 1); + + for (unsigned scanId = 0; scanId < totalValues; scanId++) + { + const int rpos = scanId2RP[scanId]; + uint32_t pos_y = rpos >> hd; + uint32_t pos_x = rpos - (pos_y << hd); // TODO: height + { + //===== inside subband neighbours ===== + NbInfoSbb *nbSbb = &(*sId2NbSbb)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + int cpos[5]; + + cpos[0] = (pos_x + 1 < blkWidthNZOut ? (raster2id[rpos + 1] < groupSize + begSbb ? raster2id[rpos + 1] - begSbb : 0) : 0); + cpos[1] = (pos_x + 2 < blkWidthNZOut ? (raster2id[rpos + 2] < groupSize + begSbb ? raster2id[rpos + 2] - begSbb : 0) : 0); + cpos[2] = (pos_x + 1 < blkWidthNZOut && pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + 1 + blockWidth] < groupSize + begSbb ? raster2id[rpos + 1 + blockWidth] - begSbb : 0) : 0); + cpos[3] = (pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + blockWidth] < groupSize + begSbb ? raster2id[rpos + blockWidth] - begSbb : 0) : 0); + cpos[4] = (pos_y + 2 < blkHeightNZOut ? (raster2id[rpos + 2 * blockWidth] < groupSize + begSbb ? raster2id[rpos + 2 * blockWidth] - begSbb : 0) : 0); + + for (nbSbb->num = 0; true; ) + { + int nk = -1; + for (int k = 0; k < 5; k++) + { + if (cpos[k] != 0 && (nk < 0 || cpos[k] < cpos[nk])) + { + nk = k; + } + } + if (nk < 0) + { + break; + } + nbSbb->inPos[nbSbb->num++] = (uint8_t)(cpos[nk]); + cpos[nk] = 0; + } + for (int k = nbSbb->num; k < 5; k++) + { + nbSbb->inPos[k] = 0; + } + } + { + //===== outside subband neighbours ===== + NbInfoOut *nbOut = &(*sId2NbOut)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + int cpos[5]; + + cpos[0] = (pos_x + 1 < blkWidthNZOut ? (raster2id[rpos + 1] >= groupSize + begSbb ? raster2id[rpos + 1] : 0) : 0); + cpos[1] = (pos_x + 2 < blkWidthNZOut ? (raster2id[rpos + 2] >= groupSize + begSbb ? raster2id[rpos + 2] : 0) : 0); + cpos[2] = (pos_x + 1 < blkWidthNZOut && pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + 1 + blockWidth] >= groupSize + begSbb ? raster2id[rpos + 1 + blockWidth] : 0) : 0); + cpos[3] = (pos_y + 1 < blkHeightNZOut ? (raster2id[rpos + blockWidth] >= groupSize + begSbb ? raster2id[rpos + blockWidth] : 0) : 0); + cpos[4] = (pos_y + 2 < blkHeightNZOut ? (raster2id[rpos + 2 * blockWidth] >= groupSize + begSbb ? raster2id[rpos + 2 * blockWidth] : 0) : 0); + + for (nbOut->num = 0; true; ) + { + int nk = -1; + for (int k = 0; k < 5; k++) + { + if (cpos[k] != 0 && (nk < 0 || cpos[k] < cpos[nk])) + { + nk = k; + } + } + if (nk < 0) + { + break; + } + nbOut->outPos[nbOut->num++] = (uint16_t)(cpos[nk]); + cpos[nk] = 0; + } + for (int k = nbOut->num; k < 5; k++) + { + nbOut->outPos[k] = 0; + } + nbOut->maxDist = (scanId == 0 ? 
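/* The cpos[0..4] entries built above form the five-position template that the VVC-style context modelling sums over, relative to the current coefficient X:

     X  0  1
     3  2
     4

   i.e. (x+1,y), (x+2,y), (x+1,y+1), (x,y+1) and (x,y+2); the 'inside' variant keeps neighbours within the current 4x4 subblock and the 'outside' variant keeps those beyond it. */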
0 : (*sId2NbOut)[scanId - 1].maxDist); + for (int k = 0; k < nbOut->num; k++) + { + if (nbOut->outPos[k] > nbOut->maxDist) + { + nbOut->maxDist = nbOut->outPos[k]; + } + } + } + uint32_t cg_pos = cg_scan[scanId >> 4]; + + uint32_t blkpos_next = scanId2RP[scanId ? scanId - 1 : 0]; + uint32_t pos_y_next = blkpos_next >> hd; + uint32_t pos_x_next = blkpos_next - (pos_y_next << hd); + uint32_t cg_blockpos_next = scanId ? cg_scan[(scanId - 1) >> 4] : 0; + uint32_t cg_pos_y_next = cg_blockpos_next / width_in_sbb; + uint32_t cg_pos_x_next = cg_blockpos_next - (cg_pos_y_next * width_in_sbb); + uint32_t diag = pos_y_next + pos_x_next; + + + uint32_t nextSbbRight = (cg_pos_x_next < width_in_sbb - 1 ? cg_blockpos_next + 1 : 0); + uint32_t nextSbbBelow = (cg_pos_y_next < height_in_sbb - 1 ? cg_blockpos_next + width_in_sbb : 0); + encoder->scan_info[hd][vd][scanId].pos_x = pos_x; + encoder->scan_info[hd][vd][scanId].pos_y = pos_y; + encoder->scan_info[hd][vd][scanId].sig_ctx_offset[0] = (diag < 2 ? 8 : diag < 5 ? 4 : 0); + encoder->scan_info[hd][vd][scanId].sig_ctx_offset[1] = (diag < 2 ? 4 : 0); + encoder->scan_info[hd][vd][scanId].gtx_ctx_offset[0] = (diag < 1 ? 16 : diag < 3 ? 11 : diag < 10 ? 6 : 1); + encoder->scan_info[hd][vd][scanId].gtx_ctx_offset[1] = (diag < 1 ? 6 : 1); + encoder->scan_info[hd][vd][scanId].cg_pos = cg_pos; + encoder->scan_info[hd][vd][scanId].next_sbb_right = nextSbbRight; + encoder->scan_info[hd][vd][scanId].next_sbb_below = nextSbbBelow; + } + + // make it relative + for (unsigned scanId = 0; scanId < totalValues; scanId++) + { + NbInfoOut *nbOut = &(*sId2NbOut)[scanId]; + const int begSbb = scanId - (scanId & (groupSize - 1)); // first pos in current subblock + for (int k = 0; k < nbOut->num; k++) + { + nbOut->outPos[k] -= begSbb; + } + nbOut->maxDist -= scanId; + } + } + } + return 1; +} + +void uvg_dealloc_nb_info(encoder_control_t* encoder) { + + for (int hd = 0; hd <= 7; hd++) { + for (int vd = 0; vd <= 7; vd++) + { + if ((hd == 0 && vd <= 1) || (hd <= 1 && vd == 0)) + { + continue; + } + if(encoder->m_scanId2NbInfoOutArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoOutArray[hd][vd]); + if(encoder->m_scanId2NbInfoSbbArray[hd][vd]) FREE_POINTER(encoder->m_scanId2NbInfoSbbArray[hd][vd]); + if(encoder->scan_info[hd][vd]) FREE_POINTER(encoder->scan_info[hd][vd]); + } + } +} + + +static INLINE int ceil_log2(uint64_t x) +{ + static const uint64_t t[6] = { 0xFFFFFFFF00000000ull, 0x00000000FFFF0000ull, 0x000000000000FF00ull, 0x00000000000000F0ull, 0x000000000000000Cull, 0x0000000000000002ull }; + int y = (((x & (x - 1)) == 0) ? 0 : 1); + int j = 32; + for (int i = 0; i < 6; i++) + { + int k = (((x & t[i]) == 0) ? 0 : j); + y += k; + x >>= k; + j >>= 1; + } + return y; +} + +static void init_quant_block( + const encoder_state_t* state, + quant_block* qp, + const cu_info_t* const cur_tu, + unsigned log2_width, + unsigned log2_height, + color_t color, + const bool needsSqrt2ScaleAdjustment, + const int gValue) +{ + double lambda = color == COLOR_Y ? state->lambda : state->c_lambda; + + const int qpDQ = state->qp + 1; + const int qpPer = qpDQ / 6; + const int qpRem = qpDQ - 6 * qpPer; + const int channelBitDepth = state->encoder_control->bitdepth; + const int maxLog2TrDynamicRange = MAX_TR_DYNAMIC_RANGE; + const int nomTransformShift = MAX_TR_DYNAMIC_RANGE - channelBitDepth - ((log2_width + log2_height) >> 1); + const bool clipTransformShift = (cur_tu->tr_skip >> color) & 1 && false; // extended precision + const int transformShift = + (clipTransformShift ?
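/* A worked instance of the shift derivation continuing here, assuming MAX_TR_DYNAMIC_RANGE == 15 and 10-bit content: a 32x16 TB gives nomTransformShift = 15 - 10 - ((5 + 4) >> 1) = 1; since log2_width + log2_height is odd, the block needs the sqrt(2) scale adjustment, which is why transformShift takes the extra -1 term and m_QScale is read from the second row of uvg_g_quant_scales. */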
MAX(0, nomTransformShift) : + nomTransformShift) + + (needsSqrt2ScaleAdjustment ? -1 : 0); + // quant parameters + qp->m_QShift = QUANT_SHIFT - 1 + qpPer + transformShift; + qp->m_QAdd = -((3 << qp->m_QShift) >> 1); + int invShift = IQUANT_SHIFT + 1 - qpPer - transformShift; + qp->m_QScale = uvg_g_quant_scales[needsSqrt2ScaleAdjustment ? 1 : 0][qpRem]; + const unsigned qIdxBD = MIN( + maxLog2TrDynamicRange + 1, + 8 * sizeof(int) + invShift - IQUANT_SHIFT - 1); + qp->m_maxQIdx = (1 << (qIdxBD - 1)) - 4; + qp->m_thresLast = (((int64_t)(4) << (int64_t)qp->m_QShift)); + qp->m_thresSSbb = (((int64_t)(3) << (int64_t)qp->m_QShift)); + // distortion calculation parameters + const int64_t qScale = (gValue == -1) ? qp->m_QScale : gValue; + const int nomDShift = + 15 - + 2 * (nomTransformShift) + + qp->m_QShift + (needsSqrt2ScaleAdjustment ? 1 : 0); + const double qScale2 = (double)(qScale * qScale); + const double nomDistFactor = + (nomDShift < 0 ? + 1.0 / ((double)((int64_t)(1) << (-nomDShift)) * qScale2 * lambda) : + (double)((int64_t)(1) << nomDShift) / (qScale2 * lambda)); + const int64_t pow2dfShift = (int64_t)(nomDistFactor * qScale2) + 1; + const int dfShift = ceil_log2(pow2dfShift); + qp->m_DistShift = 62 + qp->m_QShift - 2 * maxLog2TrDynamicRange - dfShift; + qp->m_DistAdd = ((int64_t)(1) << qp->m_DistShift) >> 1; + qp->m_DistStepAdd = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + qp->m_QShift)) + .5); + qp->m_DistOrgFact = (int64_t)(nomDistFactor * (double)((int64_t)(1) << (qp->m_DistShift + 1)) + .5); + qp->needs_init = false; +} + +static void reset_common_context(common_context* ctx, const rate_estimator_t * rate_estimator, int numSbb, int num_coeff) +{ + //memset(&ctx->m_nbInfo, 0, sizeof(ctx->m_nbInfo)); + memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); + uint8_t* next_sbb_memory = ctx->sbb_memory; + uint8_t* next_level_memory = ctx->level_memory; + for (int k = 0; k < 2; k++, next_sbb_memory += numSbb * 4llu, next_level_memory += num_coeff * 4llu) { + ctx->m_allSbbCtx[k].sbbFlags = next_sbb_memory; + ctx->m_allSbbCtx[k].levels = next_level_memory; + } + ctx->m_curr_sbb_ctx_offset = 0; + ctx->m_prev_sbb_ctx_offset = 1; + ctx->num_coeff = num_coeff; +} + +static void init_rate_esimator(rate_estimator_t * rate_estimator, const cabac_data_t * const ctx, color_t color) +{ + const cabac_ctx_t * base_ctx = color == COLOR_Y ? ctx->ctx.sig_coeff_group_model : (ctx->ctx.sig_coeff_group_model + 2); + for (unsigned ctxId = 0; ctxId < SM_MAX_NUM_SIG_SBB_CTX; ctxId++) { + rate_estimator->m_sigSbbFracBits[ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); + rate_estimator->m_sigSbbFracBits[ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); + } + unsigned numCtx = (color == COLOR_Y ? 12 : 8); + for (unsigned ctxSetId = 0; ctxSetId < SM_NUM_CTX_SETS_SIG; ctxSetId++) { + base_ctx = color == COLOR_Y ? ctx->ctx.cu_sig_model_luma[ctxSetId] : ctx->ctx.cu_sig_model_chroma[ctxSetId]; + for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { + rate_estimator->m_sigFracBits[ctxSetId][ctxId][0] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 0); + rate_estimator->m_sigFracBits[ctxSetId][ctxId][1] = CTX_ENTROPY_BITS(&base_ctx[ctxId], 1); + } + } + + numCtx = (color == COLOR_Y? 21 : 11); + for (unsigned ctxId = 0; ctxId < numCtx; ctxId++) { + const cabac_ctx_t * par_ctx = color == COLOR_Y ? &ctx->ctx.cu_parity_flag_model_luma[ctxId] : &ctx->ctx.cu_parity_flag_model_chroma[ctxId]; + const cabac_ctx_t * gt2_ctx = color == COLOR_Y ? 
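/* The cb[] table filled just below caches the first-pass context-coded bits per absolute level: cb[1] codes level 1 (gt1 = 0), cb[2] level 2 (gt1 = 1, parity 0, gt2 = 0), cb[3] level 3 (parity 1), and cb[4]/cb[5] levels >= 4 (gt2 = 1), whose remainders then go to Golomb-Rice coding via g_goRiceBits. The flat (1 << SCALE_BITS) term appears to account for one bypass bin (the sign) per non-zero level. */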
&ctx->ctx.cu_gtx_flag_model_luma[0][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[0][ctxId]; + const cabac_ctx_t * gt1_ctx = color == COLOR_Y ? &ctx->ctx.cu_gtx_flag_model_luma[1][ctxId] : &ctx->ctx.cu_gtx_flag_model_chroma[1][ctxId]; + + int32_t* cb = rate_estimator->m_gtxFracBits[ctxId]; + int32_t par0 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 0); + int32_t par1 = (1 << SCALE_BITS) + (int32_t)CTX_ENTROPY_BITS(par_ctx, 1); + cb[0] = 0; + cb[1] = CTX_ENTROPY_BITS(gt1_ctx, 0) + (1 << SCALE_BITS); + cb[2] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par0 + CTX_ENTROPY_BITS(gt2_ctx, 0); + cb[3] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par1 + CTX_ENTROPY_BITS(gt2_ctx, 0); + cb[4] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par0 + CTX_ENTROPY_BITS(gt2_ctx, 1); + cb[5] = CTX_ENTROPY_BITS(gt1_ctx, 1) + par1 + CTX_ENTROPY_BITS(gt2_ctx, 1); + } +} + + +static void xSetLastCoeffOffset( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const int width, + const int height, + rate_estimator_t* rate_estimator, + const color_t compID) +{ + int32_t cbfDeltaBits = 0; + if (compID == COLOR_Y && cur_tu->type != CU_INTRA /*&& !tu.depth*/) { + cbfDeltaBits = (int32_t)CTX_ENTROPY_BITS(&state->search_cabac.ctx.cu_qt_root_cbf_model, 1) - (int32_t)CTX_ENTROPY_BITS(&state->search_cabac.ctx.cu_qt_root_cbf_model, 0); + } else { + bool prevLumaCbf = false; + bool lastCbfIsInferred = false; + bool useIntraSubPartitions = cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode && compID == COLOR_Y; + if (useIntraSubPartitions) { + uint32_t nTus = uvg_get_isp_split_num(1 << cur_tu->log2_width, 1 << cur_tu->log2_height, cur_tu->intra.isp_mode, true); + bool isLastSubPartition = cur_tu->intra.isp_index +1 == nTus; //TODO: isp check + if (isLastSubPartition) { + lastCbfIsInferred = cur_tu->intra.isp_cbfs == 0; + } + if (!lastCbfIsInferred) { + prevLumaCbf = cur_tu->intra.isp_index != 0 && (cur_tu->intra.isp_cbfs & (1 << (cur_tu->intra.isp_index - 1))); + } + const cabac_ctx_t * const cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_luma[2 + prevLumaCbf]; + cbfDeltaBits = lastCbfIsInferred ? 0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + } + else { + const cabac_ctx_t* cbf_ctx; + switch (compID) { + case COLOR_Y: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_luma[0]; + break; + case COLOR_U: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cb[0]; + break; + case COLOR_V: + cbf_ctx = &state->search_cabac.ctx.qt_cbf_model_cr[cbf_is_set(cur_tu->cbf, COLOR_U)]; + break; + } + cbfDeltaBits = compID != COLOR_Y && cur_tu->joint_cb_cr ? 0 : (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 1) - (int32_t)CTX_ENTROPY_BITS(cbf_ctx, 0); + } + + } + + static const unsigned prefixCtx[] = {0, 0, 0, 3, 6, 10, 15, 21}; + uint32_t ctxBits[14]; + for (unsigned xy = 0; xy < 2; xy++) { + int32_t bitOffset = (xy ? cbfDeltaBits : 0); + int32_t* lastBits = (xy ? rate_estimator->m_lastBitsY : rate_estimator->m_lastBitsX); + const unsigned size = (xy ? (height) : (width)); + const unsigned log2Size = uvg_math_ceil_log2(size); + const bool useYCtx = (xy != 0); + const cabac_ctx_t* const ctxSetLast = useYCtx ? + (compID == COLOR_Y ? state->search_cabac.ctx.cu_ctx_last_y_luma : state->search_cabac.ctx.cu_ctx_last_y_chroma) : + (compID == COLOR_Y ? state->search_cabac.ctx.cu_ctx_last_x_luma : state->search_cabac.ctx.cu_ctx_last_x_chroma); + const unsigned lastShift = (compID == COLOR_Y ? (log2Size + 1) >> 2 : CLIP(0, 2, size >> 3)); + const unsigned lastOffset = (compID == COLOR_Y ? 
(prefixCtx[log2Size]) : 0); + uint32_t sumFBits = 0; + unsigned maxCtxId = g_group_idx[MIN(32, size) - 1]; + for (unsigned ctxId = 0; ctxId < maxCtxId; ctxId++) { + ctxBits[ctxId] = sumFBits + + CTX_ENTROPY_BITS(&ctxSetLast[lastOffset + (ctxId >> lastShift)], 0) + + (ctxId > 3 ? ((ctxId - 2) >> 1) << SCALE_BITS : 0) + + bitOffset; + sumFBits += CTX_ENTROPY_BITS(&ctxSetLast[lastOffset + (ctxId >> lastShift)], 1); + } + ctxBits[maxCtxId] = sumFBits + (maxCtxId > 3 ? ((maxCtxId - 2) >> 1) << SCALE_BITS : 0) + bitOffset; + for (unsigned pos = 0; pos < MIN(32, size); pos++) { + lastBits[pos] = ctxBits[g_group_idx[pos]]; + } + } +} + + +static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2], uint32_t gtx_frac_bits[6]) +{ + state->m_rdCost = INT64_MAX >> 1; + state->m_numSigSbb = 0; + state->m_remRegBins = 4; // just large enough for last scan pos + state->m_refSbbCtxId = -1; + state->m_sigFracBits[0] = sig_frac_bits[0]; + state->m_sigFracBits[1] = sig_frac_bits[1]; + memcpy(state->m_coeffFracBits, gtx_frac_bits, sizeof(state->m_coeffFracBits)); + state->m_goRicePar = 0; + state->m_goRiceZero = 0; + + state->m_sbbFracBits[0] = 0; + state->m_sbbFracBits[1] = 0; +} + + +void uvg_dep_quant_check_rd_costs( + const all_depquant_states * const state, + const enum ScanPosType spt, + const PQData * pqDataA, + Decision * decisions, + const int decisionA, + const int decisionB, + const int state_offset) +{ + const int pqA = decisionA && decisionB ? 3 : 0; + const int pqB = decisionA && decisionB ? 1 : 2; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } + else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } + else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } + else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } + else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } + else { + rdCostZ = decisions->rdCost[decisionA]; + } + } + else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqA] - 1 + : (pqDataA->absLevel[pqA] < RICEMAX ? 
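For orientation in uvg_dep_quant_check_rd_costs above: dependent quantization interleaves two scalar quantizers and walks a four-state trellis driven by the parity of each level, which is why two competing quantization indices (pqA/pqB) are evaluated per decision slot. A sketch of the standard VVC state machine, transcribed from the spec rather than from this patch:

// next_state[current][level & 1]; states 0-1 use quantizer Q0, states 2-3 use Q1.
static const uint8_t next_state[4][2] = {
  { 0, 2 },
  { 2, 0 },
  { 1, 3 },
  { 3, 1 },
};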
pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] + ? pqDataA->absLevel[pqB] - 1 + : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + if (rdCostA < decisions->rdCost[decisionA]) { + decisions->rdCost[decisionA] = rdCostA; + decisions->absLevel[decisionA] = pqDataA->absLevel[pqA]; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; + } + if (rdCostZ < decisions->rdCost[decisionA]) { + decisions->rdCost[decisionA] = rdCostZ; + decisions->absLevel[decisionA] = 0; + decisions->prevId[decisionA] = state->m_stateId[state_offset]; + } + if (rdCostB < decisions->rdCost[decisionB]) { + decisions->rdCost[decisionB] = rdCostB; + decisions->absLevel[decisionB] = pqDataA->absLevel[pqB]; + decisions->prevId[decisionB] = state->m_stateId[state_offset]; + } +} + + +static INLINE unsigned templateAbsCompare(coeff_t sum) +{ + int rangeIdx = 0; + if (sum < g_riceT[0]) { + rangeIdx = 0; + } + else if (sum < g_riceT[1]) { + rangeIdx = 1; + } + else if (sum < g_riceT[2]) { + rangeIdx = 2; + } + else if (sum < g_riceT[3]) { + rangeIdx = 3; + } + else { + rangeIdx = 4; + } + return g_riceShift[rangeIdx]; +} + +static INLINE void update_common_context( + context_store* ctxs, + common_context * cc, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const int prev_state, + const int curr_state) +{ + const uint32_t numSbb = width_in_sbb * height_in_sbb; + const int curr_state_without_offset = curr_state & 3; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + int8_t prev_sbb_state = ctxs->m_allStates.m_refSbbCtxId[prev_state]; + if (prev_state != -1 && prev_sbb_state >= 0) { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb_state]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[scan_pos * 4 + i * 4 + prev_sbb_state]; + } + } + else { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = 0; + } + } + sbbFlags[cg_pos * 4 + curr_state_without_offset] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; + for (int i = 0; i < 16; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = ctxs->m_allStates.m_absLevels[curr_state / 4][i * 4 + curr_state_without_offset]; + } + + const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right * 4 + curr_state_without_offset] : false) + || (next_sbb_below ? sbbFlags[next_sbb_below* 4 + curr_state_without_offset] : false) ? 1 : 0); + ctxs->m_allStates.m_numSigSbb[curr_state] = 0; + if (prev_state != -1) { + ctxs->m_allStates.m_remRegBins[curr_state] = ctxs->m_allStates.m_remRegBins[prev_state]; + } + else { + int ctxBinSampleRatio = 28; + // (scanInfo.chType == COLOR_Y) ? 
MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + ctxs->m_allStates.m_remRegBins[curr_state] = (ctxs->m_allStates.effWidth * ctxs->m_allStates.effHeight * ctxBinSampleRatio) / 16; + } + ctxs->m_allStates.m_goRicePar[curr_state] = 0; + ctxs->m_allStates.m_refSbbCtxId[curr_state] = curr_state_without_offset; + ctxs->m_allStates.m_sbbFracBits[curr_state][0] = cc->m_sbbFlagBits[sigNSbb][0]; + ctxs->m_allStates.m_sbbFracBits[curr_state][1] = cc->m_sbbFlagBits[sigNSbb][1]; + + uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset >> 2]; + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = levels + scanBeg * 4; + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num) { + coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; +#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k] * 4 + curr_state_without_offset]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + UPDATE(0); + if (nbOut->num > 1) { + UPDATE(1); + if (nbOut->num > 2) { + UPDATE(2); + if (nbOut->num > 3) { + UPDATE(3); + if (nbOut->num > 4) { + UPDATE(4); + } + } + } + } +#undef UPDATE + templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1 << 3)) + (uint16_t)(MIN(127, sumAbs) << 8); + } + else { + templateCtxInit[curr_state_without_offset + id * 4] = 0; + } + } + for (int i = curr_state_without_offset; i < 64; i += 4) { + ctxs->m_allStates.m_absLevels[curr_state >> 2][i] = 0; + } +} + + + +void uvg_dep_quant_update_state_eos( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision * decisions, + int decision_id) +{ + all_depquant_states* state = &ctxs->m_allStates; + int curr_state_offset = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[curr_state_offset] = decisions->rdCost[decision_id]; + if (decisions->prevId[decision_id] > -2) { + int prvState = -1; + if (decisions->prevId[decision_id] >= 4) { + prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } + } + else if (decisions->prevId[decision_id] >= 0) { + prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id]; + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i + decision_id] = + state->m_absLevels[ctxs->m_prev_state_offset / 4][i + decisions->prevId[decision_id]]; + } + } + else { + state->m_numSigSbb[curr_state_offset] = 1; + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } + } + uint8_t* temp = &state->m_absLevels[ctxs->m_curr_state_offset / 4][(scan_pos & 15) * 4 + decision_id]; + *temp = (uint8_t)MIN(255, decisions->absLevel[decision_id]); + + update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, + next_sbb_below, prvState, ctxs->m_curr_state_offset + decision_id); + + coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; + coeff_t sumNum = tinit & 7; + coeff_t 
sumAbs1 = (tinit >> 3) & 31; + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[curr_state_offset][0] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[curr_state_offset][1] = state->m_sigFracBitsArray[curr_state_offset][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + + memcpy(state->m_coeffFracBits[curr_state_offset], + state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + } +} + + +void uvg_dep_quant_update_state( + context_store * ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id) { + all_depquant_states* state = &ctxs->m_allStates; + int state_id = ctxs->m_curr_state_offset + decision_id; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + int32_t prev_id_no_offset = decisions->prevId[decision_id]; + if (prev_id_no_offset > -2) { + if (prev_id_no_offset >= 0) { + const int prvState = ctxs->m_prev_state_offset + prev_id_no_offset; + state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; + state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; + state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; + state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; + state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; + state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; + if (state->m_remRegBins[state_id] >= 4) { + state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 + ? (unsigned)decisions->absLevel[decision_id] + : 3); + } + for (int i = 0; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } + } + else { + state->m_numSigSbb[state_id] = 1; + state->m_refSbbCtxId[state_id] = -1; + int ctxBinSampleRatio = 28; + //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; + state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( + decisions->absLevel[decision_id] < 2 ? 
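+      // First coefficient of the block: start from the full budget of 28 context-coded
+      // bins per 16 samples and charge what this level consumes (none for a level of 0,
+      // one for 1, three for 2 or more).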
(unsigned)decisions->absLevel[decision_id] : 3); + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][i] = 0; + } + for (int i = decision_id; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][i] = 0; + } + } + state->all_gte_four &= state->m_remRegBins[state_id] >= 4; + state->all_lt_four &= state->m_remRegBins[state_id] < 4; + uint8_t* levels = state->m_absLevels[ctxs->m_curr_state_offset >> 2]; + levels[(scan_pos & 15) * 4 + decision_id] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); + + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN( + (sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN( + (sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], + sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + state->m_goRicePar[state_id] += currentShift; + } + else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } + else { + coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + unsigned currentShift = templateAbsCompare(sumAbs); + sumAbs = sumAbs >> currentShift; + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + state->m_goRicePar[state_id] += currentShift; + } + else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
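+      // In bypass mode a zero level is signaled with this codeword: position 1 for
+      // quantizer states 0-1, position 2 for states 2-3, scaled by the Rice parameter.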
1 : 2) << state->m_goRicePar[state_id]; + } + } + else { + state->all_gte_four &= state->m_remRegBins[state_id] >= 4; + state->all_lt_four &= state->m_remRegBins[state_id] < 4; + } +} + + +int uvg_dep_quant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const int width, + const int height, + const coeff_t* srcCoeff, + coeff_t* coeff_out, + const color_t compID, + enum uvg_tree_type tree_type, + int* absSum, + const bool enableScalingLists) +{ + const encoder_control_t* const encoder = state->encoder_control; + //===== reset / pre-init ===== + const int baseLevel = 4; + context_store dep_quant_context; + dep_quant_context.m_curr_state_offset = 0; + dep_quant_context.m_prev_state_offset = 4; + dep_quant_context.m_skip_state_offset = 8; + + const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? + cur_tu->lfnst_idx : + cur_tu->cr_lfnst_idx; + + const int numCoeff = width * height; + + memset(coeff_out, 0x00, width * height * sizeof(coeff_t)); + *absSum = 0; + + const bool is_mts = compID == COLOR_Y && cur_tu->tr_idx > MTS_SKIP; + const bool is_ts = (cur_tu->tr_skip >> compID) & 1; + + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4,0,log2_tr_width,log2_tr_height); + const uint32_t* const cg_scan = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED,0,log2_tr_width,log2_tr_height); + + int32_t qp_scaled = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + qp_scaled = is_ts ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; + bool needs_block_size_trafo_scale = !is_ts && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + + const int32_t scalinglist_type = (cur_tu->type == CU_INTRA ? 0 : 3) + (int8_t)compID; + const int32_t *q_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; + + if (compID != COLOR_Y) { + dep_quant_context.m_quant = (quant_block*)& state->quant_blocks[2]; + } else if (cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP) { + dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[1]; + } else { + dep_quant_context.m_quant = (quant_block*)&state->quant_blocks[0]; + } + //TODO: no idea when it is safe not to reinit for inter + if (dep_quant_context.m_quant->needs_init || cur_tu->type == CU_INTER) { + init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, -1); + } + + //===== scaling matrix ==== + //const int qpDQ = cQP.Qp + 1; + //const int qpPer = qpDQ / 6; + //const int qpRem = qpDQ - 6 * qpPer; + + //TCoeff thresTmp = thres; + bool zeroOut = false; + bool zeroOutforThres = false; + int effWidth = width, effHeight = height; + if ( + (is_mts || + (state->encoder_control->cfg.mts && 0 /*sbt used by block*/ && + height <= 32 && width <= 32)) && + compID == COLOR_Y) { + effHeight = (height == 32) ? 16 : height; + effWidth = (width == 32) ? 16 : width; + zeroOut = (effHeight < height || effWidth < width); + } + zeroOutforThres = zeroOut || (32 < height || 32 < width); + //===== find first test position ===== + int firstTestPos = numCoeff - 1; + if ( + lfnstIdx > 0 && !is_ts && width >= 4 && + height >= 4) { + firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 
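+    // LFNST zeroes out all but the first 8 coefficients for 4x4 and 8x8 blocks,
+    // and all but the first 16 otherwise, so the search can start from there.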
7 : 15; + } + uvg_find_first_non_zero_coeff( + srcCoeff, + enableScalingLists, + &dep_quant_context, + scan, + q_coeff, + &firstTestPos, + width, + height); + if (firstTestPos < 0) { + return 0; + } + + //===== real init ===== + rate_estimator_t* rate_estimator = (rate_estimator_t *)(compID == COLOR_Y && cur_tu->type == CU_INTRA && cur_tu->intra.isp_mode != ISP_MODE_NO_ISP ? + &state->rate_estimator[3] : &state->rate_estimator[compID]); + if(rate_estimator->needs_init || cur_tu->type == CU_INTER) { + init_rate_esimator(rate_estimator, &state->search_cabac, compID); + xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); + rate_estimator->needs_init = false; + } else if (compID == COLOR_U && state->encoder_control->cfg.jccr) { + xSetLastCoeffOffset(state, cur_tu, width, height, rate_estimator, compID); + } + + reset_common_context(&dep_quant_context.m_common_context, rate_estimator, (width * height) >> 4, numCoeff); + dep_quant_context.m_common_context.m_nbInfo = encoder->m_scanId2NbInfoOutArray[log2_tr_width][log2_tr_height]; + + + int effectHeight = MIN(32, effHeight); + int effectWidth = MIN(32, effWidth); + for (int k = 0; k < 12; k++) { + dep_quant_context.m_allStates.m_rdCost[k] = INT64_MAX >> 1; + dep_quant_context.m_allStates.m_numSigSbb[k] = 0; + dep_quant_context.m_allStates.m_remRegBins[k] = 4; // just large enough for last scan pos + dep_quant_context.m_allStates.m_refSbbCtxId[k] = -1; + dep_quant_context.m_allStates.m_sigFracBits[k][0] = rate_estimator->m_sigFracBits[0][0][0]; + dep_quant_context.m_allStates.m_sigFracBits[k][1] = rate_estimator->m_sigFracBits[0][0][1]; + memcpy(dep_quant_context.m_allStates.m_coeffFracBits[k], rate_estimator->m_gtxFracBits[0], sizeof(dep_quant_context.m_allStates.m_coeffFracBits[k])); + dep_quant_context.m_allStates.m_goRicePar[k] = 0; + dep_quant_context.m_allStates.m_goRiceZero[k] = 0; + + dep_quant_context.m_allStates.m_sbbFracBits[k][0] = 0; + dep_quant_context.m_allStates.m_sbbFracBits[k][1] = 0; + + dep_quant_context.m_allStates.m_stateId[k] = k & 3; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + memcpy(dep_quant_context.m_allStates.m_sigFracBitsArray[k][i], rate_estimator->m_sigFracBits[(k & 3 ? (k & 3) - 1 : 0)][i], sizeof(uint32_t) * 2); + } + } + + dep_quant_context.m_allStates.effHeight = effectHeight; + dep_quant_context.m_allStates.effWidth = effectWidth; + dep_quant_context.m_allStates.all_gte_four = true; + dep_quant_context.m_allStates.all_lt_four = false; + dep_quant_context.m_allStates.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 21 : 11); ++i) { + memcpy(dep_quant_context.m_allStates.m_gtxFracBitsArray[i], rate_estimator->m_gtxFracBits[i], sizeof(int32_t) * 6); + } + + depquant_state_init(&dep_quant_context.m_startState, rate_estimator->m_sigFracBits[0][0], rate_estimator->m_gtxFracBits[0]); + dep_quant_context.m_startState.effHeight = effectHeight; + dep_quant_context.m_startState.effWidth = effectWidth; + dep_quant_context.m_startState.m_stateId = 0; + dep_quant_context.m_startState.m_commonCtx = &dep_quant_context.m_common_context; + for (int i = 0; i < (compID == COLOR_Y ? 12 : 8); ++i) { + dep_quant_context.m_startState.m_sigFracBitsArray[i] = rate_estimator->m_sigFracBits[0][i]; + } + for (int i = 0; i < (compID == COLOR_Y ? 
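+  // luma uses 21 greater-than-x context sets, chroma 11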
21 : 11); ++i) { + dep_quant_context.m_startState.m_gtxFracBitsArray[i] = rate_estimator->m_gtxFracBits[i]; + } + + const uint32_t height_in_sbb = MAX(height >> 2, 1); + const uint32_t width_in_sbb = MAX(width >> 2, 1); + + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + //===== populate trellis ===== + for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { + uint32_t blkpos = scan[scanIdx]; + struct dep_quant_scan_info* scan_info = &encoder->scan_info[log2_tr_width][log2_tr_height][scanIdx]; + + context_store* ctxs = &dep_quant_context; + if (enableScalingLists) { + init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); + + uvg_dep_quant_decide_and_update( + rate_estimator, + ctxs, + scan_info, + abs(srcCoeff[blkpos]), + scanIdx, + width_in_sbb, + height_in_sbb, + encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], + (zeroOut && (scan_info->pos_x >= effWidth || scan_info->pos_y >= effHeight)), + q_coeff[blkpos], + width, + height, + compID != 0 + ); //tu.cu->slice->getReverseLastSigCoeffFlag()); + } + else { + uvg_dep_quant_decide_and_update( + rate_estimator, + ctxs, + scan_info, + abs(srcCoeff[blkpos]), + scanIdx, + width_in_sbb, + height_in_sbb, + encoder->m_scanId2NbInfoSbbArray[log2_tr_width][log2_tr_height][scanIdx ? scanIdx - 1 : 0], + (zeroOut && (scan_info->pos_x >= effWidth || scan_info->pos_y >= effHeight)), + default_quant_coeff, + width, + height, + compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); + } + } + + //===== find best path ===== + int prev_id = -1; + int64_t minPathCost = 0; + for (int8_t stateId = 0; stateId < 4; stateId++) { + int64_t pathCost = dep_quant_context.m_trellis[0].rdCost[stateId]; + if (pathCost < minPathCost) { + prev_id = stateId; + minPathCost = pathCost; + } + } + + //===== backward scanning ===== + int scanIdx = 0; + context_store* ctxs = &dep_quant_context; + for (; prev_id >= 0; scanIdx++) { + Decision temp = dep_quant_context.m_trellis[scanIdx]; + int32_t blkpos = scan[scanIdx]; + coeff_out[blkpos] = (srcCoeff[blkpos] < 0 ? 
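+    // the trellis tracks absolute levels only, so re-apply the sign of the input coefficient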
-temp.absLevel[prev_id] : temp.absLevel[prev_id]);
+    *absSum += temp.absLevel[prev_id];
+    prev_id = temp.prevId[prev_id];
+  }
+  return *absSum;
+}
+
+
+void uvg_dep_quant_dequant(
+  const encoder_state_t* const state,
+  const int block_type,
+  const int width,
+  const int height,
+  const color_t compID,
+  coeff_t* quant_coeff,
+  coeff_t* coeff,
+  bool enableScalingLists)
+{
+  const encoder_control_t* const encoder = state->encoder_control;
+
+  const int numCoeff = width * height;
+
+  const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
+  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+  const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, 0, log2_tr_width, log2_tr_height);
+  bool needs_block_size_trafo_scale = ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 0; // Non log2 block size
+
+  //----- reset coefficients and get last scan index -----
+  memset(coeff, 0, numCoeff * sizeof(coeff_t));
+  int lastScanIdx = -1;
+  for (int scanIdx = numCoeff - 1; scanIdx >= 0; scanIdx--)
+  {
+    if (quant_coeff[scan[scanIdx]])
+    {
+      lastScanIdx = scanIdx;
+      break;
+    }
+  }
+  if (lastScanIdx < 0)
+  {
+    return;
+  }
+
+  //----- set dequant parameters -----
+  const int qpDQ = uvg_get_scaled_qp(compID, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]) + 1;
+  const int qpPer = qpDQ / 6;
+  const int qpRem = qpDQ - 6 * qpPer;
+  const int channelBitDepth = encoder->bitdepth;
+  const int maxLog2TrDynamicRange = MAX_TR_DYNAMIC_RANGE;
+  const coeff_t minTCoeff = -(1 << maxLog2TrDynamicRange);
+  const coeff_t maxTCoeff = (1 << maxLog2TrDynamicRange) - 1;
+  const int transformShift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale;
+  int shift = IQUANT_SHIFT + 1 - qpPer - transformShift + (enableScalingLists ? 4 : 0);
+  int invQScale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale ? 1 : 0][qpRem];
+  int add = (shift < 0) ? 0 : ((1 << shift) >> 1);
+  int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(compID);
+
+  const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qpDQ % 6];
+  //----- dequant coefficients -----
+  for (int state = 0, scanIdx = lastScanIdx; scanIdx >= 0; scanIdx--)
+  {
+    const unsigned rasterPos = scan[scanIdx];
+    const coeff_t level = quant_coeff[rasterPos];
+    if (level)
+    {
+      if (enableScalingLists)
+      {
+        invQScale = dequant_coef[rasterPos]; // scaling factor * levelScale
+      }
+      if (shift < 0 && (enableScalingLists || scanIdx == lastScanIdx))
+      {
+        invQScale <<= -shift;
+      }
+      int qIdx = (level << 1) + (level > 0 ? -(state >> 1) : (state >> 1));
+      int64_t nomTCoeff = ((int64_t)qIdx * (int64_t)invQScale + add) >> ((shift < 0) ? 0 : shift);
+      coeff[rasterPos] = (coeff_t)CLIP(minTCoeff, maxTCoeff, nomTCoeff);
+    }
+    state = (32040 >> ((state << 2) + ((level & 1) << 1))) & 3; // the 16-bit value 32040 packs the 2-bit state transition table, indexed by (state, level parity)
+  }
+}
diff --git a/src/dep_quant.h b/src/dep_quant.h
new file mode 100644
index 00000000..6ef54f4d
--- /dev/null
+++ b/src/dep_quant.h
@@ -0,0 +1,247 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+#ifndef DEP_QUANT_H_
+#define DEP_QUANT_H_
+
+#include "cu.h"
+#include "global.h"
+
+#define SM_NUM_CTX_SETS_SIG 3
+#define SM_NUM_CTX_SETS_GTX 2
+#define SM_MAX_NUM_SIG_SBB_CTX 2
+#define SM_MAX_NUM_SIG_CTX 12
+#define SM_MAX_NUM_GTX_CTX 21
+#define SCALE_BITS 15
+#define RICEMAX 32
+
+typedef struct encoder_control_t encoder_control_t;
+
+enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };
+
+struct dep_quant_scan_info
+{
+  uint8_t sig_ctx_offset[2];
+  uint8_t gtx_ctx_offset[2];
+  uint16_t cg_pos;
+  uint16_t pos_y;
+  uint16_t pos_x;
+  uint8_t next_sbb_right;
+  uint8_t next_sbb_below;
+};
+
+typedef struct
+{
+  int m_QShift;
+  int64_t m_QAdd;
+  int64_t m_QScale;
+  int64_t m_maxQIdx;
+  int64_t m_thresLast;
+  int64_t m_thresSSbb;
+  // distortion normalization
+  int m_DistShift;
+  int64_t m_DistAdd;
+  int64_t m_DistStepAdd;
+  int64_t m_DistOrgFact;
+  bool needs_init;
+} quant_block;
+
+typedef struct
+{
+  int32_t m_lastBitsX[TR_MAX_WIDTH];
+  int32_t m_lastBitsY[TR_MAX_WIDTH];
+  uint32_t m_sigSbbFracBits[SM_MAX_NUM_SIG_SBB_CTX][2];
+  uint32_t m_sigFracBits[SM_NUM_CTX_SETS_SIG][SM_MAX_NUM_SIG_CTX][2];
+  int32_t m_gtxFracBits[SM_MAX_NUM_GTX_CTX][6];
+  bool needs_init;
+} rate_estimator_t;
+
+
+typedef struct
+{
+  uint8_t num;
+  uint8_t inPos[5];
+} NbInfoSbb;
+
+typedef struct
+{
+  uint16_t maxDist;
+  uint16_t num;
+  uint16_t outPos[5];
+} NbInfoOut;
+
+typedef struct {
+  int32_t absLevel[4];
+  int64_t deltaDist[4];
+} PQData;
+
+typedef struct {
+  int64_t ALIGNED(32) rdCost[8];
+  int32_t ALIGNED(32) absLevel[8];
+  int32_t ALIGNED(32) prevId[8];
+} Decision;
+
+
+typedef struct {
+  uint8_t* sbbFlags;
+  uint8_t* levels;
+} SbbCtx;
+
+typedef struct {
+  const NbInfoOut* m_nbInfo;
+  uint32_t m_sbbFlagBits[2][2];
+  SbbCtx m_allSbbCtx[2];
+  int m_curr_sbb_ctx_offset;
+  int m_prev_sbb_ctx_offset;
+  uint8_t sbb_memory[8 * 1024];
+  uint8_t level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH];
+  int
num_coeff; +} common_context; + + +typedef struct { + int64_t m_rdCost; + uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t m_numSigSbb; + int m_remRegBins; + int8_t m_refSbbCtxId; + uint32_t m_sbbFracBits[2]; + uint32_t m_sigFracBits[2]; + int32_t m_coeffFracBits[6]; + int8_t m_goRicePar; + int8_t m_goRiceZero; + int8_t m_stateId; + uint32_t* m_sigFracBitsArray[12]; + int32_t* m_gtxFracBitsArray[21]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; +} depquant_state; +typedef struct { + int64_t ALIGNED(32) m_rdCost[12]; + uint8_t ALIGNED(32) m_absLevels[3][16 * 4]; + uint16_t ALIGNED(32) m_ctxInit[3][16 * 4]; + int8_t ALIGNED(16) m_numSigSbb[12]; + int ALIGNED(32) m_remRegBins[12]; + int8_t ALIGNED(16) m_refSbbCtxId[12]; + uint32_t ALIGNED(32) m_sbbFracBits[12][2]; + uint32_t ALIGNED(32) m_sigFracBits[12][2]; + int32_t ALIGNED(32) m_coeffFracBits[12][6]; + int8_t ALIGNED(16) m_goRicePar[12]; + int8_t ALIGNED(16) m_goRiceZero[12]; + int8_t ALIGNED(16) m_stateId[12]; + uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; + int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; + + bool all_gte_four; + bool all_lt_four; +} all_depquant_states; + +typedef struct { + common_context m_common_context; + all_depquant_states m_allStates; + int m_curr_state_offset; + int m_prev_state_offset; + int m_skip_state_offset; + depquant_state m_startState; + quant_block* m_quant; + Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; +} context_store; + + +int uvg_init_nb_info(encoder_control_t* encoder); +void uvg_dealloc_nb_info(encoder_control_t* encoder); + + +void uvg_dep_quant_dequant( + const encoder_state_t* const state, + const int block_type, + const int width, + const int height, + const color_t compID, + coeff_t* quant_coeff, + coeff_t* coeff, + bool enableScalingLists); + +int uvg_dep_quant( + const encoder_state_t* const state, + const cu_info_t* const cur_tu, + const int width, + const int height, + const coeff_t* srcCoeff, + coeff_t* coeff_out, + const color_t compID, + enum uvg_tree_type tree_type, + int* absSum, + const bool enableScalingLists); + + +void uvg_dep_quant_update_state( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + int decision_id); + + +void uvg_dep_quant_update_state_eos( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision* decisions, + int decision_id); + +void uvg_dep_quant_check_rd_costs( + const all_depquant_states* const state, + const enum ScanPosType spt, + const PQData* pqDataA, + Decision* decisions, + const int decisionA, + const int decisionB, + const int state_offset); +#endif diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 7a3f401c..858d89f4 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -47,18 +47,19 @@ #include "tables.h" #include "videoframe.h" -bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu) +bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu, const cu_loc_t* + const 
cu_loc) { uint32_t ts_max_size = 1 << state->encoder_control->cfg.trskip_max_size; const uint32_t max_size = 32; // CU::isIntra(cu) ? MTS_INTRA_MAX_CU_SIZE : MTS_INTER_MAX_CU_SIZE; - const uint32_t cu_width = LCU_WIDTH >> pred_cu->depth; - const uint32_t cu_height = LCU_WIDTH >> pred_cu->depth; + const uint32_t cu_width = cu_loc->width; + const uint32_t cu_height = cu_loc->height; //bool mts_allowed = cu.chType == CHANNEL_TYPE_LUMA && compID == COMPONENT_Y; uint8_t mts_type = state->encoder_control->cfg.mts; bool mts_allowed = mts_type == UVG_MTS_BOTH || (pred_cu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : pred_cu->type == CU_INTER && mts_type == UVG_MTS_INTER); mts_allowed &= cu_width <= max_size && cu_height <= max_size; - //mts_allowed &= !cu.ispMode; // ISP_TODO: Uncomment this when ISP is implemented. + mts_allowed &= pred_cu->type == CU_INTRA ? !pred_cu->intra.isp_mode : true; //mts_allowed &= !cu.sbtInfo; mts_allowed &= !(pred_cu->bdpcmMode && cu_width <= ts_max_size && cu_height <= ts_max_size); mts_allowed &= pred_cu->tr_idx != MTS_SKIP && !pred_cu->violates_mts_coeff_constraint && pred_cu->mts_last_scan_pos ; @@ -66,14 +67,16 @@ bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pr return mts_allowed; } -static void encode_mts_idx(encoder_state_t * const state, +static void encode_mts_idx( + encoder_state_t * const state, cabac_data_t * const cabac, - const cu_info_t *const pred_cu) + const cu_info_t *const pred_cu, + const cu_loc_t* const cu_loc) { //TransformUnit &tu = *cu.firstTU; int mts_idx = pred_cu->tr_idx; - if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu) && mts_idx != MTS_SKIP + if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu, cu_loc) && mts_idx != MTS_SKIP && !pred_cu->violates_mts_coeff_constraint && pred_cu->mts_last_scan_pos ) @@ -102,122 +105,67 @@ static void encode_mts_idx(encoder_state_t * const state, } } -// ISP_TODO: move these defines to a proper place when ISP is implemented -// As of now, these are only needed in lfnst checks -#define NOT_INTRA_SUBPARTITIONS 0 -#define HOR_INTRA_SUBPARTITIONS 1 -#define VER_INTRA_SUBPARTITIONS 2 -#define NUM_INTRA_SUBPARTITIONS_MODES 3 -#define INTRA_SUBPARTITIONS_RESERVED 4 -#define TU_1D_HOR_SPLIT 8 -#define TU_1D_VER_SPLIT 9 -#define MIN_TB_SIZE_X 4 -#define MIN_TB_SIZE_Y 4 - -static int get_isp_split_dim(const int width, const int height, const int isp_split_type) -{ - bool divide_tu_in_rows = isp_split_type == TU_1D_HOR_SPLIT; - uint32_t split_dim_size, non_split_dim_size, partition_size, div_shift = 2; - - if (divide_tu_in_rows) - { - split_dim_size = height; - non_split_dim_size = width; - } - else - { - split_dim_size = width; - non_split_dim_size = height; - } - - const unsigned min_num_samples_cu = 1 << ((uvg_math_floor_log2(MIN_TB_SIZE_Y) << 1)); - const unsigned factor_to_min_samples = non_split_dim_size < min_num_samples_cu ? min_num_samples_cu >> uvg_math_floor_log2(non_split_dim_size) : 1; - partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? 
factor_to_min_samples : (split_dim_size >> div_shift); - - assert(!(uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) < uvg_math_floor_log2(min_num_samples_cu)) && "Partition has less than minimum amount of samples."); - return partition_size; -} - -static bool can_use_lfnst_with_isp(const int width, const int height, const int isp_split_type, const enum uvg_tree_type tree_type) -{ - if (tree_type == UVG_CHROMA_T) { - return false; - } - if (isp_split_type == NOT_INTRA_SUBPARTITIONS) { - return false; - } - - const int tu_width = (isp_split_type == HOR_INTRA_SUBPARTITIONS) ? width : get_isp_split_dim(width, height, TU_1D_VER_SPLIT); - const int tu_height = (isp_split_type == HOR_INTRA_SUBPARTITIONS) ? get_isp_split_dim(width, height, TU_1D_HOR_SPLIT) : height; - - if (!(tu_width >= MIN_TB_SIZE_Y && tu_height >= MIN_TB_SIZE_Y)) - { - return false; - } - return true; -} - - bool uvg_is_lfnst_allowed( +bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, - const int width, - const int height, - const int x, - const int y, enum uvg_tree_type tree_type, const color_t color, - const lcu_t* lcu) + const cu_loc_t* const cu_loc, const lcu_t* const lcu) { - if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA) { - const int isp_mode = 0; // ISP_TODO: assign proper ISP mode when ISP is implemented - const int isp_split_type = 0; - const int depth = pred_cu->depth; - const int chroma_width = width >> 1; - const int chroma_height = height >> 1; - const int cu_width = tree_type != UVG_LUMA_T || depth == 4 ? width : chroma_width; - const int cu_height = tree_type != UVG_LUMA_T || depth == 4 ? height : chroma_height; - bool can_use_lfnst_with_mip = (width >= 16 && height >= 16); - bool is_sep_tree = depth == 4 || tree_type != UVG_BOTH_T; + if (state->encoder_control->cfg.lfnst && pred_cu->type == CU_INTRA && PU_IS_TU(pred_cu)) { + const int isp_mode = pred_cu->intra.isp_mode; + const int cu_width = tree_type != UVG_CHROMA_T ? 1 << pred_cu->log2_width : 1 << pred_cu->log2_chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? 1 << pred_cu->log2_height : 1 << pred_cu->log2_chroma_height; + bool can_use_lfnst_with_mip = (cu_width >= 16 && cu_height >= 16); + bool is_sep_tree = tree_type != UVG_BOTH_T; bool mip_flag = pred_cu->type == CU_INTRA && color == COLOR_Y ? 
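  // MIP is a luma-only tool, so the flag is only meaningful for COLOR_Y.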
pred_cu->intra.mip_flag : false; - if ((isp_mode && !can_use_lfnst_with_isp(width, height, isp_split_type, tree_type)) || - (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip) || + if ((isp_mode && !uvg_can_use_isp_with_lfnst(cu_width, cu_height, isp_mode, tree_type) && color == COLOR_Y) || + (pred_cu->type == CU_INTRA && mip_flag && !can_use_lfnst_with_mip && color == COLOR_Y) || (is_sep_tree && MIN(cu_width, cu_height) < 4) || - (cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH)) { + (cu_width > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)) || cu_height > (TR_MAX_WIDTH >> (tree_type == UVG_CHROMA_T)))) { return false; } - bool luma_flag = (depth == 4 && color == COLOR_Y) || (tree_type != UVG_CHROMA_T && depth != 4); - bool chroma_flag = (depth == 4 && color != COLOR_Y) || tree_type != UVG_LUMA_T; - bool non_zero_coeff_non_ts_corner_8x8 = (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + bool luma_flag = tree_type != UVG_CHROMA_T; + bool chroma_flag = tree_type != UVG_LUMA_T; + bool non_zero_coeff_non_ts_corner_8x8 = false; + bool last_scan_pos = false; bool is_tr_skip = false; - + + int split_num = color == COLOR_Y && isp_mode ? uvg_get_isp_split_num(cu_width, cu_height, isp_mode, false) : 0; const videoframe_t* const frame = state->tile->frame; - //const int num_pred_units = kvz_part_mode_num_parts[pred_cu->part_size]; - const int tr_depth = pred_cu->tr_depth; - assert(depth <= tr_depth && "Depth greater than transform depth. This should never trigger."); - const int num_transform_units = 1 << (2 * (tr_depth - depth)); - const int tu_row_length = 1 << (tr_depth - depth); - const int tu_width = cu_width >> (tr_depth - depth); - const int tu_height = tu_width; // TODO: height for non-square blocks - // TODO: chroma transform skip - if (color == COLOR_Y) { - for (int i = 0; i < num_transform_units; i++) { - // TODO: this works only for square blocks - const int tu_x = x + ((i % tu_row_length) * tu_width); - const int tu_y = y + ((i / tu_row_length) * tu_height); - const cu_info_t* cur_tu = lcu ? LCU_GET_CU_AT_PX(lcu, tu_x, tu_y) : uvg_cu_array_at_const(frame->cu_array, tu_x, tu_y); - assert(cur_tu != NULL && "NULL transform unit."); - bool cbf_set = cbf_is_set(cur_tu->cbf, tr_depth, COLOR_Y); + if (split_num) { + // Constraints for ISP split blocks + for (int i = 0; i < split_num; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_width, cu_height, i, isp_mode, false); + int local_split_x = lcu ? split_loc.local_x : split_loc.x; + int local_split_y = lcu ? split_loc.local_y : split_loc.y; + uvg_get_isp_cu_arr_coords(&local_split_x, &local_split_y, MAX(cu_width, cu_height)); + const cu_info_t* split_cu = lcu ? 
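+        // During the search the CUs live in the local LCU working copy; when writing the
+        // final bitstream (lcu == NULL) they are read from the frame-level CU array.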
LCU_GET_CU_AT_PX(lcu, local_split_x, local_split_y) : + uvg_cu_array_at_const(frame->cu_array, local_split_x, local_split_y); - if (cur_tu != NULL && cbf_set && cur_tu->tr_idx == MTS_SKIP) { - is_tr_skip = true; + //if (cbf_is_set(split_cu->cbf, depth, COLOR_Y)) { + // ISP_TODO: remove this if clause altogether if it seems it is not needed + if (true) { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && split_cu->violates_lfnst_constrained_luma) || (chroma_flag && split_cu->violates_lfnst_constrained_chroma); + //last_scan_pos |= split_cu->lfnst_last_scan_pos; + last_scan_pos |= true; } } } + else { + non_zero_coeff_non_ts_corner_8x8 |= (luma_flag && pred_cu->violates_lfnst_constrained_luma) || (chroma_flag && pred_cu->violates_lfnst_constrained_chroma); + last_scan_pos |= pred_cu->lfnst_last_scan_pos; + } - if ((!pred_cu->lfnst_last_scan_pos && !isp_mode) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { + if (color == COLOR_Y && pred_cu->tr_idx == MTS_SKIP) { + is_tr_skip = true; + } + + if ((!last_scan_pos) || non_zero_coeff_non_ts_corner_8x8 || is_tr_skip) { return false; } return true; @@ -231,19 +179,15 @@ static bool encode_lfnst_idx( const encoder_state_t* const state, cabac_data_t * const cabac, const cu_info_t * const pred_cu, - const int x, - const int y, - const int depth, - const int width, - const int height, enum uvg_tree_type tree_type, - const color_t color) + const color_t color, + const cu_loc_t* const cu_loc) { - if (uvg_is_lfnst_allowed(state, pred_cu, width, height, x, y, tree_type, color, NULL)) { + if (uvg_is_lfnst_allowed(state, pred_cu, tree_type, color, cu_loc, NULL)) { // Getting separate tree bool from block size is a temporary fix until a proper dual tree check is possible (there is no dual tree structure at time of writing this). // VTM seems to force explicit dual tree structure for small 4x4 blocks - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool is_separate_tree = tree_type != UVG_BOTH_T; const int lfnst_index = !is_separate_tree || color == COLOR_Y ? 
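+  // With a separate (dual) tree, chroma carries its own LFNST index.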
pred_cu->lfnst_idx : pred_cu->cr_lfnst_idx; assert((lfnst_index >= 0 && lfnst_index < 3) && "Invalid LFNST index."); @@ -261,6 +205,12 @@ static bool encode_lfnst_idx( return true; } else { + if(color == COLOR_Y) { + assert(pred_cu->lfnst_idx == 0); + } + if(tree_type == UVG_CHROMA_T && color != COLOR_Y) { + assert(pred_cu->cr_lfnst_idx == 0); + } return false; } } @@ -269,9 +219,11 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, const coeff_t* coeff, uint32_t width, + uint32_t height, uint8_t type, int8_t scan_mode, - double* bits_out) { + double* bits_out) +{ //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; uint32_t i; @@ -282,10 +234,14 @@ void uvg_encode_ts_residual(encoder_state_t* const state, // CONSTANTS - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + // TODO: log2_cg_size is wrong if width != height + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_width][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); + double bits = 0; // Init base contexts according to block type @@ -293,23 +249,23 @@ void uvg_encode_ts_residual(encoder_state_t* const state, cabac->cur_ctx = base_coeff_group_ctx; - int maxCtxBins = (width * width * 7) >> 2; + int maxCtxBins = (width * height * 7) >> 2; unsigned scan_cg_last = (unsigned )-1; //unsigned scan_pos_last = (unsigned )-1; - for (i = 0; i < width * width; i++) { + for (i = 0; i < width * height; i++) { if (coeff[scan[i]]) { - //scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } - scan_cg_last = (width * width - 1) >> log2_cg_size; + // TODO: this won't work with non-square blocks + scan_cg_last = (width * height - 1) >> log2_cg_size; const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); bool no_sig_group_before_last = true; for (i = 0; i <= scan_cg_last; i++) { - if (!(width == 4 || (i ==scan_cg_last && no_sig_group_before_last))) { + if (!((width == 4 && height == 4) || (i ==scan_cg_last && no_sig_group_before_last))) { uint32_t cg_blkpos = scan_cg[i]; uint32_t cg_pos_y = cg_blkpos / cg_width; uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * cg_width); @@ -462,13 +418,13 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t type, uint8_t scan, double* bits_out) { const int index_x = uvg_math_floor_log2(width); - const int index_y = uvg_math_floor_log2(width); + const int index_y = uvg_math_floor_log2(height); const int prefix_ctx[8] = { 0, 0, 0, 3, 6, 10, 15, 21 }; //ToDo: own ctx_offset and shift for X and Y uint8_t ctx_offset_x = type ? 0 : prefix_ctx[index_x]; uint8_t ctx_offset_y = type ? 0 : prefix_ctx[index_y]; - uint8_t shift_x = type ? CLIP(0, 2, width>>3) : (index_x+1)>>2; - uint8_t shift_y = type ? 
CLIP(0, 2, width >> 3) : (index_y + 1) >> 2; + uint8_t shift_x = type ? CLIP(0, 2, width >> 3) : (index_x + 1) >> 2; + uint8_t shift_y = type ? CLIP(0, 2, height >> 3) : (index_y + 1) >> 2; double bits = 0; cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma); @@ -515,107 +471,130 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac, static void encode_chroma_tu( encoder_state_t* const state, - int x, - int y, - int depth, - const uint8_t width_c, + const cu_loc_t * const cu_loc, cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, - uint8_t joint_chroma, - enum - uvg_tree_type tree_type) + uint8_t joint_chroma) { - int x_local = ((x >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; - int y_local = ((y >> (tree_type != UVG_CHROMA_T)) & ~3) % LCU_WIDTH_C; + int width_c = cu_loc->chroma_width; + int height_c = cu_loc->chroma_height; + int x_local = (cu_loc->x >> 1) % LCU_WIDTH_C; + int y_local = (cu_loc->y >> 1) % LCU_WIDTH_C; cabac_data_t* const cabac = &state->cabac; - *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth); + *scan_idx = SCAN_DIAG; if(!joint_chroma){ - const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; - const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + // const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; + coeff_t coeff_u[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t coeff_v[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_u, coeff->u, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + uvg_get_sub_coeff(coeff_v, coeff->v, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) { - if(state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ + if (cbf_is_set(cur_pu->cbf, COLOR_U)) { + if(state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)){ cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; // HEVC only supports transform_skip for Luma // TODO: transform skip for chroma blocks CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, cu_loc, COLOR_U, *scan_idx, cur_pu, NULL); } - if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) { - if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if (cbf_is_set(cur_pu->cbf, COLOR_V)) { + if (state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, cu_loc, COLOR_V, *scan_idx, cur_pu, NULL); } } else { - const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)]; - if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << 
state->encoder_control->cfg.trskip_max_size)) { + coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + if (state->encoder_control->cfg.trskip_enable + && width_c <= (1 << state->encoder_control->cfg.trskip_max_size) + && height_c <= (1 << state->encoder_control->cfg.trskip_max_size)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma; CABAC_BIN(cabac, 0, "transform_skip_flag"); } - uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, cur_pu, NULL); + uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, cu_loc, COLOR_V, *scan_idx, cur_pu, NULL); } } static void encode_transform_unit( encoder_state_t * const state, - int x, - int y, - int depth, - bool only_chroma, + const cu_loc_t *cu_loc, + const cu_info_t* cur_pu, lcu_coeff_t* coeff, - enum uvg_tree_type tree_type) + bool only_chroma, + enum uvg_tree_type tree_type, + bool last_split, + const cu_loc_t *original_loc, + const cu_loc_t* const chroma_loc) // Original cu dimensions, before CU split { - assert(depth >= 1 && depth <= MAX_PU_DEPTH); - const videoframe_t * const frame = state->tile->frame; cabac_data_t* const cabac = &state->cabac; - const uint8_t width = LCU_WIDTH >> depth; - const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2); + const int x = cu_loc->x; + const int y = cu_loc->y; + const uint8_t width = cu_loc->width; + const uint8_t height = cu_loc->height; + const uint8_t width_c = cu_loc->chroma_width; + const uint8_t height_c = cu_loc->chroma_height; cu_array_t* used_cu_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_cu_array, x, y); + int isp_x = x; + int isp_y = y; + uvg_get_isp_cu_arr_coords(&isp_x, &isp_y, MAX(width, height)); + if(cur_pu == NULL) { + cur_pu = uvg_cu_array_at_const(used_cu_array, isp_x, isp_y); + } - int8_t scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); + int8_t scan_idx = SCAN_DIAG; - int cbf_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); + int cbf_y = cbf_is_set(cur_pu->cbf, COLOR_Y); if (cbf_y && !only_chroma) { int x_local = x % LCU_WIDTH; int y_local = y % LCU_WIDTH; - const coeff_t *coeff_y = &coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)]; + // const coeff_t *coeff_y = &coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)]; + coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_y, coeff->y, x_local, y_local, width, height, LCU_WIDTH); // CoeffNxN // Residual Coding - if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { + if(state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && !(cur_pu->type == CU_INTRA && cur_pu->intra.isp_mode != ISP_MODE_NO_ISP)) { cabac->cur_ctx = &cabac->ctx.transform_skip_model_luma; CABAC_BIN(cabac, cur_pu->tr_idx == MTS_SKIP, "transform_skip_flag"); - DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, width, (cur_pu->tr_idx == MTS_SKIP) ? 1 : 0); + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_TR_SKIP, x, y, width, height, (cur_pu->tr_idx == MTS_SKIP) ? 
1 : 0); } if(cur_pu->tr_idx == MTS_SKIP) { - uvg_encode_ts_residual(state, cabac, coeff_y, width, 0, scan_idx, NULL); + uvg_encode_ts_residual(state, cabac, coeff_y, width, height, 0, scan_idx, NULL); } else { uvg_encode_coeff_nxn(state, cabac, coeff_y, - width, + cu_loc, 0, scan_idx, (cu_info_t * )cur_pu, NULL); } + if (tree_type == UVG_LUMA_T) return; } bool joint_chroma = cur_pu->joint_cb_cr != 0; - if (depth == MAX_DEPTH) { + if (cur_pu->log2_height + cur_pu->log2_width < 6 && tree_type != UVG_CHROMA_T && !only_chroma) { // For size 4x4 luma transform the corresponding chroma transforms are // also of size 4x4 covering 8x8 luma pixels. The residual is coded in // the last transform unit. @@ -629,11 +608,12 @@ static void encode_transform_unit( } } - bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) || - cbf_is_set(cur_pu->cbf, depth, COLOR_V); - if (chroma_cbf_set || joint_chroma) { + bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, COLOR_U) || + cbf_is_set(cur_pu->cbf, COLOR_V); + if ((chroma_cbf_set || joint_chroma) && last_split && chroma_loc) { //Need to drop const to get lfnst constraints - encode_chroma_tu(state, x, y, depth, width_c, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma, tree_type); + // Use original dimensions instead of ISP split dimensions + encode_chroma_tu(state, chroma_loc, (cu_info_t*)cur_pu, &scan_idx, coeff, joint_chroma); } } @@ -642,120 +622,104 @@ static void encode_transform_unit( * \param x_pu Prediction units' x coordinate. * \param y_pu Prediction units' y coordinate. * \param depth Depth from LCU. - * \param tr_depth Depth from last CU. * \param parent_coeff_u What was signaled at previous level for cbf_cb. * \param parent_coeff_v What was signlaed at previous level for cbf_cr. */ static void encode_transform_coeff( encoder_state_t * const state, - int32_t x, - int32_t y, - int8_t depth, - int8_t tr_depth, - uint8_t parent_coeff_u, - uint8_t parent_coeff_v, + const cu_loc_t * cu_loc, bool only_chroma, lcu_coeff_t* coeff, - enum uvg_tree_type tree_type) + const cu_info_t* cur_tu, + enum uvg_tree_type tree_type, + bool last_split, + bool can_skip_last_cbf, + int *luma_cbf_ctx, + // Always true except when writing sub partition coeffs (ISP) + const cu_loc_t * const original_loc, + const cu_loc_t* const chroma_loc) // Original dimensions before ISP split { cabac_data_t * const cabac = &state->cabac; + + bool isp_split = cu_loc->x != original_loc->x || cu_loc->y != original_loc->y; + int x = cu_loc->x; + int y = cu_loc->y; + if (isp_split) { + uvg_get_isp_cu_arr_coords(&x, &y, MAX(cu_loc->width, cu_loc->height)); + } + //const encoder_control_t *const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, x, y); - // Round coordinates down to a multiple of 8 to get the location of the - // containing CU. - const int x_cu = 8 * (x / 8); - const int y_cu = 8 * (y / 8); - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); - - // NxN signifies implicit transform split at the first transform level. - // There is a similar implicit split for inter, but it is only used when - // transform hierarchy is not in use. - //int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN); - - // The implicit split by intra NxN is not counted towards max_tr_depth. 
- /* - int max_tr_depth; - if (cur_cu->type == CU_INTRA) { - max_tr_depth = ctrl->cfg.tr_depth_intra + intra_split_flag; - } else { - max_tr_depth = ctrl->tr_depth_inter; + if(cur_tu == NULL) { + cur_tu = uvg_cu_array_at_const(used_array, x, y); } - */ - int8_t split = (LCU_WIDTH >> depth > TR_MAX_WIDTH); + const int tr_limit = TR_MAX_WIDTH; + const bool ver_split = cu_loc->height > tr_limit; + const bool hor_split = cu_loc->width > tr_limit; - + const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_tu->cbf, COLOR_Y) : 0; + const int cb_flag_u = tree_type != UVG_LUMA_T ?(cur_tu->joint_cb_cr ? (cur_tu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_tu->cbf, COLOR_U)) : 0; + const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_tu->joint_cb_cr ? cur_tu->joint_cb_cr & 1 : cbf_is_set(cur_tu->cbf, COLOR_V)) : 0; - const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0; - const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0; - const int cb_flag_v = tree_type != UVG_LUMA_T ? (cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V)) : 0; - // The split_transform_flag is not signaled when: - // - transform size is greater than 32 (depth == 0) - // - transform size is 4 (depth == MAX_PU_DEPTH) - // - transform depth is max - // - cu is intra NxN and it's the first split - - //ToDo: check BMS transform split in QTBT - /* - if (depth > 0 && - depth < MAX_PU_DEPTH && - tr_depth < max_tr_depth && - !(intra_split_flag && tr_depth == 0)) - { - cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((uvg_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - CABAC_BIN(cabac, split, "split_transform_flag"); - } - */ - - // Chroma cb flags are not signaled when one of the following: - // - transform size is 4 (2x2 chroma transform doesn't exist) - // - they have already been signaled to 0 previously - // When they are not present they are inferred to be 0, except for size 4 - // when the flags from previous level are used. - if (state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || only_chroma) && tree_type != UVG_LUMA_T) { - - if (!split) { - if (true) { - assert(tr_depth < 5); - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); - } - if (true) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 
1 : 0]); - CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); - } + if (hor_split || ver_split) { + enum split_type split; + if (cu_loc->width > tr_limit && cu_loc->height > tr_limit) { + split = QT_SPLIT; + } + else if (cu_loc->width > tr_limit) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; } - } - if (split) { - uint8_t offset = LCU_WIDTH >> (depth + 1); - int x2 = x + offset; - int y2 = y + offset; - encode_transform_coeff(state, x, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x2, y, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); - encode_transform_coeff(state, x2, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v, only_chroma, coeff, tree_type); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc, NULL); + for (int i = 0; i < split_count; ++i) { + encode_transform_coeff(state, &split_cu_loc[i], only_chroma, + coeff, NULL, tree_type, true, false, luma_cbf_ctx, &split_cu_loc[i], chroma_loc ? &split_cu_loc[i] : NULL); + } return; } + + // Chroma cb flags are not signaled when one of the following holds: + // There is no chroma. + // This is not the last CU in an area of 64 pixels covered by more than one luma CU. + // This is not the last ISP split. + if (state->encoder_control->chroma_format != UVG_CSP_400 + && (chroma_loc || only_chroma) + && tree_type != UVG_LUMA_T + && last_split) { + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]); + CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); + } // Luma coded block flag is signaled when one of the following: // - prediction mode is intra // - transform depth > 0 // - we have chroma coefficients at this level // When it is not present, it is inferred to be 1. - if ((cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + if ((cur_tu->type == CU_INTRA || !PU_IS_TU(cur_tu) || cb_flag_u || cb_flag_v) && !only_chroma && tree_type != UVG_CHROMA_T) { + if (can_skip_last_cbf && isp_split && last_split) { + // Do not write luma cbf if the first three ISP splits have luma cbf 0 + } else { + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[*luma_cbf_ctx]); CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); + if (PU_IS_TU(cur_tu)) { + *luma_cbf_ctx = 2 + cb_flag_y; + } + } } if (cb_flag_y | cb_flag_u | cb_flag_v) { - if (state->must_code_qp_delta && (only_chroma || cb_flag_y || depth != 4) ) { - const int qp_pred = uvg_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); - const int qp_delta = cur_cu->qp - qp_pred; + if (state->must_code_qp_delta && (only_chroma || cb_flag_y || chroma_loc) ) { + const int qp_pred = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, state->last_qp); + const int qp_delta = cur_tu->qp - qp_pred; // Possible deltaQP range depends on bit depth as stated in HEVC specification.
assert(qp_delta >= UVG_QP_DELTA_MIN && qp_delta <= UVG_QP_DELTA_MAX && "QP delta not in valid range."); @@ -778,16 +742,18 @@ static void encode_transform_coeff( } if(( ((cb_flag_u || cb_flag_v ) - && cur_cu->type == CU_INTRA) + && cur_tu->type == CU_INTRA) || (cb_flag_u && cb_flag_v)) - && (depth != 4 || only_chroma || tree_type == UVG_CHROMA_T) + && (chroma_loc || only_chroma || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.jccr + && last_split ) { - assert(cur_pu->joint_cb_cr < 4 && "JointCbCr is in search state."); + assert(cur_tu->joint_cb_cr < 4 && "JointCbCr is in search state."); cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1]; - CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); + CABAC_BIN(cabac, cur_tu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag"); } - encode_transform_unit(state, x, y, depth, only_chroma, coeff, tree_type); + + encode_transform_unit(state, cu_loc, only_chroma ? cur_tu : NULL, coeff, only_chroma, tree_type, last_split, original_loc, chroma_loc); } } @@ -799,11 +765,13 @@ static void encode_transform_coeff( * \param depth Depth from LCU. * \return if non-zero mvd is coded */ -int uvg_encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu, double* bits_out) +int uvg_encode_inter_prediction_unit( + encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc) { // Mergeflag int16_t num_cand = 0; @@ -838,8 +806,8 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir; - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 - uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height) + 1) >> 1)); + if (cu_loc->width + cu_loc->height > 12) { // ToDo: limit on 4x8/8x4 + uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(cu_loc->width) + uvg_math_floor_log2(cu_loc->height) + 1) >> 1)); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); } @@ -890,16 +858,14 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, if (lcu) { uvg_inter_get_mv_cand( state, - x, y, width, height, - mv_cand, cur_cu, - lcu, ref_list_idx); + mv_cand, cur_cu, lcu, ref_list_idx, + cu_loc); } else { uvg_inter_get_mv_cand_cua( state, - x, y, width, height, - mv_cand, cur_cu, ref_list_idx - ); + mv_cand, cur_cu, ref_list_idx, cu_loc + ); } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); @@ -922,14 +888,14 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, } static void encode_chroma_intra_cu( - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - const int cclm_enabled, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + const int cclm_enabled, + int8_t luma_intra_dir, double* bits_out) { unsigned pred_mode = 0; unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83}; int8_t chroma_intra_dir = cur_cu->intra.mode_chroma; - int8_t luma_intra_dir = !cur_cu->intra.mip_flag ? 
cur_cu->intra.mode : 0; for(int i = 0; i < 4; i++) { if(chroma_pred_modes[i] == luma_intra_dir) { chroma_pred_modes[i] = 66; @@ -1011,10 +977,13 @@ static void encode_chroma_intra_cu( else if (cabac->only_count && bits_out)*bits_out += bits; } -void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int depth, const lcu_t* lcu, double* bits_out) +void uvg_encode_intra_luma_coding_unit( + const encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + double* bits_out) { const videoframe_t * const frame = state->tile->frame; uint8_t intra_pred_mode_actual; @@ -1026,6 +995,9 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uint32_t flag; double bits = 0; + const int x = cu_loc->x; + const int y = cu_loc->y; + /* if ((cur_cu->type == CU_INTRA && (LCU_WIDTH >> cur_cu->depth <= 32))) { cabac->cur_ctx = &(cabac->ctx.bdpcm_mode[0]); @@ -1049,16 +1021,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } */ - // Intra Subpartition mode - uint32_t width = (LCU_WIDTH >> depth); - uint32_t height = (LCU_WIDTH >> depth); - - bool enough_samples = uvg_g_convert_to_bit[width] + uvg_g_convert_to_bit[height] > (uvg_g_convert_to_bit[4 /* MIN_TB_SIZEY*/] << 1); - uint8_t isp_mode = 0; - // ToDo: add height comparison - //isp_mode += ((width > TR_MAX_WIDTH) || !enough_samples) ? 1 : 0; - //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 2 : 0; - bool allow_isp = enough_samples; + uint32_t width = cu_loc->width; + uint32_t height = cu_loc->height; // TODO: height for non-square blocks // Code MIP related bits bool enable_mip = state->encoder_control->cfg.mip; @@ -1083,9 +1047,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) { - const int cu_width = LCU_WIDTH >> depth; - const int cu_height = cu_width; // TODO: height for non-square blocks - uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, lcu, lcu ? NULL : frame->cu_array); + uint8_t ctx_id = uvg_get_mip_flag_context(cu_loc, lcu, lcu ? NULL : frame->cu_array); // Write MIP flag CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mip_flag[ctx_id]), mip_flag, bits, "mip_flag"); @@ -1104,7 +1066,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, int multi_ref_idx = enable_mrl ? cur_cu->intra.multi_ref_idx : 0; #ifdef UVG_DEBUG_PRINT_YUVIEW_CSV - if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, width, multi_ref_idx); + if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, height, multi_ref_idx); #endif if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) { @@ -1116,21 +1078,21 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } } + bool enable_isp = state->encoder_control->cfg.isp; + // Need at least 16 samples in sub blocks to use isp. If both dimensions are 4, not enough samples. Blocks of size 2 do not exist yet (not for luma at least) + bool allow_isp = enable_isp ? uvg_can_use_isp(width, height) : false; + uint8_t isp_mode = allow_isp ? 
cur_cu->intra.isp_mode : 0; - // ToDo: update real usage, these if clauses as such don't make any sense - if (isp_mode != 0 && multi_ref_idx == 0) { - if (isp_mode) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subPartitions"); - } else { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions"); - // ToDo: complete this if-clause - if (isp_mode == 3) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor"); - } + if (allow_isp && !multi_ref_idx /*&& !bdpcm && !color_transform*/) { + if (isp_mode == ISP_MODE_NO_ISP) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 0, bits, "intra_subpartitions_mode"); + } + else { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subpartitions_mode"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[1]), isp_mode - 1, bits, "intra_subpartitions_split_type"); // Vertical or horizontal split } } - - const int cu_width = LCU_WIDTH >> depth; + // PREDINFO CODING // If intra prediction mode is found from the predictors, // it can be signaled with two EP's. Otherwise we can send @@ -1145,7 +1107,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, if (x > 0) { assert(x >> 2 > 0); const int x_scu = SUB_SCU(x) - 1; - const int y_scu = SUB_SCU(y + cu_width - 1); + const int y_scu = SUB_SCU(y + height - 1); left_pu = lcu ? LCU_GET_CU_AT_PX( lcu, @@ -1154,7 +1116,7 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, uvg_cu_array_at_const( frame->cu_array, x - 1, - y + cu_width - 1); + y + height - 1); } // Don't take the above PU across the LCU boundary. if (y % LCU_WIDTH > 0 && y > 0) { @@ -1162,11 +1124,11 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, above_pu = lcu ? LCU_GET_CU_AT_PX( lcu, - SUB_SCU(x + cu_width - 1), + SUB_SCU(x + width - 1), SUB_SCU(y) - 1) : uvg_cu_array_at_const( frame->cu_array, - x + cu_width - 1, + x + width - 1, y - 1); } @@ -1185,8 +1147,8 @@ void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state, } // Is the mode in the MPM array or not flag = (mpm_preds == -1) ? 
0 : 1; - if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "prev_intra_luma_pred_flag"); + if (cur_pu->intra.multi_ref_idx == 0) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "intra_luma_mpm_flag"); } // Signal index of the prediction mode in the prediction list, if it is there @@ -1262,144 +1224,139 @@ if (cabac->only_count && bits_out) *bits_out += bits; } -bool uvg_write_split_flag( - const encoder_state_t * const state, + +uint8_t uvg_write_split_flag( + const encoder_state_t* const state, cabac_data_t* cabac, - const cu_info_t * left_cu, - const cu_info_t * above_cu, - uint8_t split_flag, - int depth, - int cu_width, - int x, - int y, + const cu_info_t* left_cu, + const cu_info_t* above_cu, + const cu_loc_t* const cu_loc, + split_tree_t split_tree, enum uvg_tree_type tree_type, + bool* is_implicit_out, double* bits_out) { - uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); double bits = 0; - const encoder_control_t* const ctrl = state->encoder_control; // Implicit split flag when on border. // An exception is made in VVC: the flag is not implicit if a BT split can be used // horizontally or vertically; in that case this flag tells whether QT or BT is used. + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; - bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split; - no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true; - if (depth > MAX_DEPTH) allow_qt = false; - // ToDo: update this when btt is actually used - bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH - - uint8_t implicit_split_mode = UVG_NO_SPLIT; - //bool implicit_split = border; - bool bottom_left_available = ((abs_y + cu_width - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T))); - bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T))); + bool can_split[6]; + const bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); - if (!bottom_left_available && !top_right_available && allow_qt) { - implicit_split_mode = UVG_QUAD_SPLIT; - } - else if (!bottom_left_available && allow_btt) { - implicit_split_mode = UVG_HORZ_SPLIT; - } - else if (!top_right_available && allow_btt) { - implicit_split_mode = UVG_VERT_SPLIT; - } - else if (!bottom_left_available || !top_right_available) { - implicit_split_mode = UVG_QUAD_SPLIT; - } - - // Check split conditions - if (implicit_split_mode != UVG_NO_SPLIT) { - no_split = th_split = tv_split = false; - bh_split = (implicit_split_mode == UVG_HORZ_SPLIT); - bv_split = (implicit_split_mode == UVG_VERT_SPLIT); - } - if (!allow_btt) { - bh_split = bv_split = th_split = tv_split = false; - } + bool allow_split = can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; - bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split; + enum split_type split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7; - split_flag |= implicit_split_mode != UVG_NO_SPLIT; + assert(can_split[split_flag] && "Trying to write an illegal split"); + + // split_flag = is_implicit ? (can_split[QT_SPLIT] ? 
BT_HOR_SPLIT : BT_VER_SPLIT)) : split_flag; + *is_implicit_out = is_implicit; int split_model = 0; - if (no_split && allow_split) { + if (can_split[NO_SPLIT] && allow_split) { // Get left and top block split_flags and if they are present and true, increase model number - // ToDo: should use height and width to increase model, PU_GET_W() ? - if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) { + if (left_cu && (1 << left_cu->log2_height) < cu_height) { split_model++; } - if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { + if (above_cu && (1 << above_cu->log2_width) < cu_width) { split_model++; } uint32_t split_num = 0; - if (allow_qt) split_num += 2; - if (bh_split) split_num++; - if (bv_split) split_num++; - if (th_split) split_num++; - if (tv_split) split_num++; + if (can_split[QT_SPLIT]) split_num += 2; + if (can_split[BT_HOR_SPLIT]) split_num++; + if (can_split[BT_VER_SPLIT]) split_num++; + if (can_split[TT_HOR_SPLIT]) split_num++; + if (can_split[TT_VER_SPLIT]) split_num++; if (split_num > 0) split_num--; split_model += 3 * (split_num >> 1); cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - if(cabac->only_count && !split_flag) { - //printf("%hu %hu %d %d %d\n", state->search_cabac.ctx.split_flag_model[split_model].state[0], state->search_cabac.ctx.split_flag_model[split_model].state[1], - // split_model, x, y); - } - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag, bits, "split_flag"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != NO_SPLIT, bits, "split_cu_flag"); } - bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT; - if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { - split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag"); + if ((!is_implicit || (can_split[QT_SPLIT] && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT]))) + && (can_split[BT_HOR_SPLIT] || can_split[BT_VER_SPLIT] || can_split[TT_HOR_SPLIT] || can_split[TT_VER_SPLIT]) + && split_flag != NO_SPLIT) { + bool qt_split = split_flag == QT_SPLIT; + if((can_split[BT_VER_SPLIT] || can_split[BT_HOR_SPLIT] || can_split[TT_VER_SPLIT] || can_split[TT_HOR_SPLIT]) && can_split[QT_SPLIT]) { + unsigned left_qt_depth = 0; + unsigned top_qt_depth = 0; + if(left_cu) { + while (((left_cu->split_tree >> (left_qt_depth * 3)) & 7u) == QT_SPLIT) { + left_qt_depth++; + } + } + if(above_cu) { + while (((above_cu->split_tree >> (top_qt_depth * 3)) & 7u) == QT_SPLIT) { + top_qt_depth++; + } + } + split_model = (left_cu && (left_qt_depth > split_tree.current_depth)) + (above_cu && (top_qt_depth > split_tree.current_depth)) + (split_tree.current_depth < 2 ? 
0 : 3); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "qt_split_flag"); + } + if (!qt_split) { + const bool is_vertical = split_flag == BT_VER_SPLIT || split_flag == TT_VER_SPLIT; + if((can_split[BT_HOR_SPLIT] || can_split[TT_HOR_SPLIT]) && (can_split[BT_VER_SPLIT] || can_split[TT_VER_SPLIT])) { + split_model = 0; + if(can_split[BT_VER_SPLIT] + can_split[TT_VER_SPLIT] > can_split[BT_HOR_SPLIT] + can_split[TT_HOR_SPLIT]) { + split_model = 4; + } else if(can_split[BT_VER_SPLIT] + can_split[TT_VER_SPLIT] < can_split[BT_HOR_SPLIT] + can_split[TT_HOR_SPLIT]) { + split_model = 3; + } else { + const int d_a = cu_width / (above_cu ? (1 << above_cu->log2_width) : 1); + const int d_l = cu_height / (left_cu ? (1 << left_cu->log2_height) : 1); + if(d_a != d_l && above_cu && left_cu) { + split_model = d_a < d_l ? 1 : 2; + } + } + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag"); + } + if ((can_split[BT_VER_SPLIT] && can_split[TT_VER_SPLIT] && is_vertical) || (can_split[BT_HOR_SPLIT] && can_split[TT_HOR_SPLIT] && !is_vertical)) { + split_model = (2 * is_vertical) + (split_tree.mtt_depth <= 1); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), + split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag"); + } + } } - // Only signal split when it is not implicit, currently only Qt split supported - if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) { - - split_model = 0; - - // Get left and top block split_flags and if they are present and true, increase model number - if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) { - split_model++; - } - - if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) { - split_model++; - } - - split_model += (depth > 2 ? 0 : 3); - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), split_flag, bits, "split_cu_mode"); - } if (bits_out) *bits_out += bits; return split_flag; } void uvg_encode_coding_tree( encoder_state_t * const state, - uint16_t x, - uint16_t y, - uint8_t depth, lcu_coeff_t *coeff, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + split_tree_t split_tree, + bool has_chroma) { cabac_data_t * const cabac = &state->cabac; const encoder_control_t * const ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; const cu_array_t* used_array = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; - const cu_info_t *cur_cu = uvg_cu_array_at_const(used_array, x, y); + + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + + const int x = cu_loc->x; + const int y = cu_loc->y; - const int cu_width = tree_type != UVG_CHROMA_T ? 
LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int cu_height = cu_width; // TODO: height for non-square blocks - const int half_cu = cu_width >> 1; + const cu_info_t* cur_cu = uvg_cu_array_at_const(used_array, x, y); + + const int depth = split_tree.current_depth; const cu_info_t *left_cu = NULL; if (x > 0) { @@ -1412,53 +1369,60 @@ void uvg_encode_coding_tree( // Absolute coordinates - uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T)); - uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T)); + uint16_t abs_x = x + state->tile->offset_x; + uint16_t abs_y = y + state->tile->offset_y ; - int32_t frame_width = tree_type != UVG_CHROMA_T ? ctrl->in.width : ctrl->in.width / 2; - int32_t frame_height = tree_type != UVG_CHROMA_T ? ctrl->in.height : ctrl->in.height / 2; - // Check for slice border - bool border_x = frame_width < abs_x + cu_width; - bool border_y = frame_height < abs_y + cu_width; - bool border_split_x = frame_width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu; - bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; - bool border = border_x || border_y; /*!< are we in any border CU */ + int32_t frame_width = ctrl->in.width; + int32_t frame_height = ctrl->in.height; + + // Stop if we are outside of the frame + if (abs_x >= frame_width || abs_y >= frame_height) return; if (depth <= state->frame->max_qp_delta_depth) { state->must_code_qp_delta = true; } // When not in MAX_DEPTH, insert split flag and split the blocks if needed - if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) { - - const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, tree_type,NULL); + if (cu_width + cu_height > 8) { + split_tree.split_tree = cur_cu->split_tree; + bool is_implicit; + const int split_flag = uvg_write_split_flag( + state, + cabac, + left_cu, + above_cu, + tree_type != UVG_CHROMA_T ? 
cu_loc : chroma_loc, + split_tree, + tree_type, + &is_implicit, + NULL + ); - if (split_flag || border) { - // Split blocks and remember to change x and y block positions - uvg_encode_coding_tree(state, x, y, depth + 1, coeff, tree_type); + if (split_flag != NO_SPLIT) { + split_tree_t new_split_tree = { cur_cu->split_tree, + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_flag != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_flag != QT_SPLIT && is_implicit), + 0}; - if (!border_x || border_split_x) { - uvg_encode_coding_tree(state, x + half_cu, y, depth + 1, coeff, tree_type); - } - if (!border_y || border_split_y) { - uvg_encode_coding_tree(state, x, y + half_cu, depth + 1, coeff, tree_type); - } - if (!border || (border_split_x && border_split_y)) { - uvg_encode_coding_tree(state, x + half_cu, y + half_cu, depth + 1, coeff, tree_type); + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc, &separate_chroma); + separate_chroma |= !has_chroma; + for (int split = 0; split < splits; ++split) { + uvg_encode_coding_tree(state, coeff, tree_type, + &new_cu_loc[split], + separate_chroma ? chroma_loc : &new_cu_loc[split], + new_split_tree, !separate_chroma || (split == splits - 1 && has_chroma)); + } + return; } } + DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - //ToDo: check if we can actually split //ToDo: Implement MT split - if (depth < MAX_PU_DEPTH) - { - // cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((uvg_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - // CABAC_BIN(cabac, 0, "split_transform_flag"); - } - - DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_width, cur_cu->type-1); + // fprintf(stderr, "%4d %4d %2d %2d %d %d %d\n", x, y, cu_width, cu_height, has_chroma, tree_type, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1492,8 +1456,8 @@ void uvg_encode_coding_tree( cabac->cur_ctx = &(cabac->ctx.ibc_flag[ctx_ibc]); CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } - DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); - uvg_hmvp_add_mv(state, x, y, (uint32_t)cu_width, (uint32_t)cu_width, cur_cu); + DBG_PRINT_MV(state, x, y, (uint32_t)cu_width, (uint32_t)cu_height, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); int16_t num_cand = state->encoder_control->cfg.max_merge; if (num_cand > 1) { for (int ui = 0; ui < num_cand - 1; ui++) { @@ -1510,8 +1474,8 @@ } } #ifdef UVG_DEBUG_PRINT_YUVIEW_CSV - if (cur_cu->inter.mv_dir & 1) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L0, abs_x, abs_y, cu_width, cu_width, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]); - if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L1, abs_x, abs_y, cu_width, cu_width, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); + if (cur_cu->inter.mv_dir & 1) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L0, abs_x, abs_y, cu_width, cu_height, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]); + if (cur_cu->inter.mv_dir & 2) DBG_YUVIEW_MV(state->frame->poc, DBG_YUVIEW_MVSKIP_L1, abs_x, abs_y, cu_width, cu_height, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]); #endif goto end; @@ -1528,7 +1492,7 @@ CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4 && cu_height != 4) { int8_t ctx_predmode = 0; @@ -1548,11 +1512,7 @@ CABAC_BIN(cabac, (cur_cu->type == CU_IBC), "IBCFlag"); } } - - // part_mode - //encode_part_mode(state, cabac, cur_cu, 
depth); - - + #if ENABLE_PCM // Code IPCM block @@ -1571,8 +1531,8 @@ void uvg_encode_coding_tree( uvg_pixel *rec_base_v = &frame->rec->v[x / 2 + y / 2 * ctrl->in.width / 2]; // Luma - for (unsigned y_px = 0; y_px < LCU_WIDTH >> depth; y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> depth; x_px++) { + for (unsigned y_px = 0; y_px < cu_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_width; x_px++) { uvg_bitstream_put(cabac->stream, base_y[x_px + y_px * ctrl->in.width], 8); rec_base_y[x_px + y_px * ctrl->in.width] = base_y[x_px + y_px * ctrl->in.width]; } @@ -1580,14 +1540,14 @@ void uvg_encode_coding_tree( // Chroma if (ctrl->chroma_format != UVG_CSP_400) { - for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) { + for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) { uvg_bitstream_put(cabac->stream, base_u[x_px + y_px * (ctrl->in.width >> 1)], 8); rec_base_u[x_px + y_px * (ctrl->in.width >> 1)] = base_u[x_px + y_px * (ctrl->in.width >> 1)]; } } - for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) { - for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) { + for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) { + for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) { uvg_bitstream_put(cabac->stream, base_v[x_px + y_px * (ctrl->in.width >> 1)], 8); rec_base_v[x_px + y_px * (ctrl->in.width >> 1)] = base_v[x_px + y_px * (ctrl->in.width >> 1)]; } @@ -1599,21 +1559,15 @@ void uvg_encode_coding_tree( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { uint8_t imv_mode = UVG_IMV_OFF; - - const int num_pu = uvg_part_mode_num_parts[cur_cu->part_size]; bool non_zero_mvd = false; + + // TODO: height for non-square blocks + const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, cu_loc->x, cu_loc->y); - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, pu_x, pu_y); - - non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL); - DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); - uvg_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu); - } + non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, NULL, NULL, cu_loc); + DBG_PRINT_MV(state, x, y, cu_width, cu_height, cur_pu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_pu); + // imv mode, select between fullpel, half-pel and quarter-pel resolutions // 0 = off, 1 = fullpel, 2 = 4-pel, 3 = half-pel @@ -1631,54 +1585,80 @@ void uvg_encode_coding_tree( } { - int cbf = cbf_is_set_any(cur_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + const bool has_coeffs = cur_pu->root_cbf || cur_pu->cbf; + if (!cur_cu->merged) { cabac->cur_ctx = &(cabac->ctx.cu_qt_root_cbf_model); - CABAC_BIN(cabac, cbf, "rqt_root_cbf"); + CABAC_BIN(cabac, has_coeffs, "rqt_root_cbf"); } // Code (possible) coeffs to bitstream - - if (cbf) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type); + if (has_coeffs) { + int luma_cbf_ctx = 0; 
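As a reading aid for the cbf_luma signalling in encode_transform_coeff below: `luma_cbf_ctx` selects the qt_cbf_model_luma context. It is 0 for an ordinary TU and, when a CU is coded with ISP, starts at 2 and is thereafter derived from the previous sub-partition's flag (see `*luma_cbf_ctx = 2 + cb_flag_y` earlier in this patch). A standalone sketch of that schedule, with the helper name being illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* Context index for cbf_luma: 0 for a normal TU; for ISP sub-partitions the
 * first one uses 2 and each later one uses 2 + (previous partition's cbf). */
static int luma_cbf_context(bool is_isp, bool first_part, bool prev_cbf)
{
  if (!is_isp) return 0;
  return first_part ? 2 : 2 + (prev_cbf ? 1 : 0);
}

int main(void)
{
  printf("normal TU: %d\n", luma_cbf_context(false, true, false));   /* 0 */
  printf("ISP part 0: %d\n", luma_cbf_context(true, true, false));   /* 2 */
  printf("ISP after cbf=1: %d\n", luma_cbf_context(true, false, true)); /* 3 */
  return 0;
}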
+ encode_transform_coeff(state, cu_loc, 0, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, cu_loc, cu_loc); } - encode_mts_idx(state, cabac, cur_cu); + encode_mts_idx(state, cabac, cur_cu, cu_loc); } } else if (cur_cu->type == CU_INTRA) { if(tree_type != UVG_CHROMA_T) { - uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, NULL, NULL); } + + const bool is_local_dual_tree = (chroma_loc->width != cu_loc->width || chroma_loc->height != cu_loc->height); // Code chroma prediction mode. - if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4 && tree_type == UVG_BOTH_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); + if (state->encoder_control->chroma_format != UVG_CSP_400 + && (chroma_loc->width == cu_loc->width && chroma_loc->height == cu_loc->height) + && tree_type == UVG_BOTH_T) { + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, !cur_cu->intra.mip_flag ? cur_cu->intra.mode : 0, NULL); + } + int luma_cbf_ctx = 0; + + if (tree_type != UVG_CHROMA_T) { + // Cycle through sub partitions if ISP is enabled. + // The ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. + int split_type = cur_cu->intra.isp_mode; + int split_limit = split_type == ISP_MODE_NO_ISP ? 1 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); + luma_cbf_ctx = split_limit != 1 ? 2 : 0; + // If the first three ISP splits all have luma cbf 0, the last one must be 1. Since the value can be derived, there is no need to write it. + bool can_skip_last_cbf = true; + for (int i = 0; i < split_limit; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, x, y, cu_width, cu_height, i, split_type, true); + + // Check if this is the last split, to write chroma + bool last_split = (i + 1) == split_limit; + encode_transform_coeff(state, &split_loc, + 0, coeff, NULL, tree_type, last_split, can_skip_last_cbf, &luma_cbf_ctx, + cu_loc, is_local_dual_tree ? NULL : chroma_loc); + can_skip_last_cbf &= luma_cbf_ctx == 2; + } } if (tree_type != UVG_CHROMA_T) { - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff, tree_type); - } + encode_lfnst_idx(state, cabac, cur_cu, is_local_dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 ? UVG_LUMA_T : tree_type, COLOR_Y, cu_loc); - if (tree_type != UVG_CHROMA_T) { - bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, x, y, depth, cu_width, cu_height, tree_type, COLOR_Y); + encode_mts_idx(state, cabac, cur_cu, cu_loc); } - encode_mts_idx(state, cabac, cur_cu); // For 4x4 the chroma PU/TU is coded after the last - if (state->encoder_control->chroma_format != UVG_CSP_400 && - ((depth == 4 && x % 8 && y % 8) || tree_type == UVG_CHROMA_T) && + if (state->encoder_control->chroma_format != UVG_CSP_400 && + ((is_local_dual_tree && + has_chroma) || tree_type == UVG_CHROMA_T) && tree_type != UVG_LUMA_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, NULL); + int8_t luma_dir = uvg_get_co_located_luma_mode(tree_type != UVG_CHROMA_T ? chroma_loc : cu_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir, NULL); // LFNST constraints must be reset here. 
Otherwise the left over values will interfere when calculating new constraints - cu_info_t* tmp = uvg_cu_array_at((cu_array_t*)used_array, x, y); + cu_info_t* tmp = uvg_cu_array_at((cu_array_t *)used_array, chroma_loc->x, chroma_loc->y); tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff, tree_type); + encode_transform_coeff(state, chroma_loc, 1, coeff, NULL, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); // Write LFNST only once for single tree structure - encode_lfnst_idx(state, cabac, tmp, x, y, depth, cu_width, cu_height, tree_type, COLOR_UV); + encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, chroma_loc); } } @@ -1688,13 +1668,13 @@ void uvg_encode_coding_tree( exit(1); } if (state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %d %d", x << (tree_type == UVG_CHROMA_T), y << (tree_type == UVG_CHROMA_T), depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "E %4d %4d %9d %d", x, y, split_tree.split_tree, tree_type); fwrite(&cabac->ctx, 1, sizeof(cabac->ctx), state->encoder_control->cabac_debug_file); } end: - if (is_last_cu_in_qg(state, x, y, depth)) { + if (is_last_cu_in_qg(state, cu_loc)) { state->last_qp = cur_cu->qp; } @@ -1703,27 +1683,31 @@ end: double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, cu_info_t* cur_cu, - enum uvg_tree_type tree_type) { + enum uvg_tree_type tree_type, + const split_tree_t split_tree) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; - int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); - int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); + const int x = cu_loc->x; + const int y = cu_loc->y; - const int cu_width = LCU_WIDTH >> depth; - + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + + int x_local = cu_loc->local_x; + int y_local = cu_loc->local_y; + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; + const cu_info_t* left_cu = NULL, *above_cu = NULL; if (x) { if(x_local || tree_type != UVG_CHROMA_T) { left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); } else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x - 1, y); } } if (y) { @@ -1731,7 +1715,7 @@ double uvg_mock_encode_coding_unit( above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1); } else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x, y - 1); } } @@ -1740,23 +1724,23 @@ double uvg_mock_encode_coding_unit( } // When not in MAX_DEPTH, insert split flag and split the blocks if needed - if (tree_type != UVG_CHROMA_T ? 
depth != MAX_DEPTH : depth != MAX_DEPTH - 1) { + if (cur_cu->log2_height + cur_cu->log2_width > 4) { + // We do not care whether the split is implicit or not, since there is never a split here + bool is_implicit; uvg_write_split_flag( state, cabac, left_cu, above_cu, - 0, - depth, - cu_width >> (tree_type == UVG_CHROMA_T), - x >> (tree_type == UVG_CHROMA_T), - y >> (tree_type == UVG_CHROMA_T), - tree_type, - &bits); + cu_loc, + split_tree, + tree_type, &is_implicit, + &bits + ); } // Encode skip flag - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) { int8_t ctx_skip = 0; if (left_cu && left_cu->skipped) { @@ -1789,7 +1773,7 @@ } } // Prediction mode - if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) { + if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) { int8_t ctx_predmode = 0; @@ -1802,7 +1786,7 @@ if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { const uint8_t imv_mode = UVG_IMV_OFF; - const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); + const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, lcu, &bits, cu_loc); if (ctrl->cfg.amvr && non_zero_mvd) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); if (imv_mode > UVG_IMV_OFF) { @@ -1815,10 +1799,13 @@ } else if (cur_cu->type == CU_INTRA) { if(tree_type != UVG_CHROMA_T) { - uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits); + uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, cu_loc, lcu, &bits); } - if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm, &bits); + if((chroma_loc || tree_type == UVG_CHROMA_T) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + int8_t luma_dir = uvg_get_co_located_luma_mode(chroma_loc, cu_loc, cur_cu, tree_type != UVG_CHROMA_T ? lcu : NULL, + tree_type == UVG_CHROMA_T ? state->tile->frame->cu_array : NULL, + is_separate_tree ? UVG_CHROMA_T : tree_type); + encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, chroma_loc, cur_cu, tree_type), luma_dir, &bits); } } else { @@ -1872,3 +1859,27 @@ if(bits_out) *bits_out = temp_bits_out; } + + +/** + * \brief Get a subset of LCU coeff array. + * + * \param dst Destination array. Should be coeff_t [32*32]. + * \param src Coeff LCU array. + * \param lcu_x Local LCU x coordinate. + * \param lcu_y Local LCU y coordinate. + * \param block_w Block width. + * \param block_h Block height. + * \param lcu_width LCU_WIDTH for luma, LCU_WIDTH_C for chroma. 
+ * + */ +void uvg_get_sub_coeff(const coeff_t *dst, const coeff_t * const src, const int lcu_x, const int lcu_y, const int block_w, const int block_h, const int lcu_width) +{ + // Take subset of coeff array + coeff_t* dst_ptr = (coeff_t*)dst; + const coeff_t* coeff_ptr = &src[lcu_x + lcu_y * lcu_width]; + for (int j = 0; j < block_h; ++j) { + //memcpy(dst_coeff + (j * lcu_width), &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + memcpy(&dst_ptr[j * block_w], &coeff_ptr[j * lcu_width], block_w * sizeof(coeff_t)); + } +} diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index c2cd39da..3df702ef 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -40,30 +40,29 @@ #include "encoderstate.h" #include "global.h" -bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu); +bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t* + const cu_loc); bool uvg_is_lfnst_allowed( const encoder_state_t* const state, const cu_info_t* const pred_cu, - const int width, - const int height, - const int x, - const int y, enum uvg_tree_type tree_type, const color_t color, - const lcu_t* lcu); + const cu_loc_t* const cu_loc, const lcu_t* const lcu); void uvg_encode_coding_tree( encoder_state_t * const state, - uint16_t x_ctb, - uint16_t y_ctb, - uint8_t depth, lcu_coeff_t *coeff, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + split_tree_t split_tree, + bool has_chroma); void uvg_encode_ts_residual(encoder_state_t* const state, cabac_data_t* const cabac, const coeff_t* coeff, uint32_t width, + uint32_t height, uint8_t type, int8_t scan_mode, double* bits); @@ -77,41 +76,47 @@ void uvg_encode_mvd(encoder_state_t * const state, double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, lcu_t* lcu, cu_info_t* cur_cu, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const split_tree_t split_tree); -int uvg_encode_inter_prediction_unit(encoder_state_t* const state, - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - int x, int y, int width, int height, - int depth, - lcu_t* lcu, - double* bits_out); - -void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state, +int uvg_encode_inter_prediction_unit( + encoder_state_t* const state, cabac_data_t* const cabac, const cu_info_t* const cur_cu, - int x, int y, int depth, const lcu_t* lcu, double* bits_out); + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc); + +void uvg_encode_intra_luma_coding_unit( + const encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const cur_cu, + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + double* bits_out); -bool uvg_write_split_flag( +uint8_t uvg_write_split_flag( const encoder_state_t* const state, cabac_data_t* cabac, const cu_info_t* left_cu, const cu_info_t* above_cu, - uint8_t split_flag, - int depth, - int cu_width, - int x, - int y, + const cu_loc_t* const cu_loc, + split_tree_t, enum uvg_tree_type tree_type, + bool* is_implicit_out, double* bits_out); void uvg_encode_last_significant_xy(cabac_data_t * const cabac, uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, uint8_t type, uint8_t scan, double* bits_out); + +void uvg_get_sub_coeff(const coeff_t* dst, const coeff_t* const src, + const int lcu_x, const 
int lcu_y, + const int block_w, const int block_h, + const int lcu_width); diff --git a/src/encoder.c b/src/encoder.c index d0121037..56d03305 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -320,6 +320,13 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) encoder->scaling_list.use_default_list = 1; } + if(cfg->dep_quant) { + if(!uvg_init_nb_info(encoder)) { + fprintf(stderr, "Could not initialize nb info.\n"); + goto init_failed; + } + } + // ROI / delta QP if (cfg->roi.file_path) { const char *mode[2] = { "r", "rb" }; @@ -378,11 +385,7 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg) { goto init_failed; } - - // NOTE: When tr_depth_inter is equal to 0, the transform is still split - // for SMP and AMP partition units. - encoder->tr_depth_inter = 0; - + //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || encoder->cfg.tiles_height_count > 1; diff --git a/src/encoder.h b/src/encoder.h index 0fb46e1b..05750292 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -38,6 +38,7 @@ * Initialization of encoder_control_t. */ +#include "dep_quant.h" #include "global.h" // IWYU pragma: keep #include "uvg266.h" #include "scalinglist.h" @@ -98,6 +99,10 @@ typedef struct encoder_control_t //scaling list scaling_list_t scaling_list; + NbInfoSbb* m_scanId2NbInfoSbbArray[7 + 1][7 + 1]; + NbInfoOut* m_scanId2NbInfoOutArray[7 + 1][7 + 1]; + struct dep_quant_scan_info* scan_info[7 + 1][7 + 1]; + //spec: references to variables defined in Rec. ITU-T H.265 (04/2013) int8_t tiles_enable; /*!<spec: tiles_enabled_flag */ diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma"); + if (encoder->cfg.max_btt_depth[0]) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma"); + } + if (encoder->chroma_format != UVG_CSP_400) { WRITE_U(stream, encoder->cfg.dual_tree, 1, "qtbtt_dual_tree_intra_flag"); } if (encoder->cfg.dual_tree) { - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); - if (0 /*sps_max_mtt_hierarchy_depth_intra_slice_chroma != 0*/) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma"); + if (encoder->cfg.max_btt_depth[2]) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma"); } } - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_inter_slice"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_inter_slice"); - - -#if 0 // mtt depth intra - if (max_mtt_depth_intra != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_luma"); - 
WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_luma"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice"); + WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice"); + if (encoder->cfg.max_btt_depth[1] != 0) { + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group"); + WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group"); } -#endif -#if 0 // mtt depth inter - if (max_mtt_depth_inter != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_inter_tile_group"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_inter_tile_group"); - } -#endif -#if 0 // Dual Tree - if (encoder->cfg.dual_i_tree) { - WRITE_UE(stream, 0, "sps_log2_diff_min_qt_min_cb_intra_tile_group_chroma"); - WRITE_UE(stream, 0, "sps_max_mtt_hierarchy_depth_intra_tile_group_chroma"); - - if (max_mtt_depth_intra != 0) { - WRITE_UE(stream, 0, "sps_log2_diff_max_bt_min_qt_intra_tile_group_chroma"); - WRITE_UE(stream, 0, "sps_log2_diff_max_tt_min_qt_intra_tile_group_chroma"); - } - } -#endif if (LCU_WIDTH > 32) WRITE_U(stream, (TR_MAX_LOG2_SIZE - 5) ? 1 : 0, 1, "sps_max_luma_transform_size_64_flag"); @@ -665,7 +648,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_UE(stream, encoder->cfg.log2_parallel_merge_level-2, "log2_parallel_merge_level_minus2"); - WRITE_U(stream, 0, 1, "sps_isp_enabled_flag"); + WRITE_U(stream, encoder->cfg.isp, 1, "sps_isp_enabled_flag"); if (state->encoder_control->cfg.mrl) { WRITE_U(stream, 1, 1, "sps_mrl_enabled_flag"); @@ -706,7 +689,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, WRITE_U(stream, 0, 1, "scaling_list_enabled_flag"); - WRITE_U(stream, 0, 1, "pic_dep_quant_enabled_flag"); + WRITE_U(stream, encoder->cfg.dep_quant, 1, "pic_dep_quant_enabled_flag"); WRITE_U(stream, encoder->cfg.signhide_enable, 1, "pic_sign_data_hiding_enabled_flag"); @@ -1142,7 +1125,7 @@ static void uvg_encoder_state_write_bitstream_picture_header( WRITE_U(stream, 0, 1, "ph_mvd_l1_zero_flag"); } - if (encoder->cfg.jccr) { + if (encoder->cfg.jccr && encoder->chroma_format != UVG_CSP_400) { WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag"); } // END PICTURE HEADER @@ -1375,11 +1358,14 @@ void uvg_encoder_state_write_bitstream_slice_header( } // ToDo: depquant + if (encoder->cfg.dep_quant) { + WRITE_U(stream, 1, 1, "sh_dep_quant_used_flag"); + } - if (state->encoder_control->cfg.signhide_enable) { + if (state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) { WRITE_U(stream, 1, 1, "sh_sign_data_hiding_used_flag"); } - if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable /* && !cfg.dep_quant*/) + if (state->encoder_control->cfg.trskip_enable && !state->encoder_control->cfg.signhide_enable && !encoder->cfg.dep_quant) { // TODO: find out what this is actually about and parametrize it WRITE_U(stream, 0, 1, "sh_ts_residual_coding_disabled_flag"); diff --git a/src/encoderstate.c b/src/encoderstate.c index cdadccf4..78c9c9f2 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -627,43 +627,52 @@ static void encode_sao(encoder_state_t * const state, * \param prev_qp -1 if QP delta has not been coded in current QG, * 
otherwise the QP of the current QG */ -static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp) +static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const + int depth) { // Stop recursion if the CU is completely outside the frame. - if (x >= state->tile->frame->width || y >= state->tile->frame->height) return; + if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return; - cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y); - const int cu_width = LCU_WIDTH >> depth; + cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y); + const int width = 1 << cu->log2_width; if (depth <= state->frame->max_qp_delta_depth) { *prev_qp = -1; } - if (cu->depth > depth) { + if (cu_loc->width > width) { // Recursively process sub-CUs. - const int d = cu_width >> 1; - set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp); - set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp); + const int half_width = cu_loc->width >> 1; + const int half_height = cu_loc->height >> 1; + cu_loc_t split_cu_loc; + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); + uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height); + set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1); } else { bool cbf_found = *prev_qp >= 0; - if (cu->tr_depth > depth) { + int y_limit = cu_loc->y + cu_loc->height; + int x_limit = cu_loc->x + cu_loc->width; + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { // The CU is split into smaller transform units. Check whether coded // block flag is set for any of the TUs. - const int tu_width = LCU_WIDTH >> cu->tr_depth; - for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) { - for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) { + const int tu_width = MIN(TR_MAX_WIDTH, 1 << cu->log2_width); + for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) { + for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) { cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); - if (cbf_is_set_any(tu->cbf, cu->depth)) { + if (cbf_is_set_any(tu->cbf)) { cbf_found = true; } } } - } else if (cbf_is_set_any(cu->cbf, cu->depth)) { + } else if (cbf_is_set_any(cu->cbf)) { cbf_found = true; } @@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las if (cbf_found) { *prev_qp = qp = cu->qp; } else { - qp = uvg_get_cu_ref_qp(state, x, y, *last_qp); + qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp); } // Set the correct QP for all state->tile->frame->cu_array elements in // the area covered by the CU. 
- for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) { - for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) { + for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) { + for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) { uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp; } } - if (is_last_cu_in_qg(state, x, y, depth)) { + if (is_last_cu_in_qg(state, cu_loc)) { *last_qp = cu->qp; } } @@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) if (state->frame->max_qp_delta_depth >= 0) { int last_qp = state->last_qp; int prev_qp = -1; - set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH); + set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) { @@ -870,10 +881,16 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) enum uvg_tree_type tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T; //Encode coding tree - uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff, tree_type); + cu_loc_t start; + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0, 0, 0, 0 }; + + uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, &start, split_tree, true); if(tree_type == UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_encode_coding_tree(state, lcu->position.x * LCU_WIDTH_C, lcu->position.y * LCU_WIDTH_C, 0, lcu->coeff, UVG_CHROMA_T); + uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + cu_loc_t chroma_tree_loc = start; + uvg_encode_coding_tree(state, lcu->coeff, UVG_CHROMA_T, &start, &chroma_tree_loc, split_tree, true); } if (!state->cabac.only_count) { @@ -1152,6 +1169,12 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) uvg_threadqueue_submit(state->encoder_control->threadqueue, job[0]); uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]); +#ifdef UVG_DEBUG_PRINT_CABAC + // Ensures that the ctus are encoded in raster scan order + if(i >= state->tile->frame->width_in_lcu) { + uvg_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[(lcu->id / state->tile->frame->width_in_lcu - 1) * state->tile->frame->width_in_lcu]); + } +#endif } uvg_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); @@ -1281,13 +1304,13 @@ static void encoder_state_encode(encoder_state_t * const main_state) { sub_state->tile->frame->width_in_lcu * LCU_WIDTH, sub_state->tile->frame->height_in_lcu * LCU_WIDTH ); - if(main_state->encoder_control->cfg.dual_tree){ + if(main_state->encoder_control->cfg.dual_tree && main_state->frame->is_irap){ sub_state->tile->frame->chroma_cu_array = uvg_cu_subarray( main_state->tile->frame->chroma_cu_array, - offset_x / 2, - offset_y / 2, - sub_state->tile->frame->width_in_lcu * LCU_WIDTH_C, - sub_state->tile->frame->height_in_lcu * LCU_WIDTH_C + offset_x, + offset_y, + sub_state->tile->frame->width_in_lcu * LCU_WIDTH, + sub_state->tile->frame->height_in_lcu * LCU_WIDTH ); } } @@ -1926,10 +1949,9 @@ static void encoder_state_init_new_frame(encoder_state_t * const 
state, uvg_pict if (cfg->dual_tree && state->encoder_control->chroma_format != UVG_CSP_400 && state->frame->is_irap) { assert(state->tile->frame->chroma_cu_array == NULL); - state->tile->frame->chroma_cu_array = uvg_cu_array_chroma_alloc( - state->tile->frame->width / 2, - state->tile->frame->height / 2, - state->encoder_control->chroma_format + state->tile->frame->chroma_cu_array = uvg_cu_array_alloc( + state->tile->frame->width, + state->tile->frame->height ); } // Set pictype. @@ -2029,9 +2051,9 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame) { #if UVG_DEBUG_PRINT_CABAC == 1 - uvg_cabac_bins_count = 0; + // uvg_cabac_bins_count = 0; if (state->frame->num == 0) uvg_cabac_bins_verbose = true; - else uvg_cabac_bins_verbose = false; + // else uvg_cabac_bins_verbose = false; #endif @@ -2193,11 +2215,12 @@ int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) { const cu_array_t *cua = state->tile->frame->cu_array; // Quantization group width - const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth); + const int qg_width = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_width); + const int qg_height = 1 << MAX(6 - state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->log2_height); // Coordinates of the top-left corner of the quantization group const int x_qg = x & ~(qg_width - 1); - const int y_qg = y & ~(qg_width - 1); + const int y_qg = y & ~(qg_height - 1); if(x_qg == 0 && y_qg > 0 && y_qg % LCU_WIDTH == 0) { return uvg_cu_array_at_const(cua, x_qg, y_qg - 1)->qp; } diff --git a/src/encoderstate.h b/src/encoderstate.h index 55d265e3..88409703 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -332,6 +332,7 @@ typedef struct encoder_state_t { int8_t qp; double c_lambda; + double chroma_weights[4]; /** * \brief Whether a QP delta value must be coded for the current LCU. @@ -359,7 +360,15 @@ typedef struct encoder_state_t { //Constraint structure void * constraint; + // Since LFNST needs the collocated luma intra mode for dual tree + // when the chroma mode is a CCLM mode, and gathering all the information + // needed to derive that collocated luma mode inside the LFNST functions + // would be unwieldy, store the current collocated luma mode in the state + // instead.
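The uvg_get_cu_ref_qp hunk above derives the quantization-group size per dimension from the CU's log2 dimensions rather than a shared depth, then masks the coordinates down to the QG origin. A self-contained sketch of that alignment arithmetic with hypothetical values (LCU_WIDTH is assumed to be 64, matching the 6 in the MAX expression):

#include <assert.h>

/* Round a coordinate down to the origin of its quantization group.
 * qg_size must be a power of two. */
static int qg_origin(int coord, int qg_size)
{
  return coord & ~(qg_size - 1);
}

static void qg_example(void)
{
  /* With max_qp_delta_depth == 1 and a 16x16 CU (log2 == 4):
   * qg_size = 1 << MAX(6 - 1, 4) = 32, so the sample at (48, 20)
   * belongs to the QG whose top-left corner is (32, 0). */
  assert(qg_origin(48, 32) == 32);
  assert(qg_origin(20, 32) == 0);
}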
+ int8_t collocated_luma_mode; + quant_block quant_blocks[3]; // luma, ISP, chroma + rate_estimator_t rate_estimator[4]; // luma, cb, cr, isp } encoder_state_t; void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame); @@ -401,14 +410,13 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state) * \param depth depth in the CU tree * \return true, if it's the last CU in its QG, otherwise false */ -static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) +static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc) { if (state->frame->max_qp_delta_depth < 0) return false; - - const int cu_width = LCU_WIDTH >> depth; + const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth; - const int right = x + cu_width; - const int bottom = y + cu_width; + const int right = cu_loc->x + cu_loc->width; + const int bottom = cu_loc->y + cu_loc->height; return (right % qg_width == 0 || right >= state->tile->frame->width) && (bottom % qg_width == 0 || bottom >= state->tile->frame->height); } diff --git a/src/filter.c b/src/filter.c index 2d51a17c..a55dc619 100644 --- a/src/filter.c +++ b/src/filter.c @@ -36,6 +36,7 @@ #include "cu.h" #include "encoder.h" +#include "intra.h" #include "uvg266.h" #include "transform.h" #include "videoframe.h" @@ -269,19 +270,19 @@ static bool is_tu_boundary( int32_t x, int32_t y, edge_dir dir, + color_t color, enum uvg_tree_type tree_type) { - x >>= tree_type == UVG_CHROMA_T; - y >>= tree_type == UVG_CHROMA_T; // if (x & 3 || y & 3) return false; const cu_info_t *const scu = uvg_cu_array_at_const(tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, x, y); - const int tu_width = LCU_WIDTH >> (scu->tr_depth + (tree_type == UVG_CHROMA_T)); if (dir == EDGE_HOR) { - return (y & (tu_width - 1)) == 0; + return color == COLOR_Y ? scu->luma_deblocking & EDGE_HOR : + scu->chroma_deblocking & EDGE_HOR; } else { - return (x & (tu_width - 1)) == 0; + return color == COLOR_Y ? scu->luma_deblocking & EDGE_VER : + scu->chroma_deblocking & EDGE_VER; } } @@ -306,32 +307,6 @@ static bool is_pu_boundary(const encoder_state_t *const state, it for now, in case some other tool requires it. */ return false; - //const cu_info_t *const scu = - // uvg_cu_array_at_const(state->tile->frame->cu_array, x, y); - //// Get the containing CU. 
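Annotation for the is_tu_boundary rewrite above: TU boundaries are no longer recomputed from tr_depth; each SCU carries precomputed luma_deblocking and chroma_deblocking masks, and the filter.h hunk later in this diff turns EDGE_VER and EDGE_HOR into distinct bits (1 and 2) so they can be OR'd into those masks. A sketch of the idea under that assumption, with stand-in types:

/* Assumed to mirror the new filter.h values: one bit per direction. */
enum { SKETCH_EDGE_VER = 1, SKETCH_EDGE_HOR = 2 };

typedef struct {
  unsigned luma_deblocking;   /* OR of the edge bits set at encode time */
  unsigned chroma_deblocking;
} sketch_scu_t;

/* Mark a TU edge once, when the transform split is decided... */
static void mark_tu_edge(sketch_scu_t *scu, unsigned dir, int luma)
{
  if (luma) scu->luma_deblocking   |= dir;
  else      scu->chroma_deblocking |= dir;
}

/* ...so that testing a boundary later is a plain mask check. */
static int is_boundary(const sketch_scu_t *scu, unsigned dir, int luma)
{
  return (luma ? scu->luma_deblocking : scu->chroma_deblocking) & dir;
}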
- //const int32_t cu_width = LCU_WIDTH >> scu->depth; - //const int32_t x_cu = x & ~(cu_width - 1); - //const int32_t y_cu = y & ~(cu_width - 1); - //const cu_info_t *const cu = - // uvg_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu); - - //const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - //for (int i = 0; i < num_pu; i++) { - // if (dir == EDGE_HOR) { - // int y_pu = PU_GET_Y(cu->part_size, cu_width, y_cu, i); - // if (y_pu == y) { - // return true; - // } - - // } else { - // int x_pu = PU_GET_X(cu->part_size, cu_width, x_cu, i); - // if (x_pu == x) { - // return true; - // } - // } - //} - - //return false; } @@ -346,9 +321,9 @@ static bool is_pu_boundary(const encoder_state_t *const state, static bool is_on_8x8_grid(int x, int y, edge_dir dir) { if (dir == EDGE_HOR) { - return (y & 7) == 0 && (x & 2) == 0; + return (y & 7) == 0; } else { - return (x & 7) == 0 && (y & 2) == 0; + return (x & 7) == 0; } } @@ -628,10 +603,10 @@ static INLINE void get_max_filter_length(uint8_t *filt_len_P, uint8_t *filt_len_ bool transform_edge_4x4[2] = { false, false }; bool transform_edge_8x8[2] = { false, false }; - if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, tree_type); - if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, tree_type); - if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, tree_type); - if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, tree_type); + if (pos >= 4) transform_edge_4x4[0] = is_tu_boundary(state, x - x_mul * 4, y - y_mul * 4, dir, comp, tree_type); + if (pos >= 8) transform_edge_8x8[0] = is_tu_boundary(state, x - x_mul * 8, y - y_mul * 8, dir, comp, tree_type); + if (pos + 4 < len) transform_edge_4x4[1] = is_tu_boundary(state, x + x_mul * 4, y + y_mul * 4, dir, comp, tree_type); + if (pos + 8 < len) transform_edge_8x8[1] = is_tu_boundary(state, x + x_mul * 8, y + y_mul * 8, dir, comp, tree_type); if (comp == COLOR_Y) { if (tu_size_P_side <= 4 || tu_size_Q_side <= 4){ @@ -756,8 +731,8 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, cu_q = uvg_cu_array_at(frame->cu_array, x_coord, y); } - bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y); + bool nonzero_coeffs = cbf_is_set(cu_q->cbf, COLOR_Y) + || cbf_is_set(cu_p->cbf, COLOR_Y); // Filter strength strength = 0; @@ -766,7 +741,6 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, } else if (tu_boundary && nonzero_coeffs) { // Non-zero residual/coeffs and transform boundary - // Neither CU is intra so tr_depth <= MAX_DEPTH. strength = 1; } else if(cu_p->inter.mv_dir == 3 || cu_q->inter.mv_dir == 3 || state->frame->slicetype == UVG_SLICE_B) { // B-slice related checks. TODO: Need to account for cu_p being in another slice? @@ -854,18 +828,50 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, bool is_side_Q_large = false; uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int cu_size = LCU_WIDTH >> cu_q->depth; - const int pu_part_idx = (y + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + (x + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? 
PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + + const int cu_width = 1 << cu_q->log2_width; + const int cu_height = 1 << cu_q->log2_height; + const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; + const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; + int tu_size_q_side = 0; + if (cu_q->type == CU_INTRA && cu_q->intra.isp_mode != ISP_MODE_NO_ISP) { + if (cu_q->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { + tu_size_q_side = MAX(4, cu_width >> 2); + } else if (cu_q->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_q_side = MAX(4, cu_height >> 2); + } else { + tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + } + } else { + tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << cu_q->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_q->log2_width, TR_MAX_WIDTH); + } + + int tu_size_p_side = 0; + if (cu_p->type == CU_INTRA && cu_p->intra.isp_mode != ISP_MODE_NO_ISP) { + if (cu_p->intra.isp_mode == ISP_MODE_VER && dir == EDGE_VER) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_width) >> 2); + } else if (cu_p->intra.isp_mode == ISP_MODE_HOR && dir == EDGE_HOR) { + tu_size_p_side = MAX(4, (1 << cu_p->log2_height) >> 2); + } else { + tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + } + } else { + tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << cu_p->log2_height, TR_MAX_WIDTH) : + MIN(1 << cu_p->log2_width, TR_MAX_WIDTH); + + } + get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, - LCU_WIDTH >> cu_p->tr_depth, - LCU_WIDTH >> cu_q->tr_depth, + tu_size_p_side, + tu_size_q_side, pu_pos, pu_size, cu_q->merged, COLOR_Y, UVG_LUMA_T); @@ -1073,41 +1079,44 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, // CUs on both sides of the edge cu_info_t *cu_p; cu_info_t *cu_q; - int32_t x_coord = x << (tree_type != UVG_CHROMA_T); - int32_t y_coord = y << (tree_type != UVG_CHROMA_T); + int32_t x_coord = x << 1; + int32_t y_coord = y << 1; cu_array_t* cua = tree_type != UVG_CHROMA_T ? frame->cu_array : frame->chroma_cu_array; if (dir == EDGE_VER) { - y_coord = (y + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); + y_coord = (y + min_chroma_length * blk_idx) << (1); cu_p = uvg_cu_array_at(cua, x_coord - 1, y_coord); cu_q = uvg_cu_array_at(cua, x_coord , y_coord); } else { - x_coord = (x + min_chroma_length * blk_idx) << (tree_type != UVG_CHROMA_T); + x_coord = (x + min_chroma_length * blk_idx) << (1); cu_p = uvg_cu_array_at(cua, x_coord, y_coord - 1); cu_q = uvg_cu_array_at(cua, x_coord, y_coord ); } - - const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T)); - const int pu_part_idx = ((y << (tree_type != UVG_CHROMA_T)) + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + ((x << (tree_type != UVG_CHROMA_T)) + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? 
y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; - const int tu_p_size = LCU_WIDTH >> (cu_p->tr_depth + (chroma_shift)); - const int tu_q_size = LCU_WIDTH >> (cu_q->tr_depth + (chroma_shift)); + const int cu_width = 1 << (cu_q->log2_chroma_width ); + const int cu_height = 1 << (cu_q->log2_chroma_height); + const int pu_size = dir == EDGE_HOR ? cu_height : cu_width; + const int pu_pos = dir == EDGE_HOR ? y_coord : x_coord; + + + const int tu_size_p_side = dir == EDGE_HOR ? + MIN(1 << (cu_p->log2_chroma_height), TR_MAX_WIDTH) : + MIN(1 << (cu_p->log2_chroma_width), TR_MAX_WIDTH); + const int tu_size_q_side = dir == EDGE_HOR ? + MIN(1 << (cu_q->log2_chroma_height ), TR_MAX_WIDTH) : + MIN(1 << (cu_q->log2_chroma_width ), TR_MAX_WIDTH); + get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, - dir, tu_boundary, tu_p_size, tu_q_size, + dir, tu_boundary, tu_size_p_side, tu_size_q_side, pu_pos, pu_size, cu_q->merged, COLOR_U, tree_type); const bool large_boundary = (max_filter_length_P >= 3 && max_filter_length_Q >= 3); - const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % (LCU_WIDTH >> (tree_type == UVG_CHROMA_T)) == 0); + const bool is_chroma_hor_CTB_boundary = (dir == EDGE_HOR && y_coord % LCU_WIDTH == 0); uint8_t c_strength[2] = { 0, 0 }; @@ -1116,10 +1125,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, c_strength[1] = 2; } else if (tu_boundary){ //TODO: Add ciip/IBC related stuff - bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_U) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_U); - bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_V) - || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_V); + bool nonzero_coeffs_U = cbf_is_set(cu_q->cbf, COLOR_U) + || cbf_is_set(cu_p->cbf, COLOR_U); + bool nonzero_coeffs_V = cbf_is_set(cu_q->cbf, COLOR_V) + || cbf_is_set(cu_p->cbf, COLOR_V); c_strength[0] = nonzero_coeffs_U ? 1 : 0; c_strength[1] = nonzero_coeffs_V ? 1 : 0; } @@ -1237,11 +1246,12 @@ static void filter_deblock_unit( // Chroma pixel coordinates. const int32_t x_c = x >> 1; const int32_t y_c = y >> 1; - if (state->encoder_control->chroma_format != UVG_CSP_400 && - (is_on_8x8_grid(x_c, y_c, dir && (x_c + 4) % 32) - || (x == state->tile->frame->width - 8 && dir == 1 && y_c % 8 == 0)) + if (state->encoder_control->chroma_format != UVG_CSP_400 && + is_tu_boundary(state, x, y, dir, COLOR_UV, tree_type) + && (is_on_8x8_grid(x_c, y_c, dir == EDGE_HOR && (x_c + 4) % 32 ? 
EDGE_HOR : EDGE_VER) + || (x == state->tile->frame->width - 8 && dir == EDGE_HOR && y_c % 8 == 0)) && tree_type != UVG_LUMA_T) { - filter_deblock_edge_chroma(state, x_c, y_c, length, dir, tu_boundary, tree_type); + filter_deblock_edge_chroma(state, x_c, y_c, 2, dir, tu_boundary, tree_type); } } @@ -1271,11 +1281,11 @@ static void filter_deblock_lcu_inside(encoder_state_t * const state, for (int edge_y = y; edge_y < end_y; edge_y += 4) { for (int edge_x = x; edge_x < end_x; edge_x += 4) { - bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, luma_tree); + bool tu_boundary = is_tu_boundary(state, edge_x, edge_y, dir, COLOR_Y, luma_tree); if (tu_boundary || is_pu_boundary(state, edge_x, edge_y, dir)) { filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, luma_tree); } - if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, chroma_tree)) { + if(chroma_tree == UVG_CHROMA_T && is_tu_boundary(state, edge_x, edge_y, dir, COLOR_UV, chroma_tree)) { filter_deblock_unit(state, edge_x, edge_y, 4, 4, dir, tu_boundary, edge_x < x, chroma_tree); } } @@ -1302,7 +1312,7 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state, for (int x = x_px - 8; x < x_px; x += 4) { for (int y = y_px; y < end; y += 4) { // The top edge of the whole frame is not filtered. - bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, luma_tree); + bool tu_boundary = is_tu_boundary(state, x, y, EDGE_HOR, COLOR_Y, luma_tree); if (y > 0 && (tu_boundary || is_pu_boundary(state, x, y, EDGE_HOR))) { filter_deblock_edge_luma(state, x, y, 4, EDGE_HOR, tu_boundary); } @@ -1313,13 +1323,15 @@ static void filter_deblock_lcu_rightmost(encoder_state_t * const state, if (state->encoder_control->chroma_format != UVG_CSP_400) { const int x_px_c = x_px >> 1; const int y_px_c = y_px >> 1; - const int x_c = x_px_c - 4; - const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); - for (int y_c = y_px_c; y_c < end_c; y_c += 8) { - // The top edge of the whole frame is not filtered. - bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, chroma_tree); - if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) { - filter_deblock_edge_chroma(state, x_c , y_c, 4, EDGE_HOR, tu_boundary, chroma_tree); + int x_c = x_px_c - 4; + const int end_c_y = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); + for(; x_c < x_px_c; x_c += 2) { + for (int y_c = y_px_c; y_c < end_c_y; y_c += 8) { + // The top edge of the whole frame is not filtered. + bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR, COLOR_UV, chroma_tree); + if (y_c > 0 && (tu_boundary || is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR))) { + filter_deblock_edge_chroma(state, x_c , y_c, 2, EDGE_HOR, tu_boundary, chroma_tree); + } } } } diff --git a/src/filter.h b/src/filter.h index 0d98eedd..2db9c871 100644 --- a/src/filter.h +++ b/src/filter.h @@ -46,8 +46,8 @@ * \brief Edge direction. */ typedef enum edge_dir { - EDGE_VER = 0, // vertical - EDGE_HOR = 1, // horizontal + EDGE_VER = 1, // vertical + EDGE_HOR = 2, // horizontal } edge_dir; diff --git a/src/global.h b/src/global.h index 65ca2fa9..972b7e82 100644 --- a/src/global.h +++ b/src/global.h @@ -145,11 +145,11 @@ typedef int32_t mv_t; #define INTERNAL_MV_PREC 4 // Internal motion vector precision, 4 = 1/16 pel -//! Limits for prediction block sizes. 0 = 64x64, 4 = 4x4. +//! Limits for prediction block sizes. 
#define PU_DEPTH_INTER_MIN 0 -#define PU_DEPTH_INTER_MAX 3 +#define PU_DEPTH_INTER_MAX 8 #define PU_DEPTH_INTRA_MIN 0 -#define PU_DEPTH_INTRA_MAX 4 +#define PU_DEPTH_INTRA_MAX 8 //! Maximum number of layers in GOP structure (for allocating structures) #define MAX_GOP_LAYERS 6 @@ -273,7 +273,6 @@ typedef int32_t mv_t; #define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value)) #define CLIP_TO_QP(value) CLIP(0, 51, (value)) #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; } -#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth) #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val)) #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) diff --git a/src/inter.c b/src/inter.c index 3bbef427..d275f4ea 100644 --- a/src/inter.c +++ b/src/inter.c @@ -375,23 +375,26 @@ static void inter_cp_with_ext_border(const uvg_pixel *ref_buf, int ref_stride, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -static unsigned inter_recon_unipred(const encoder_state_t * const state, - const uvg_picture * const ref, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, - int32_t out_stride_luma, - const mv_t mv_param[2], - yuv_t *yuv_px, - yuv_im_t *yuv_im, - bool predict_luma, - bool predict_chroma) +static unsigned inter_recon_unipred( + const encoder_state_t * const state, + const uvg_picture * const ref, + int32_t out_stride_luma, + const mv_t mv_param[2], + yuv_t *yuv_px, + yuv_im_t *yuv_im, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { vector2d_t int_mv = { mv_param[0], mv_param[1] }; uvg_change_precision_vector2d(INTERNAL_MV_PREC, 0, &int_mv); + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + const vector2d_t int_mv_in_frame = { int_mv.x + pu_x + state->tile->offset_x, int_mv.y + pu_y + state->tile->offset_y @@ -507,17 +510,15 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_bipred(const encoder_state_t *const state, +void uvg_inter_recon_bipred( + const encoder_state_t *const state, const uvg_picture *ref1, const uvg_picture *ref2, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, mv_t mv_param[2][2], lcu_t *lcu, bool predict_luma, - bool predict_chroma) + bool predict_chroma, + const cu_loc_t* const cu_loc) { // Allocate maximum size arrays for interpolated and copied samples ALIGNED(64) uvg_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; @@ -525,6 +526,11 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, ALIGNED(64) uvg_pixel_im im_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; ALIGNED(64) uvg_pixel_im im_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + yuv_t px_L0; px_L0.size = pu_w * pu_h; px_L0.y = &px_buf_L0[0]; @@ -551,10 +557,10 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, // Sample blocks from both reference picture lists. // Flags state if the outputs were written to high-precision / interpolated sample buffers. 
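For orientation before the call-site changes below: uvg_bipred_average merges the two unidirectional predictions sampled here. Ignoring the high-precision im buffers that the im_flags return values select, the per-pixel operation reduces to a rounded mean; a simplified sketch, not the actual implementation:

#include <stdint.h>

/* Rounded average of two unidirectional predictions. The real
 * uvg_bipred_average additionally handles the high-bit-depth
 * intermediate (im) buffers chosen by the im_flags values above. */
static void bipred_average_px(const uint8_t *l0, const uint8_t *l1,
                              uint8_t *out, int n)
{
  for (int i = 0; i < n; ++i) {
    out[i] = (uint8_t)((l0[i] + l1[i] + 1) >> 1);
  }
}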
- unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[0], - &px_L0, &im_L0, predict_luma, predict_chroma); - unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[1], - &px_L1, &im_L1, predict_luma, predict_chroma); + unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_w, mv_param[0], &px_L0, &im_L0, predict_luma, predict_chroma, + cu_loc); + unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_w, mv_param[1], &px_L1, &im_L1, predict_luma, predict_chroma, + cu_loc); // After reconstruction, merge the predictors by taking an average of each pixel uvg_bipred_average(lcu, &px_L0, &px_L1, &im_L0, &im_L1, @@ -578,19 +584,14 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma) +void uvg_inter_recon_cu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - uvg_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i); - } + uvg_inter_pred_pu(state, lcu, predict_luma, predict_chroma, cu_loc); } static void ibc_recon_cu(const encoder_state_t * const state, @@ -599,8 +600,7 @@ static void ibc_recon_cu(const encoder_state_t * const state, int32_t y, int32_t width, bool predict_luma, - bool predict_chroma, - int i_pu) + bool predict_chroma) { const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -668,79 +668,63 @@ static void ibc_recon_cu(const encoder_state_t * const state, * \param predict_chroma Enable or disable chroma prediction for this call. * \param i_pu Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP. 
*/ -void uvg_inter_pred_pu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma, - int i_pu) +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - const int x_scu = SUB_SCU(x); - const int y_scu = SUB_SCU(y); - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); - const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); - const int pu_w = PU_GET_W(cu->part_size, width, i_pu); - const int pu_h = PU_GET_H(cu->part_size, width, i_pu); - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + const int x_scu = SUB_SCU(cu_loc->x); + const int y_scu = SUB_SCU(cu_loc->y); + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - if (cu->type == CU_IBC) { - ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu); - } else { + if (pu->inter.mv_dir == 3) { + const uvg_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + uvg_inter_recon_bipred(state, + refs[0], refs[1], + pu->inter.mv, lcu, + predict_luma, predict_chroma, + cu_loc); + } + else if (pu->type == CU_IBC) { + ibc_recon_cu(state, lcu, cu_loc->x, cu_loc->y, cu_loc->width, predict_luma, predict_chroma); + } else{ + const int mv_idx = pu->inter.mv_dir - 1; + const uvg_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; - if (pu->inter.mv_dir == 3) { - const uvg_picture * const refs[2] = { - state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]], - state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]], - }; - uvg_inter_recon_bipred( - state, - refs[0], - refs[1], - pu_x, - pu_y, - pu_w, - pu_h, - pu->inter.mv, - lcu, - predict_luma, - predict_chroma); - } else { - const int mv_idx = pu->inter.mv_dir - 1; - const uvg_picture * const ref = - state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]; + const unsigned offset_luma = SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x); + const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2; + yuv_t lcu_adapter; + lcu_adapter.size = cu_loc->width * cu_loc->height; + lcu_adapter.y = lcu->rec.y + offset_luma; + lcu_adapter.u = lcu->rec.u + offset_chroma; + lcu_adapter.v = lcu->rec.v + offset_chroma; - const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); - const unsigned offset_chroma = - SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2; - yuv_t lcu_adapter; - lcu_adapter.size = pu_w * pu_h; - lcu_adapter.y = lcu->rec.y + offset_luma, - lcu_adapter.u = lcu->rec.u + offset_chroma, - lcu_adapter.v = lcu->rec.v + offset_chroma, - - inter_recon_unipred( - state, - ref, - pu_x, - pu_y, - pu_w, - pu_h, - LCU_WIDTH, - pu->inter.mv[mv_idx], - &lcu_adapter, - NULL, - predict_luma, - predict_chroma); - } + inter_recon_unipred(state, + ref, + LCU_WIDTH, pu->inter.mv[mv_idx], + &lcu_adapter, + NULL, + predict_luma, + predict_chroma, + cu_loc); } if (predict_chroma && state->encoder_control->cfg.jccr) { const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; - uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(lcu->rec.v + offset, 
lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -915,14 +899,12 @@ static bool is_b0_cand_coded(int x, int y, int width, int height) * \param ref_idx index in the reference list * \param cand_out will be filled with C0 and C1 candidates */ -static void get_temporal_merge_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - uint8_t ref_list, - uint8_t ref_idx, - merge_candidates_t *cand_out) +static void get_temporal_merge_candidates( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + uint8_t ref_list, + uint8_t ref_idx, + merge_candidates_t *cand_out) { /* Predictor block locations @@ -951,8 +933,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref]; int cu_per_width = ref_cu_array->width / SCU_WIDTH; - int32_t xColBr = x + width; - int32_t yColBr = y + height; + int32_t xColBr = cu_loc->x + cu_loc->width; + int32_t yColBr = cu_loc->y + cu_loc->height; // C0 must be available if (xColBr < state->encoder_control->in.width && @@ -972,8 +954,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } } } - int32_t xColCtr = x + (width / 2); - int32_t yColCtr = y + (height / 2); + int32_t xColCtr = cu_loc->x + (cu_loc->width / 2); + int32_t yColCtr = cu_loc->y + (cu_loc->height / 2); // C1 must be inside the LCU, in the center position of current CU if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) { @@ -1254,10 +1236,7 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates(int32_t x, - int32_t y, - int32_t width, - int32_t height, +static void get_spatial_merge_candidates(const cu_loc_t* const cu_loc, int32_t picture_width, int32_t picture_height, lcu_t *lcu, @@ -1276,8 +1255,13 @@ static void get_spatial_merge_candidates(int32_t x, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int32_t x_local = SUB_SCU(cu_loc->x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(cu_loc->y); + + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; // A0 and A1 availability testing if (x != 0) { cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); @@ -1350,15 +1334,13 @@ static void get_spatial_merge_candidates(int32_t x, * \param picture_height tile height in pixels * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates_cua(const cu_array_t *cua, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - int32_t picture_width, - int32_t picture_height, - merge_candidates_t *cand_out, - bool wpp) +static void get_spatial_merge_candidates_cua( + const cu_array_t *cua, + int32_t picture_width, + int32_t picture_height, + merge_candidates_t *cand_out, + bool wpp, + const cu_loc_t* const cu_loc) { /* Predictor block locations @@ -1370,8 +1352,12 @@ static 
void get_spatial_merge_candidates_cua(const cu_array_t *cua, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; + const int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(y); // A0 and A1 availability testing if (x != 0) { const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1); @@ -1484,15 +1470,13 @@ static bool add_temporal_candidate(const encoder_state_t *state, /** * \brief Pick two mv candidates from the spatial and temporal candidates. */ -static void get_mv_cand_from_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - const merge_candidates_t *merge_cand, - const cu_info_t * const cur_cu, - int8_t reflist, - mv_t mv_cand[2][2]) +static void get_mv_cand_from_candidates( + const encoder_state_t * const state, + const merge_candidates_t *merge_cand, + const cu_info_t * const cur_cu, + int8_t reflist, + mv_t mv_cand[2][2], + int ctu_row) { const cu_info_t *const *a = merge_cand->a; const cu_info_t *const *b = merge_cand->b; @@ -1552,7 +1536,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, if (candidates < AMVP_MAX_NUM_CANDS) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size[ctu_row]; for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) { @@ -1595,32 +1578,30 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t * const cur_cu, - lcu_t *lcu, - int8_t reflist) +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t * const cur_cu, + lcu_t *lcu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); - } else { - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } else { + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], 
&mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1637,31 +1618,29 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, * \param cur_cu current CU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist) +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const cu_array_t *cua = state->tile->frame->cu_array; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu, NULL,cua,cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { get_spatial_merge_candidates_cua(cua, - x, y, width, height, - state->tile->frame->width, state->tile->frame->height, - &merge_cand, state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + state->tile->frame->width, state->tile->frame->height, &merge_cand, state->encoder_control->cfg.wpp, + cu_loc); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1885,23 +1864,23 @@ void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv) { * \param lcu lcu containing the block * \return number of merge candidates */ -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu) +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu) { uint8_t candidates = 0; int8_t zero_idx = 0; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; merge_candidates_t merge_cand = { 0 }; const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; + // Current CU + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(cu_loc->x), SUB_SCU(cu_loc->y)); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); if(cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) { mv_cand[i].dir = 1; mv_cand[i].mv[0][0] = ibc_mv_cand[i][0]; @@ -1909,18 +1888,16 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, } return IBC_MRG_MAX_NUM_CANDS; } - - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level, 
state->encoder_control->cfg.wpp); + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); const cu_info_t **a = merge_cand.a; const cu_info_t **b = merge_cand.b; - if (!use_a1) a[1] = NULL; - if (!use_b1) b[1] = NULL; + const int x = cu_loc->x; + const int y = cu_loc->y; if (different_mer(x, y, x, y - 1, parallel_merge_level) && add_merge_candidate(b[1], NULL, NULL, &mv_cand[candidates])) candidates++; if (different_mer(x, y, x - 1, y, parallel_merge_level) && add_merge_candidate(a[1], b[1], NULL, &mv_cand[candidates])) candidates++; @@ -1941,7 +1918,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, for (int reflist = 0; reflist <= max_reflist; reflist++) { // Fetch temporal candidates for the current CU // ToDo: change collocated_from_l0_flag to allow L1 ref - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); // TODO: enable L1 TMVP candidate // get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand); @@ -1973,7 +1950,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, if (candidates == max_num_cands) return candidates; if (candidates != max_num_cands - 1) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size[ctu_row]; diff --git a/src/inter.h b/src/inter.h index 45f5e5ea..4d5fccd5 100644 --- a/src/inter.h +++ b/src/inter.h @@ -58,61 +58,51 @@ void uvg_change_precision_vector2d(int src, int dst, vector2d_t* mv); void uvg_round_precision(int src, int dst, mv_t* hor, mv_t* ver); void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv); -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma); - -void uvg_inter_pred_pu(const encoder_state_t * const state, +void uvg_inter_recon_cu( + const encoder_state_t * const state, lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, bool predict_luma, bool predict_chroma, - int i_pu); + const cu_loc_t* const cu_loc); + +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu); -void uvg_inter_recon_bipred(const encoder_state_t * const state, - const uvg_picture * ref1, - const uvg_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - mv_t mv_param[2][2], - lcu_t* lcu, - bool predict_luma, - bool predict_chroma); +void uvg_inter_recon_bipred( + const encoder_state_t * const state, + const uvg_picture * ref1, + const uvg_picture * ref2, + mv_t mv_param[2][2], + lcu_t* lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - lcu_t *lcu, - int8_t reflist); +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + lcu_t *lcu, + int8_t reflist, + const cu_loc_t* 
const cu_loc); -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist); +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc); -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu); +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu); #endif diff --git a/src/intra.c b/src/intra.c index 7e742d46..22eb93c7 100644 --- a/src/intra.c +++ b/src/intra.c @@ -37,6 +37,10 @@ #include "image.h" #include "uvg_math.h" #include "mip_data.h" +#include "rdo.h" +#include "search.h" +#include "search_intra.h" +#include "strategies-picture.h" #include "strategies/strategies-intra.h" #include "tables.h" #include "transform.h" @@ -197,6 +201,7 @@ int8_t uvg_intra_get_dir_luma_predictor( static void intra_filter_reference( int_fast8_t log2_width, + int_fast8_t log2_height, uvg_intra_references *refs) { if (refs->filtered_initialized) { @@ -206,6 +211,7 @@ static void intra_filter_reference( } const int_fast8_t ref_width = 2 * (1 << log2_width) + 1; + const int_fast8_t ref_height = 2 * (1 << log2_height) + 1; uvg_intra_ref *ref = &refs->ref; uvg_intra_ref *filtered_ref = &refs->filtered_ref; @@ -213,14 +219,13 @@ static void intra_filter_reference( filtered_ref->left[0] = (ref->left[1] + 2 * ref->left[0] + ref->top[1] + 2) >> 2; filtered_ref->top[0] = filtered_ref->left[0]; - // TODO: use block height here instead of ref_width // Top to bottom - for (int_fast8_t y = 1; y < ref_width - 1; ++y) { + for (int_fast8_t y = 1; y < ref_height - 1; ++y) { uvg_pixel *p = &ref->left[y]; filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) >> 2; } // Bottom left (not filtered) - filtered_ref->left[ref_width - 1] = ref->left[ref_width - 1]; + filtered_ref->left[ref_height - 1] = ref->left[ref_height - 1]; // Left to right for (int_fast8_t x = 1; x < ref_width - 1; ++x) { @@ -231,39 +236,48 @@ static void intra_filter_reference( filtered_ref->top[ref_width - 1] = ref->top[ref_width - 1]; } - /** * \brief Generate dc prediction. -* \param log2_width Log2 of width, range 2..5. +* \param cu_loc CU location and size data. +* \param color Color channel. * \param ref_top Pointer to -1 index of above reference, length=width*2+1. * \param ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. * \param multi_ref_idx Multi reference line index for use with MRL. */ static void intra_pred_dc( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const out_block, const uint8_t multi_ref_idx) { - int_fast8_t width = 1 << log2_width; - + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int_fast16_t sum = 0; - for (int_fast8_t i = 0; i < width; ++i) { - sum += ref_top[i + 1 + multi_ref_idx]; - sum += ref_left[i + 1 + multi_ref_idx]; + // Only one loop is done for non-square blocks. 
+ // In case of non-square blocks, only the longer reference is summed. + if (width >= height) { + for (int_fast8_t i = 0; i < width; ++i) { + sum += ref_top[i + 1 + multi_ref_idx]; + } + } + if (width <= height) { + for (int_fast8_t j = 0; j < height; ++j) { + sum += ref_left[j + 1 + multi_ref_idx]; + } } // JVET_K0122 - // TODO: take non-square blocks into account - const int denom = width << 1; + const int denom = width == height ? width << 1 : MAX(width, height); const int divShift = uvg_math_floor_log2(denom); const int divOffset = denom >> 1; const uvg_pixel dc_val = (sum + divOffset) >> divShift; //const uvg_pixel dc_val = (sum + width) >> (log2_width + 1); - const int_fast16_t block_size = 1 << (log2_width * 2); + const int_fast16_t block_size = width * height; for (int_fast16_t i = 0; i < block_size; ++i) { out_block[i] = dc_val; @@ -271,6 +285,33 @@ static void intra_pred_dc( } +bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t * const luma_loc, cu_info_t const * const cur_cu, enum + uvg_tree_type tree_type) +{ + if (tree_type != UVG_CHROMA_T) { + return true; + } + uint32_t chroma_split_depth0 = GET_SPLITDATA(cur_cu, 0); + uint32_t chroma_split_depth1 = GET_SPLITDATA(cur_cu, 1); + bool allow = false; + if (chroma_split_depth0 == QT_SPLIT || (chroma_split_depth0 == BT_HOR_SPLIT && chroma_split_depth1 == BT_VER_SPLIT)) allow = true; + else if (chroma_split_depth0 == NO_SPLIT) allow = true; + else if (chroma_split_depth0 == BT_HOR_SPLIT && chroma_split_depth1 == NO_SPLIT) allow = true; + if (!allow) { + return false; + } + const cu_info_t* const luma_cu = uvg_cu_array_at_const(state->tile->frame->cu_array, luma_loc->x, luma_loc->y); + uint32_t split = GET_SPLITDATA(luma_cu, 0); + if (split != NO_SPLIT) { + allow = split == QT_SPLIT; + } + else if (split != NO_SPLIT && luma_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + allow = false; + } + return allow; +} + + enum lm_mode { LM_CHROMA_IDX = 81, @@ -286,7 +327,7 @@ static void get_cclm_parameters( uvg_intra_ref* luma_src, uvg_intra_references*chroma_ref, int16_t *a, int16_t*b, int16_t*shift) { - const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX); + const int base_unit_size = 4; // TODO: take into account YUV422 const int unit_w = base_unit_size >> 1; @@ -312,8 +353,8 @@ static void get_cclm_parameters( //int left_below_units = total_left_units - tu_height_in_units; //int avai_above_right_units = 0; // TODO these are non zero only with non-square CUs //int avai_left_below_units = 0; - int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size); - int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size); + int avai_above_units = y0 ? tu_width_in_units : 0; + int avai_left_units = x0 ? 
tu_height_in_units : 0; bool above_available = avai_above_units != 0; bool left_available = avai_left_units != 0; @@ -491,9 +532,8 @@ static void predict_cclm( const lcu_t* const lcu, uvg_intra_references* chroma_ref, uvg_pixel* dst, - cclm_parameters_t* cclm_params, - enum uvg_tree_type tree_type - ) + cclm_parameters_t* cclm_params +) { assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX); assert(state->encoder_control->cfg.cclm); @@ -511,20 +551,14 @@ static void predict_cclm( const uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - - // Essentially what this does is that it uses 6-tap filtering to downsample - // the luma intra references down to match the resolution of the chroma channel. - // The luma reference is only needed when we are not on the edge of the picture. - // Because the reference pixels that are needed on the edge of the ctu this code - // is kinda messy but what can you do - const int ctu_size = tree_type == UVG_CHROMA_T ? LCU_WIDTH_C : LCU_WIDTH; + + const int ctu_size = LCU_WIDTH; if (y0) { - if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 2), (state->tile->frame->width - x0 - width* 2) / 2); + if (y_scu == 0) available_above_right = MIN(MIN(width / 2, (64-x_scu - width * 2) / 4), (state->tile->frame->width - x0 - width* 2) / 4); for (; available_above_right < width / 2; available_above_right++) { int x_extension = x_scu + width * 2 + 4 * available_above_right; - x_extension >>= tree_type == UVG_CHROMA_T; - const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, (y_scu >> (tree_type==UVG_CHROMA_T)) - 4); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, (y_scu) - 4); if (x_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; } if(y_scu == 0) { @@ -547,13 +581,12 @@ static void predict_cclm( } if(x0) { - if (x_scu == 0) available_left_below = MIN(MIN(width / 2, (64 - y_scu - height * 2) / 2), (state->tile->frame->height - y0 - height * 2) / 2); + if (x_scu == 0) available_left_below = MIN(MIN(height / 2, (64 - y_scu - height * 2) / 4), (state->tile->frame->height - y0 - height * 2) / 4); for (; available_left_below < height / 2; available_left_below++) { int y_extension = y_scu + height * 2 + 4 * available_left_below; - y_extension >>= tree_type == UVG_CHROMA_T; - const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu >> (tree_type == UVG_CHROMA_T)) - 4, y_extension); + const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, (x_scu) - 4, y_extension); if (y_extension >= ctu_size || pu->type == CU_NOTSET || (pu->type == CU_INTRA && pu->intra.mode_chroma == -1)) break; - if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break; + if(x_scu == 32 && y_scu == 0 && pu->log2_height == 6 && pu->log2_width == 6 ) break; } for(int i = 0; i < height + available_left_below * 2; i++) { sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1]; @@ -573,12 +606,18 @@ static void predict_cclm( } -int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a) { +uint8_t uvg_get_mip_flag_context( + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + cu_array_t* const cu_a) { assert(!(lcu && cu_a)); - if (width > 2 * height || height > 2 * width) { + if (cu_loc->width > 2 * cu_loc->height || cu_loc->height > 2 * cu_loc->width) { return 3; } - + + const int x = cu_loc->x; + const int y = 
cu_loc->y; + int context = 0; const cu_info_t* left = NULL; const cu_info_t* top = NULL; @@ -898,39 +937,77 @@ static void mip_predict( } +int8_t uvg_wide_angle_correction( + int_fast8_t mode, + const int log2_width, + const int log2_height, + const + bool account_for_dc_planar) +{ + int8_t pred_mode = mode; + if (log2_width != log2_height) { + if (mode > 1 && mode <= 66) { + const int modeShift[] = { 0, 6, 10, 12, 14, 15 }; + const int deltaSize = abs(log2_width - log2_height); + if (log2_width > log2_height && mode < 2 + modeShift[deltaSize]) { + pred_mode += (66 - 1); + } + else if (log2_height > log2_width && mode > 66 - modeShift[deltaSize]) { + pred_mode -= (66 - 1) + (account_for_dc_planar ? 2 : 0); + } + } + } + return pred_mode; +} + static void intra_predict_regular( const encoder_state_t* const state, uvg_intra_references *refs, - int_fast8_t log2_width, + const cu_info_t* const cur_cu, + const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, int_fast8_t mode, color_t color, uvg_pixel *dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode) { - const int_fast8_t width = 1 << log2_width; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; const uvg_config *cfg = &state->encoder_control->cfg; // MRL only for luma uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0; + uint8_t isp = color == COLOR_Y ? isp_mode : 0; + + // Wide angle correction + int8_t pred_mode = uvg_wide_angle_correction( + mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + false + ); const uvg_intra_ref *used_ref = &refs->ref; - if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || width == 4 || multi_ref_index) { + if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index || isp_mode /*ISP_TODO: replace this fake ISP check*/) { // For chroma, DC and 4x4 blocks, always use unfiltered reference. } else if (mode == 0) { // Otherwise, use filtered for planar. - if (width * width > 32) { + if (width * height > 32) { used_ref = &refs->filtered_ref; } } else { // Angular modes use smoothed reference pixels, unless the mode is close // to being either vertical or horizontal. static const int uvg_intra_hor_ver_dist_thres[8] = {24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_width) >> 1]; - int dist_from_vert_or_hor = MIN(abs(mode - 50), abs(mode - 18)); + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; + int dist_from_vert_or_hor = MIN(abs(pred_mode - 50), abs(pred_mode - 18)); if (dist_from_vert_or_hor > filter_threshold) { static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (mode >= 34) ? mode - 50 : 18 - mode; + const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; const int_fast8_t sample_disp = (mode_disp < 0 ? 
-1 : 1) * modedisp2sampledisp[abs(mode_disp)]; if ((abs(sample_disp) & 0x1F) == 0) { used_ref = &refs->filtered_ref; @@ -939,38 +1016,66 @@ static void intra_predict_regular( } if (used_ref == &refs->filtered_ref && !refs->filtered_initialized) { - intra_filter_reference(log2_width, refs); + intra_filter_reference(log2_width, log2_height, refs); } if (mode == 0) { - uvg_intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst); + uvg_intra_pred_planar(pu_loc, color, used_ref->top, used_ref->left, dst); } else if (mode == 1) { - intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst, multi_ref_index); + intra_pred_dc(pu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index); } else { - uvg_angular_pred(log2_width, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index); + uvg_angular_pred( + pu_loc, + pred_mode, + color, + used_ref->top, + used_ref->left, + dst, + multi_ref_index, + isp, + isp_mode == ISP_MODE_HOR ? cu_loc->height : cu_loc->width); } // pdpc // bool pdpcCondition = (mode == 0 || mode == 1 || mode == 18 || mode == 50); bool pdpcCondition = (mode == 0 || mode == 1); // Planar and DC + pdpcCondition &= width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH; if (pdpcCondition && multi_ref_index == 0) // Cannot be used with MRL. { - uvg_pdpc_planar_dc(mode, width, log2_width, used_ref, dst); + uvg_pdpc_planar_dc(mode, pu_loc, color, used_ref, dst); } } void uvg_intra_build_reference_any( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, const lcu_t *const lcu, uvg_intra_references *const refs, const uint8_t multi_ref_idx, - uvg_pixel *extra_ref_lines) + uvg_pixel *extra_ref_lines, + const uint8_t isp_mode) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // These are only used with ISP, so no need to check chroma + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + const int pu_x = pu_loc->x; + const int pu_y = pu_loc->y; + const int cu_x = cu_loc->x; + const int cu_y = cu_loc->y; + + bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; + + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); refs->filtered_initialized = false; uvg_pixel *out_left_ref = &refs->ref.left[0]; @@ -978,8 +1083,7 @@ void uvg_intra_build_reference_any( const uvg_pixel dc_val = 1 << (UVG_BIT_DEPTH - 1); //TODO: add used bitdepth as a variable const int is_chroma = color != COLOR_Y ? 1 : 0; - // TODO: height for non-square blocks - const int_fast8_t width = 1 << log2_width; + const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multi ref index from CU under prediction or reconstrcution. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1038,12 +1142,24 @@ void uvg_intra_build_reference_any( // Generate left reference. if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. 
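A worked example for the uvg_wide_angle_correction hunk above, restating what its modeShift table yields for one hypothetical block shape:

static void wide_angle_example(void)
{
  /* A 16x4 block: log2_width 4, log2_height 2, so deltaSize = 2 and
   * modeShift[2] = 10. Angular modes 2..11 then satisfy
   * mode < 2 + modeShift and are remapped into the wide-angle range
   * past mode 66. */
  int8_t mode = uvg_wide_angle_correction(5, 4, 2, false);
  /* 5 < 2 + 10, hence mode == 5 + (66 - 1) == 70. */
  (void)mode;
}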
- int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + int px_available_left; + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_VER) { + px_available_left = height; + } + else { + px_available_left = uvg_count_available_edge_cus(cu_loc, lcu, true) * 4; + px_available_left -= pu_loc->y - cu_loc->y; + } + } + else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = !is_chroma ? num_cus * 4 : num_cus * 2; + } // Limit the number of available pixels based on block size and dimensions // of the picture. - // TODO: height for non-square blocks - px_available_left = MIN(px_available_left, width * 2 + multi_ref_index); + px_available_left = MIN(px_available_left, cu_height * 2 + multi_ref_index); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -1053,13 +1169,18 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = left_border[(px_available_left - 1) * left_stride]; - for (int i = px_available_left; i < width * 2 + multi_ref_index * 2; ++i) { + + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (int i = px_available_left; i < tmp_h + multi_ref_index * 2; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // If we are on the left edge, extend the first pixel of the top row. uvg_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val; - for (int i = 0; i < width * 2 + multi_ref_index; i++) { + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (int i = 0; i < tmp_h + multi_ref_index; i++) { // Reserve space for top left reference out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } @@ -1142,13 +1263,26 @@ void uvg_intra_build_reference_any( } // Generate top reference. + int px_available_top; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; - + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_HOR) { + px_available_top = width; + } + else { + px_available_top = uvg_count_available_edge_cus(cu_loc, lcu, false) * 4; + px_available_top -= pu_loc->x - cu_loc->x; + } + } + else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = !is_chroma ? num_cus * 4 : num_cus * 2; + } + // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); // Copy all the pixels we can. @@ -1157,20 +1291,28 @@ void uvg_intra_build_reference_any( } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = top_border[px_available_top - 1]; - for (int i = px_available_top; i < width * 2 + multi_ref_index * 2; ++i) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? 
cu_width + width : width * 2); + for (int i = px_available_top; i < tmp_w + multi_ref_index * 2; ++i) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { // Extend nearest pixel. uvg_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val; - for (int i = 0; i < width * 2 + multi_ref_index; i++) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); + for (int i = 0; i < tmp_w + multi_ref_index * 2; i++) { out_top_ref[i + 1] = nearest_pixel; } } } void uvg_intra_build_reference_inner( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -1178,17 +1320,33 @@ void uvg_intra_build_reference_inner( uvg_intra_references *const refs, bool entropy_sync, const uint8_t multi_ref_idx, - uvg_pixel* extra_ref_lines) + uvg_pixel* extra_ref_lines, + uint8_t isp_mode) { - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int cu_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // These are only used with ISP, so no need to check chroma + const int pu_x = pu_loc->x; + const int pu_y = pu_loc->y; + const int cu_x = cu_loc->x; + const int cu_y = cu_loc->y; + + bool is_first_isp_block = isp_mode ? pu_x == cu_x && pu_y == cu_y : false; + + // Log2_dim 1 is possible with ISP blocks + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); refs->filtered_initialized = false; uvg_pixel * __restrict out_left_ref = &refs->ref.left[0]; uvg_pixel * __restrict out_top_ref = &refs->ref.top[0]; const int is_chroma = color != COLOR_Y ? 1 : 0; - // TODO: height for non-sqaure blocks - const int_fast8_t width = 1 << log2_width; + const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multiRefIdx from CU under prediction. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -1288,27 +1446,59 @@ void uvg_intra_build_reference_inner( } // Generate left reference. -// Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + // Get the number of reference pixels based on the PU coordinate within the LCU. + int px_available_left; + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_VER) { + px_available_left = height; + } + else { + px_available_left = uvg_count_available_edge_cus(cu_loc, lcu, true) * 4; + px_available_left -= pu_loc->y - cu_loc->y; + } + + } + else { + if(!is_dual_tree) { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = is_dual_tree || !is_chroma ? num_cus * 4 : num_cus * 2; + } else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, true); + px_available_left = !is_chroma ? num_cus * 4 : num_cus * 2; + } + } // Limit the number of available pixels based on block size and dimensions // of the picture. 
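The tmp_h/tmp_w expressions recurring in the extension loops above and below all encode one rule for how far the reference samples must be extended under ISP. A condensed restatement for the left column (luma only; left_ref_extension is an illustrative name, not a helper in the patch):

/* How far the left reference column is extended past the available
 * pixels, per the hunks above (multi_ref_index offsets omitted). */
static int left_ref_extension(int isp_mode, int is_first_isp_block,
                              int cu_height, int pu_height)
{
  if (is_first_isp_block) {
    return cu_height * 2;         /* First ISP partition: sample as if it were the unsplit CU. */
  }
  if (isp_mode) {
    return cu_height + pu_height; /* Later partitions reach to the end of the CU. */
  }
  return pu_height * 2;           /* Non-ISP blocks: the usual two block heights. */
}

The top reference row follows the same rule with widths in place of heights.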
- px_available_left = MIN(px_available_left, width * 2); + px_available_left = MIN(px_available_left, cu_height * 2); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. int i = multi_ref_index; // Offset by multi_ref_index - do { - out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; - out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; - out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride]; - out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride]; - i += 4; - } while (i < px_available_left); + + // Do different loop for heights smaller than 4 (possible for some ISP splits) + if (px.y % 4 != 0 || px_available_left < 4) { + do { + out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; + i += 1; + } while (i < px_available_left); + } + else { + do { + out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; + out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride]; + out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride]; + out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride]; + i += 4; + } while (i < px_available_left); + } // Extend the last pixel for the rest of the reference values. uvg_pixel nearest_pixel = out_left_ref[i]; - for (; i < width * 2; i += 4) { + + // If first isp split, take samples as if it were normal square block + int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); + for (; i < tmp_h; i += 4) { out_left_ref[i + 1] = nearest_pixel; out_left_ref[i + 2] = nearest_pixel; out_left_ref[i + 3] = nearest_pixel; @@ -1317,7 +1507,7 @@ void uvg_intra_build_reference_inner( // Extend for MRL if (multi_ref_index) { - for (; i < width * 2 + multi_ref_index; ++i) { + for (; i < height * 2 + multi_ref_index; ++i) { out_left_ref[i + 1] = nearest_pixel; } } @@ -1325,11 +1515,24 @@ void uvg_intra_build_reference_inner( // Generate top reference. // Get the number of reference pixels based on the PU coordinate within the LCU. - int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma; + int px_available_top; + if (isp_mode && !is_first_isp_block && !is_chroma) { + if (isp_mode == ISP_MODE_HOR) { + px_available_top = width; + } + else { + px_available_top = uvg_count_available_edge_cus(cu_loc, lcu, false) * 4; + px_available_top -= pu_loc->x - cu_loc->x; + } + } + else { + const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); + px_available_top = !is_chroma ? num_cus * 4 : num_cus * 2; + } // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x) -1); @@ -1343,7 +1546,10 @@ void uvg_intra_build_reference_inner( // Extend the last pixel for the rest of the reference values. nearest_pixel = out_top_ref[i + multi_ref_index]; - for (; i < (width + multi_ref_index) * 2; i += 4) { + + // If first isp split, take samples as if it were normal square block + int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? 
cu_width + width : width * 2); + for (; i < tmp_w + (multi_ref_index * 2); i += 4) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; out_top_ref[i + 2 + multi_ref_index] = nearest_pixel; out_top_ref[i + 3 + multi_ref_index] = nearest_pixel; @@ -1351,8 +1557,11 @@ void uvg_intra_build_reference_inner( } } + void uvg_intra_build_reference( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -1360,15 +1569,19 @@ void uvg_intra_build_reference( uvg_intra_references *const refs, bool entropy_sync, uvg_pixel *extra_ref_lines, - uint8_t multi_ref_idx) + uint8_t multi_ref_idx, + const uint8_t isp_mode) { assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references."); + //bool first_split = color == COLOR_Y && isp_mode && pu_loc->x == cu_loc->x && pu_loc->y == cu_loc->y; + //uint8_t isp = first_split ? 0 : isp_mode; + // Much logic can be discarded if not on the edge if (luma_px->x > 0 && luma_px->y > 0) { - uvg_intra_build_reference_inner(log2_width, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_inner(state, pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines, isp_mode); } else { - uvg_intra_build_reference_any(log2_width, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines); + uvg_intra_build_reference_any(state, pu_loc, cu_loc, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines, isp_mode); } } @@ -1377,21 +1590,21 @@ void uvg_intra_predict( const encoder_state_t* const state, uvg_intra_references* const refs, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, const color_t color, uvg_pixel* dst, const intra_search_data_t* data, - const lcu_t* lcu, - enum uvg_tree_type tree_type - ) + const lcu_t* lcu +) { const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); // TODO: what is this used for? // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm); bool use_mip = false; - const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; - const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; - const int x = cu_loc->x; - const int y = cu_loc->y; + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int x = pu_loc->x; + const int y = pu_loc->y; int8_t intra_mode = color == COLOR_Y ? data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma; if (data->pred_cu.intra.mip_flag) { if (color == COLOR_Y) { @@ -1407,68 +1620,153 @@ void uvg_intra_predict( mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed); } else { - intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx); + intra_predict_regular(state, refs, &data->pred_cu, cu_loc, pu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx, data->pred_cu.intra.isp_mode); } } else { - uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width); - if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 
0 : 1].b <= 0) { + uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, height, stride / 2, width); + if (width != 1 << data->pred_cu.log2_chroma_width || height != 1 << data->pred_cu.log2_chroma_height || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) { predict_cclm( - state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, - (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1], - tree_type); + state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, + (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1]); } else { - linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width); + linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, height); } } } // This function works on luma coordinates -const cu_info_t* uvg_get_co_located_luma_cu( - int x, - int y, - int width, - int height, +int8_t uvg_get_co_located_luma_mode( + const cu_loc_t* const chroma_loc, + const cu_loc_t* const cu_loc, + const cu_info_t* luma_cu, const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type) { + int x = chroma_loc->x; + int y = chroma_loc->y; assert((cu_array || lcu) && !(cu_array && lcu)); assert(tree_type != UVG_LUMA_T && "Luma only CU shouldn't need colocated luma CU"); if(tree_type == UVG_CHROMA_T) { - x += width >> 1; - y += height >> 1; + x += chroma_loc->width >> 1; + y += chroma_loc->height >> 1; } - if(cu_array) { - return uvg_cu_array_at_const(cu_array, x, y); + const cu_info_t* cu; + if (lcu && cu_loc->x <= x && x < cu_loc->x + cu_loc->width && cu_loc->y <= y && y < cu_loc->y + cu_loc->height) { + cu = luma_cu; + } + else if(cu_array) { + cu = uvg_cu_array_at_const(cu_array, x, y); } else { - return LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); } + if (cu->intra.mip_flag) { + return 0; + } + return cu->intra.mode; +} + + + + +/** +* \brief Returns ISP split partition size based on block dimensions and split type. +* +* Returns ISP split partition size based on block dimensions and split type. +* Will fail if resulting partition size has less than 16 samples. +* +* \param width Block width. +* \param height Block height. +* \param split_type Horizontal or vertical split. +* \param is_transform_split True when computing transform block dimensions instead of prediction block dimensions. +*/ +int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_split) +{ + assert(split_type != ISP_MODE_NO_ISP && "Cannot calculate split dimension if no split type is set. Make sure this function is not called in this case."); + + bool divide_in_rows = split_type == SPLIT_TYPE_HOR; + int split_dim_size, non_split_dim_size, partition_size, div_shift = 2; + + if (divide_in_rows) { + split_dim_size = height; + non_split_dim_size = width; + } + else { + split_dim_size = width; + non_split_dim_size = height; + } + + const int min_num_samples = 16; // Minimum allowed number of samples for split block + const int factor_to_min_samples = non_split_dim_size < min_num_samples ? min_num_samples >> uvg_math_floor_log2(non_split_dim_size) : 1; + partition_size = (split_dim_size >> div_shift) < factor_to_min_samples ? factor_to_min_samples : (split_dim_size >> div_shift); + + // Minimum width for ISP splits is 4. (JVET-T2001 chapter 8.4.5.1 equation 246: nPbW = Max(4, nW)) + // Except this does not apply for transform blocks for some reason. VTM does seem to expect 4 transform blocks even if only two pred blocks were used + // Height can be 2.
+ if (!divide_in_rows && !is_transform_split) { + partition_size = MAX(4, partition_size); + } + + assert((uvg_math_floor_log2(partition_size) + uvg_math_floor_log2(non_split_dim_size) >= uvg_math_floor_log2(min_num_samples)) && + "Partition has less than allowed minimum number of samples."); + return partition_size; +} + + +int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_split) +{ + assert((split_type != ISP_MODE_NO_ISP) && "This function cannot be called if ISP mode is 0."); + int split_dim = uvg_get_isp_split_dim(width, height, split_type, is_transform_split); + int num = split_type == ISP_MODE_HOR ? height / split_dim : width / split_dim; + + return num; +} + + +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_split) +{ + // Check for illegal splits + assert(!(block_w == 4 && block_h == 4) || split_idx == 0 && "Trying to get ISP split CU when split is not allowed."); + assert(!((block_w * block_h) <= 16) || split_idx < 2 && "Split index for small blocks must be in [0, 1]"); + assert((split_idx >= 0 && split_idx <= 3) && "ISP split index must be in [0, 3]."); + assert((split_type != ISP_MODE_NO_ISP || split_idx == 0) && "Trying to ISP split when split type = NO_ISP."); + int part_dim = block_w; + if (split_type != ISP_MODE_NO_ISP) { + part_dim = uvg_get_isp_split_dim(block_w, block_h, split_type, is_transform_split); + } + if(split_type == ISP_MODE_VER && block_w < 16 && block_h != 4 && !is_transform_split) { + split_idx /= 2; + } + const int offset = part_dim * split_idx; + + const int part_x = split_type == ISP_MODE_HOR ? x : x + offset; + const int part_y = split_type == ISP_MODE_HOR ? y + offset : y; + const int part_w = split_type == ISP_MODE_HOR ? block_w : part_dim; + const int part_h = split_type == ISP_MODE_HOR ? part_dim : block_h; + + uvg_cu_loc_ctor(loc, part_x, part_y, part_w, part_h); } static void intra_recon_tb_leaf( encoder_state_t* const state, - int x, - int y, - int depth, + const cu_loc_t* pu_loc, + const cu_loc_t* cu_loc, lcu_t *lcu, color_t color, - const intra_search_data_t* search_data, - enum uvg_tree_type tree_type) + const intra_search_data_t* search_data) { const uvg_config *cfg = &state->encoder_control->cfg; const int shift = color == COLOR_Y ? 0 : 1; - int log2width = LOG2_LCU_WIDTH - depth; - if (color != COLOR_Y && depth < MAX_PU_DEPTH) { - // Chroma width is half of luma width, when not at maximum depth. - log2width -= 1; - } - const int width = 1 << log2width; - const int height = width; // TODO: proper height for non-square blocks + const int x = pu_loc->x; + const int y = pu_loc->y; + + const int width = color == COLOR_Y ? pu_loc->width : pu_loc->chroma_width; + const int height = color == COLOR_Y ? pu_loc->height : pu_loc->chroma_height; + const int lcu_width = LCU_WIDTH >> shift; const vector2d_t luma_px = { x, y }; @@ -1480,8 +1778,10 @@ static void intra_recon_tb_leaf( int y_scu = SUB_SCU(y); const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift }; uint8_t multi_ref_index = color == COLOR_Y ? search_data->pred_cu.intra.multi_ref_idx: 0; + uint8_t isp_mode = color == COLOR_Y ? search_data->pred_cu.intra.isp_mode : 0; uvg_intra_references refs; + // Extra reference lines for use with MRL. Extra lines needed only for left edge. 
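To make the arithmetic of uvg_get_isp_split_dim() and uvg_get_isp_split_num() above concrete, here is a standalone sketch of the prediction-block case (floor_log2 and isp_split_dim are illustrative names; the 4-wide clamp applies only to vertical prediction splits, as in the patch):

#include <stdio.h>

static int floor_log2(int v) { int r = -1; while (v > 0) { v >>= 1; ++r; } return r; }

/* Mirrors uvg_get_isp_split_dim() for prediction blocks.
 * hor_split != 0 splits the block into rows. */
static int isp_split_dim(int w, int h, int hor_split)
{
  const int split_dim = hor_split ? h : w;
  const int other_dim = hor_split ? w : h;
  /* Every partition must keep at least 16 samples. */
  const int factor = other_dim < 16 ? 16 >> floor_log2(other_dim) : 1;
  int part = split_dim >> 2;            /* Aim for four partitions. */
  if (part < factor) part = factor;
  if (!hor_split && part < 4) part = 4; /* nPbW = Max(4, nW). */
  return part;
}

int main(void)
{
  /* 8x8, horizontal ISP: partition height 2, i.e. four 8x2 partitions. */
  printf("%d\n", isp_split_dim(8, 8, 1)); /* 2 */
  /* 4x8, horizontal ISP: the 16-sample floor forces height 4, so only
   * 8/4 = 2 partitions -- "small blocks are split only twice". */
  printf("%d\n", isp_split_dim(4, 8, 1)); /* 4 */
  return 0;
}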
uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; @@ -1490,26 +1790,20 @@ static void intra_recon_tb_leaf( // Copy extra ref lines, including ref line 1 and top left corner. for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { - int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; - height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. - height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); + int ref_height = height * 2 + MAX_REF_LINE_IDX; + ref_height = MIN(ref_height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. + ref_height = MIN(ref_height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], &extra_refs[i * 128], - 1, height, + 1, ref_height, frame->rec->stride, 1); } } - uvg_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index); + + uvg_intra_build_reference(state, pu_loc, cu_loc, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index, isp_mode); uvg_pixel pred[32 * 32]; - - cu_loc_t loc = { - x, y, - width, height, - width, height, - }; - - uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu, tree_type); + uvg_intra_predict(state, &refs, cu_loc, pu_loc, color, pred, search_data, lcu); const int index = lcu_px.x + lcu_px.y * lcu_width; uvg_pixel *block = NULL; @@ -1529,12 +1823,13 @@ static void intra_recon_tb_leaf( default: break; } - uvg_pixels_blit(pred, block , width, width, width, lcu_width); + uvg_pixels_blit(pred, block , width, height, width, lcu_width); if(color != COLOR_Y && cfg->jccr) { - uvg_pixels_blit(pred, block2, width, width, width, lcu_width); + uvg_pixels_blit(pred, block2, width, height, width, lcu_width); } } + /** * \brief Reconstruct an intra CU * @@ -1552,79 +1847,219 @@ static void intra_recon_tb_leaf( */ void uvg_intra_recon_cu( encoder_state_t* const state, - int x, - int y, - int depth, intra_search_data_t* search_data, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t *lcu, enum uvg_tree_type tree_type, bool recon_luma, bool recon_chroma) { - const vector2d_t lcu_px = { SUB_SCU(x) >> (tree_type == UVG_CHROMA_T), SUB_SCU(y) >> (tree_type == UVG_CHROMA_T) }; - const int8_t width = LCU_WIDTH >> depth; + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + const vector2d_t lcu_px = { + cu_loc->local_x, + cu_loc->local_y, + }; + const int8_t width = cu_loc->width; + const int8_t height = cu_loc->height; if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } - if(!recon_luma && recon_chroma) { - x &= ~7; - y &= ~7; - } - + // Reset CBFs because CBFs might have been set // for depth earlier if (recon_luma) { - cbf_clear(&cur_cu->cbf, depth, COLOR_Y); + cbf_clear(&cur_cu->cbf, COLOR_Y); } if (recon_chroma) { - cbf_clear(&cur_cu->cbf, depth, COLOR_U); - cbf_clear(&cur_cu->cbf, depth, COLOR_V); + cbf_clear(&cur_cu->cbf, COLOR_U); + cbf_clear(&cur_cu->cbf, COLOR_V); } - if (depth == 0 || cur_cu->tr_depth > depth) { - - const int offset = width / 2; - const int32_t x2 = x + offset; - const int32_t y2 = y + offset; - - uvg_intra_recon_cu(state, x, y, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x2, y, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - 
uvg_intra_recon_cu(state, x, y2, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - uvg_intra_recon_cu(state, x2, y2, depth + 1, search_data, NULL, lcu, tree_type, recon_luma, recon_chroma); - - // Propagate coded block flags from child CUs to parent CU. - uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), lcu_px.y >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, - LCU_GET_CU_AT_PX(lcu, (lcu_px.x + offset) >> (tree_type == UVG_CHROMA_T), (lcu_px.y + offset) >> (tree_type == UVG_CHROMA_T))->cbf, - }; - - if (recon_luma && depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); + if (width > TR_MAX_WIDTH || height > TR_MAX_WIDTH) { + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } - if (recon_chroma && depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); - cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; } - } else { - const bool has_luma = recon_luma; - const bool has_chroma = recon_chroma && (x % 8 == 0 && y % 8 == 0); - - // Process a leaf TU. - if (has_luma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data, tree_type); - } - if (has_chroma) { - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data, tree_type); - intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data, tree_type); + else { + split = BT_HOR_SPLIT; } - uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), - search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, - x, y, depth, cur_cu, lcu, - false, - tree_type); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + uvg_intra_recon_cu( + state, search_data, &split_cu_loc[i], + NULL, lcu, + state->encoder_control->cfg.dual_tree && state->frame->slicetype == UVG_SLICE_I ? tree_type : UVG_BOTH_T, + recon_luma, recon_chroma); + } + + return; } + if (search_data->pred_cu.intra.isp_mode != ISP_MODE_NO_ISP && recon_luma ) { + search_data->best_isp_cbfs = 0; + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. 
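A note on the control flow above: the removed depth-based quad-tree recursion is replaced by an implicit split that fires only when the CU is wider or taller than the maximum transform size, choosing the narrowest split that restores legal transform dimensions. A sketch of that selection (enum and function names are illustrative; max_tr stands in for TR_MAX_WIDTH):

/* Which implicit split uvg_intra_recon_cu() applies to an oversized CU. */
enum implicit_split { SPLIT_QT, SPLIT_BT_VER, SPLIT_BT_HOR, SPLIT_NONE };

static enum implicit_split implicit_tr_split(int w, int h, int max_tr)
{
  if (w > max_tr && h > max_tr) return SPLIT_QT;     /* Oversized both ways: quarter it. */
  if (w > max_tr)               return SPLIT_BT_VER; /* Too wide: halve the width. */
  if (h > max_tr)               return SPLIT_BT_HOR; /* Too tall: halve the height. */
  return SPLIT_NONE;                                 /* Fits: reconstruct directly. */
}

For example, with a 32-sample maximum transform, a 64x32 CU takes SPLIT_BT_VER into two 32x32 halves, each of which then fits a single transform.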
+ int split_type = search_data->pred_cu.intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + state->quant_blocks[1].needs_init = true; + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t tu_loc; + uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + cu_loc_t pu_loc; + uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); + cur_cu->intra.isp_index = 0; + if(tu_loc.x % 4 == 0) { + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); + } + state->rate_estimator[3].needs_init = true; + uvg_quantize_lcu_residual(state, true, false, false, + &tu_loc, cur_cu, lcu, + false, tree_type); + search_data->best_isp_cbfs |= cbf_is_set(cur_cu->cbf, COLOR_Y) << i; + cur_cu->intra.isp_cbfs = search_data->best_isp_cbfs; + } + } + const bool has_luma = recon_luma && search_data->pred_cu.intra.isp_mode == ISP_MODE_NO_ISP; + const bool has_chroma = recon_chroma; + + // Process a leaf TU. + if (has_luma) { + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_Y, search_data); + } + if (has_chroma) { + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_U, search_data); + intra_recon_tb_leaf(state, cu_loc, cu_loc, lcu, COLOR_V, search_data); + } + + // TODO: not necessary to call if only luma and ISP is on + uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3), + search_data->pred_cu.joint_cb_cr & 3 && state->encoder_control->cfg.jccr && has_chroma, + cu_loc, cur_cu, lcu, + false, + tree_type); } + + +/** +* \brief Check if ISP can be used for block size. +* +* \return True if isp can be used. +* \param width Block width. +* \param height Block height. +*/ +bool uvg_can_use_isp(const int width, const int height) +{ + assert(!(width > LCU_WIDTH || height > LCU_WIDTH) && "Block size larger than max LCU size."); + assert(!(width < TR_MIN_WIDTH || height < TR_MIN_WIDTH) && "Block size smaller than min TR_WIDTH."); + + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + // Each split block must have at least 16 samples. + bool not_enough_samples = (log2_width + log2_height <= 4); + bool cu_size_larger_than_max_tr_size = width > TR_MAX_WIDTH || height > TR_MAX_WIDTH; + if (not_enough_samples || cu_size_larger_than_max_tr_size) { + return false; + } + return true; +} + + +/** +* \brief Check if given ISP mode can be used with LFNST. +* +* \return True if the given ISP mode can be used with LFNST. +* \param width Block width. +* \param height Block height. +* \param isp_split_type ISP split type. +* \param tree_type Tree type. Dual, luma or chroma tree. +*/ +bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_split_type, const enum uvg_tree_type tree_type) +{ + if (tree_type == UVG_CHROMA_T) { + return false; + } + if (isp_split_type == ISP_MODE_NO_ISP) { + return true; + } + + const int tu_width = (isp_split_type == ISP_MODE_HOR) ? width : uvg_get_isp_split_dim(width, height, SPLIT_TYPE_VER, true); + const int tu_height = (isp_split_type == ISP_MODE_HOR) ?
uvg_get_isp_split_dim(width, height, SPLIT_TYPE_HOR, true) : height; + + if (!(tu_width >= TR_MIN_WIDTH && tu_height >= TR_MIN_WIDTH)) + { + return false; + } + return true; +} + + +double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu, bool* violates_lfnst) { + assert(state->search_cabac.update && "ISP reconstruction must be done with CABAC update"); + double cost = 0; + + const int width = cu_loc->width; + const int height = cu_loc->height; + + search_data->best_isp_cbfs = 0; + search_data->pred_cu.intra.isp_cbfs = 0; + // ISP split is done horizontally or vertically depending on ISP mode, 2 or 4 times depending on block dimensions. + // Small blocks are split only twice. + int split_type = search_data->pred_cu.intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + int cbf_context = 2; + state->quant_blocks[1].needs_init = true; + + for (int i = 0; i < split_limit; ++i) { + search_data->pred_cu.intra.isp_index = i; + cu_loc_t tu_loc; + uvg_get_isp_split_loc(&tu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + cu_loc_t pu_loc; + uvg_get_isp_split_loc(&pu_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, false); + if (tu_loc.x % 4 == 0) { + intra_recon_tb_leaf(state, &pu_loc, cu_loc, lcu, COLOR_Y, search_data); + } + + state->rate_estimator[3].needs_init = true; + uvg_quantize_lcu_residual(state, true, false, false, + &tu_loc, &search_data->pred_cu, lcu, + false, UVG_LUMA_T); + + int index = tu_loc.local_y * LCU_WIDTH + tu_loc.local_x; + int ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + tu_loc.width, tu_loc.height); + double coeff_bits = uvg_get_coeff_cost(state, lcu->coeff.y, &search_data->pred_cu, &tu_loc, 0, SCAN_DIAG, false, COEFF_ORDER_CU); + + + int cbf = cbf_is_set(search_data->pred_cu.cbf, COLOR_Y); + if (i + 1 != split_limit || search_data->best_isp_cbfs != 0) { + CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_luma[cbf_context], cbf, coeff_bits, "cbf_luma_isp_recon"); + } + cost += ssd + coeff_bits * state->lambda; + + cbf_context = 2 + cbf; + if(violates_lfnst) *violates_lfnst |= search_data->pred_cu.violates_lfnst_constrained_luma; + search_data->pred_cu.violates_lfnst_constrained_luma = false; + + search_data->best_isp_cbfs |= cbf << i; + search_data->pred_cu.intra.isp_cbfs = search_data->best_isp_cbfs; + + } + search_data->pred_cu.intra.isp_index = 0; + return cost; +} \ No newline at end of file diff --git a/src/intra.h b/src/intra.h index a2ffa230..c15b182a 100644 --- a/src/intra.h +++ b/src/intra.h @@ -71,6 +71,7 @@ typedef struct { double coeff_bits; double distortion; double lfnst_costs[3]; + uint8_t best_isp_cbfs; } intra_search_data_t ; @@ -107,7 +108,9 @@ int8_t uvg_intra_get_dir_luma_predictor( * \param multi_ref_idx Multi reference line index for the prediction block. */ void uvg_intra_build_reference( - const int_fast8_t log2_width, + const encoder_state_t* const state, + const cu_loc_t* const pu_loc, + const cu_loc_t* const cu_loc, const color_t color, const vector2d_t *const luma_px, const vector2d_t *const pic_px, @@ -115,7 +118,8 @@ void uvg_intra_build_reference( uvg_intra_references *const refs, bool entropy_sync, uvg_pixel *extra_refs, - uint8_t multi_ref_idx); + uint8_t multi_ref_idx, + const uint8_t isp_mode); /** * \brief Generate intra predictions. 
@@ -130,32 +134,60 @@ void uvg_intra_predict( const encoder_state_t* const state, uvg_intra_references* const refs, const cu_loc_t* const cu_loc, + const cu_loc_t* const pu_loc, const color_t color, uvg_pixel* dst, const intra_search_data_t* data, - const lcu_t* lcu, - enum uvg_tree_type tree_type - ); + const lcu_t* lcu +); void uvg_intra_recon_cu( encoder_state_t* const state, - int x, - int y, - int depth, intra_search_data_t* search_data, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t *lcu, enum uvg_tree_type tree_type, bool recon_luma, bool recon_chroma); -const cu_info_t* uvg_get_co_located_luma_cu( - int x, - int y, - int width, - int height, +double uvg_recon_and_estimate_cost_isp(encoder_state_t* const state, + const cu_loc_t* const cu_loc, + double cost_treshold, + intra_search_data_t* const search_data, + lcu_t* const lcu, bool* violates_lfnst); + +int8_t uvg_get_co_located_luma_mode( + const cu_loc_t* const chroma_loc, + const cu_loc_t* const cu_loc, + const cu_info_t* luma_cu, const lcu_t* const lcu, const cu_array_t* const cu_array, enum uvg_tree_type tree_type); +bool uvg_cclm_is_allowed(const encoder_state_t* const state, const cu_loc_t* const luma_loc, cu_info_t const* const cur_cu, enum + uvg_tree_type tree_type); -int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a); +uint8_t uvg_get_mip_flag_context( + const cu_loc_t* const cu_loc, + const lcu_t* lcu, + cu_array_t* const cu_a); + +int8_t uvg_wide_angle_correction( + int_fast8_t mode, + const int log2_width, + const int log2_height, + const bool account_for_dc_planar); + +// ISP related defines +#define NUM_ISP_MODES 3 +#define ISP_MODE_NO_ISP 0 +#define ISP_MODE_HOR 1 +#define ISP_MODE_VER 2 +#define SPLIT_TYPE_HOR 1 +#define SPLIT_TYPE_VER 2 + +int uvg_get_isp_split_dim(const int width, const int height, const int split_type, const bool is_transform_block); +int uvg_get_isp_split_num(const int width, const int height, const int split_type, const bool is_transform_block); +void uvg_get_isp_split_loc(cu_loc_t *loc, const int x, const int y, const int block_w, const int block_h, int split_idx, const int split_type, const bool is_transform_block); +bool uvg_can_use_isp(const int width, const int height); +bool uvg_can_use_isp_with_lfnst(const int width, const int height, const int isp_mode, const enum uvg_tree_type tree_type); diff --git a/src/rate_control.c b/src/rate_control.c index 67570565..3dfa35fe 100644 --- a/src/rate_control.c +++ b/src/rate_control.c @@ -795,12 +795,20 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) state->frame->QP + 2 + frame_allocation, est_qp); } + if(state->encoder_control->cfg.dep_quant) { + est_lambda *= pow(2, 0.25 / 3.0); + } state->lambda = est_lambda; state->lambda_sqrt = sqrt(est_lambda); state->qp = est_qp; int8_t chroma_qp = encoder->qp_map[0][est_qp]; double tmpWeight = pow(2.0, (est_qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) + { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? 
pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight; state->c_lambda = est_lambda / tmpWeight; ctu->qp = est_qp; ctu->lambda = est_lambda; @@ -820,7 +828,11 @@ static double qp_to_lambda(encoder_state_t* const state, int qp) // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); ctu->adjust_lambda = state->lambda; @@ -1103,7 +1115,12 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, pos.x = 0; } state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { @@ -1138,6 +1155,9 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, state->frame->lambda * 1.5874010519681994, lambda); lambda = clip_lambda(lambda); + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } state->lambda = lambda; state->lambda_sqrt = sqrt(lambda); @@ -1145,8 +1165,13 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, } else { state->qp = state->frame->QP; - state->lambda = state->frame->lambda; - state->lambda_sqrt = sqrt(state->frame->lambda); + double lambda = state->frame->lambda; + + if (state->encoder_control->cfg.dep_quant) { + lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = lambda; + state->lambda_sqrt = sqrt(lambda); } lcu->lambda = state->lambda; @@ -1154,6 +1179,11 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, int8_t chroma_qp = ctrl->qp_map[0][state->qp]; double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) + { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? 
pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + state->chroma_weights[1] = state->chroma_weights[2] = state->chroma_weights[3] = tmpWeight; state->c_lambda = state->lambda / tmpWeight; // Apply variance adaptive quantization @@ -1170,10 +1200,34 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state, // Since this value will be later combined with qp_pred, clip to half of that instead to be safe state->qp = CLIP(state->frame->QP + UVG_QP_DELTA_MIN / 2, state->frame->QP + UVG_QP_DELTA_MAX / 2, state->qp); state->qp = CLIP_TO_QP(state->qp); - state->lambda = qp_to_lambda(state, state->qp); + double to_lambda = qp_to_lambda(state, state->qp); + if (state->encoder_control->cfg.dep_quant) { + to_lambda *= pow(2, 0.25 / 3.0); + } + state->lambda = to_lambda; state->lambda_sqrt = sqrt(state->lambda); lcu->adjust_lambda = state->lambda; lcu->adjust_qp = state->qp; } } + + +double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode) +{ + const encoder_control_t * const ctrl = state->encoder_control; + double lambda = state->lambda; + int8_t chroma_qp = ctrl->qp_map[0][state->qp]; + double tmpWeight = pow(2.0, (state->qp - chroma_qp) / 3.0); + if (state->encoder_control->cfg.dep_quant) { + tmpWeight *= (state->encoder_control->cfg.gop_len >= 8 ? pow(2.0, 0.1 / 3.0) : pow(2.0, 0.2 / 3.0)); // increase chroma weight for dependent quantization (in order to reduce bit rate shift from chroma to luma) + } + lambda /= tmpWeight; + lambda *= use_jccr && state->qp > 18 ? 1.3 : 1.0; + if (jccr_mode == 1 || jccr_mode == 2) { + lambda *= 0.8; + } else if (jccr_mode == 3) { + lambda *= 0.5; + } + return lambda; +} \ No newline at end of file diff --git a/src/rate_control.h b/src/rate_control.h index f397e2a2..644d7fc4 100644 --- a/src/rate_control.h +++ b/src/rate_control.h @@ -76,4 +76,6 @@ void uvg_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos); void uvg_update_after_picture(encoder_state_t * const state); void uvg_estimate_pic_lambda(encoder_state_t * const state); +double uvg_calculate_chroma_lambda(encoder_state_t *state, bool use_jccr, int jccr_mode); + #endif // RATE_CONTROL_H_ diff --git a/src/rdo.c b/src/rdo.c index f8ebacdf..c5d1c71b 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -33,6 +33,7 @@ #include "rdo.h" #include +#include #include #include #include @@ -52,7 +53,6 @@ #include "strategies/strategies-quant.h" -#define QUANT_SHIFT 14 #define SCAN_SET_SIZE 16 #define LOG2_SCAN_SET_SIZE 4 #define SBH_THRESHOLD 4 @@ -297,15 +297,20 @@ out: static INLINE double get_coeff_cabac_cost( const encoder_state_t * const state, const coeff_t *coeff, - int32_t width, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, int8_t tr_skip, cu_info_t* cur_tu) { + const int width = cu_loc->width; + const int height = cu_loc->height; + const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int sub_coeff_h = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + // Make sure there are coeffs present bool found = false; - for (int i = 0; i < width*width; i++) { + for (int i = 0; i < sub_coeff_w * sub_coeff_h; i++) { if (coeff[i] != 0) { found = 1; break; @@ -330,7 +335,7 @@ static INLINE double get_coeff_cabac_cost( uvg_encode_coeff_nxn((encoder_state_t*) state, &cabac_copy, coeff, - width, + cu_loc, color, scan_mode, cur_tu, @@ -341,6 +346,7 @@ static INLINE double get_coeff_cabac_cost( &cabac_copy, coeff, width, + height, color, scan_mode, &bits); @@ -391,14 +397,36 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - int32_t width, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, - int8_t tr_skip) + int8_t tr_skip, + int coeff_order) { uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on; uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + int x_local = cu_loc->x % LCU_WIDTH; + int y_local = cu_loc->y % LCU_WIDTH; + const int sub_coeff_w = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int sub_coeff_h = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int lcu_width = color == COLOR_Y ? LCU_WIDTH : LCU_WIDTH_C; + + + const coeff_t* coeff_ptr = NULL; + coeff_t sub_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + + if (coeff_order == COEFF_ORDER_LINEAR) { + coeff_ptr = coeff; + } + else { + // Coeff order CU + uvg_get_sub_coeff(sub_coeff, coeff, x_local, y_local, sub_coeff_w, sub_coeff_h, lcu_width); + coeff_ptr = sub_coeff; + } + if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit && state->qp < MAX_FAST_COEFF_COST_QP && !tr_skip) { // TODO: do we need to assert(0) out of the fast-estimation branch if we @@ -409,17 +437,17 @@ double uvg_get_coeff_cost( return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0) } else { uint64_t weights = uvg_fast_coeff_get_weights(state); - uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights); + uint32_t fast_cost = uvg_fast_coeff_cost(coeff_ptr, width, height, weights); if (check_accuracy) { - double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); save_accuracy(state->qp, ccc, fast_cost); } return fast_cost; } } else { - double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu); + double ccc = get_coeff_cabac_cost(state, coeff_ptr, cu_loc, color, scan_mode, tr_skip, cur_tu); if (save_cccs) { - save_ccc(state->qp, coeff, width * width, ccc); + save_ccc(state->qp, coeff, width * height, ccc); } return ccc; } @@ -677,19 +705,20 @@ static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t * tables generated during RDOQ to select the best coefficient to change. 
*/ void uvg_rdoq_sign_hiding( - const encoder_state_t *const state, - const int32_t qp_scaled, - const uint32_t *const scan2raster, - const struct sh_rates_t *const sh_rates, - const int32_t last_pos, - const coeff_t *const coeffs, - coeff_t *const quant_coeffs, - const int8_t color) + const encoder_state_t *const state, + const int32_t qp_scaled, + const uint32_t *const scan2raster, + const struct sh_rates_t *const sh_rates, + const int32_t last_pos, + const coeff_t *const coeffs, + coeff_t *const quant_coeffs, + const int8_t color, + const bool need_sqrt_adjust) { const encoder_control_t * const ctrl = state->encoder_control; const double lambda = color ? state->c_lambda : state->lambda; - int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6]; + int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6]; // This somehow scales quant_delta into fractional bits. Instead of the bits // being multiplied by lambda, the residual is divided by it, or something // like that. @@ -814,28 +843,28 @@ void uvg_rdoq_sign_hiding( } } -static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height) +static unsigned templateAbsSum(const coeff_t* coeff, int baseLevel, uint32_t posX, uint32_t posY, uint32_t width, uint32_t height, uint8_t mts_index) { const coeff_t* pData = coeff + posX + posY * width; coeff_t sum = 0; if (posX < width - 1) { - sum += abs(pData[1]); + sum += mts_index && posX + 1 >= 16 ? 0 : abs(pData[1]); if (posX < width - 2) { - sum += abs(pData[2]); + sum += mts_index && posX + 2 >= 16 ? 0 : abs(pData[2]); } if (posY < height - 1) { - sum += abs(pData[width + 1]); + sum += mts_index && (posY + 1 >= 16 || posX + 1 >= 16) ? 0 : abs(pData[width + 1]); } } if (posY < height - 1) { - sum += abs(pData[width]); + sum += mts_index && posY + 1 >= 16 ? 0 : abs(pData[width]); if (posY < height - 2) { - sum += abs(pData[width << 1]); + sum += mts_index && posY + 2 >= 16 ? 
0 : abs(pData[width << 1]); } } return MAX(MIN(sum - 5 * baseLevel, 31), 0); @@ -1141,7 +1170,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const int max_log2_tr_dynamic_range = 15; uint32_t log2_tr_width = uvg_math_floor_log2(width); uint32_t log2_tr_height = uvg_math_floor_log2(height); - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; const uint32_t log2_cg_width = g_log2_sbb_size[log2_tr_width][log2_tr_height][0]; const uint32_t log2_cg_height = g_log2_sbb_size[log2_tr_width][log2_tr_height][1]; @@ -1166,15 +1196,18 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ switch (cg_num) { case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); FILL_ARRAY(cost_coeffgroup_sig, 0, 1); break; + case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); FILL_ARRAY(cost_coeffgroup_sig, 0, 2); break; case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); FILL_ARRAY(cost_coeffgroup_sig, 0, 4); break; + case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); FILL_ARRAY(cost_coeffgroup_sig, 0, 8); break; case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); FILL_ARRAY(cost_coeffgroup_sig, 0, 16); break; + case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); FILL_ARRAY(cost_coeffgroup_sig, 0, 32); break; case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); FILL_ARRAY(cost_coeffgroup_sig, 0, 64); break; default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); } const bool needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation. const int q_bits = QUANT_SHIFT + qp_scaled / 6 + (needs_sqrt2_scale ? -1 : 0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits - const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6]; + const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6]; const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff; @@ -1182,8 +1215,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ const coeff_t entropy_coding_maximum = (1 << max_log2_tr_dynamic_range) - 1; - const uint32_t* scan = uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t* scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); uint32_t coeff_levels[3]; double coeff_level_error[4]; @@ -1221,8 +1254,8 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ scan_pos = (sbId << log2_cg_size) + scan_pos_in_sb; int last_pos_coded = sbSizeM1; uint32_t blkpos = scan[scan_pos]; - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - (pos_y << log2_block_size); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - (pos_y << log2_block_width); //===== quantization ===== // set coeff @@ -1365,6 +1398,48 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_ return abs_sum; } + +static uint32_t context_get_sig_ctx_idx_abs(const coeff_t* coeff, uint32_t pos_x, uint32_t pos_y, + uint32_t width, uint32_t height, int8_t color, + int32_t* temp_diag, int32_t* temp_sum, int8_t mts) +{ + const coeff_t* data = 
coeff + pos_x + pos_y * width; + const int diag = pos_x + pos_y; + int num_pos = 0; + int sum_abs = 0; +#define UPDATE(x) {int a=abs(x);sum_abs+=MIN(4+(a&1),a);num_pos+=(a?1:0);} + if (pos_x < width - 1) + { + UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[1]); + if (pos_x < width - 2) + { + UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[2]); + } + if (pos_y < height - 1) + { + UPDATE(mts && (pos_y + 1 >= 16 || pos_x + 1 >= 16) ? 0 : data[width + 1]); + } + } + if (pos_y < height - 1) + { + UPDATE(mts && pos_x + 1 >= 16 ? 0 : data[width]); + if (pos_y < height - 2) + { + UPDATE(mts && pos_x + 2 >= 16 ? 0 : data[width << 1]); + } + } +#undef UPDATE + int ctx_ofs = MIN((sum_abs + 1) >> 1, 3) + (diag < 2 ? 4 : 0); + if (color == COLOR_Y) + { + ctx_ofs += diag < 5 ? 4 : 0; + } + + *temp_diag = diag; + *temp_sum = sum_abs - num_pos; + return ctx_ofs; +} + /** RDOQ with CABAC * \returns void * Rate distortion optimized quantization for entropy @@ -1377,31 +1452,35 @@ void uvg_rdoq( coeff_t *dest_coeff, int32_t width, int32_t height, - int8_t type, + int8_t color, int8_t scan_mode, int8_t block_type, - int8_t tr_depth, uint16_t cbf, - uint8_t lfnst_idx) + uint8_t lfnst_idx, uint8_t mts_idx) { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; - uint32_t log2_tr_width = uvg_math_floor_log2( height ); - uint32_t log2_tr_height = uvg_math_floor_log2( width ); - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); // Represents scaling through forward transform + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1); // Represents scaling through forward transform uint16_t go_rice_param = 0; uint32_t reg_bins = (width * height * 28) >> 4; - const uint32_t log2_block_size = uvg_g_convert_to_bit[ width ] + 2; - int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + type; - - int32_t qp_scaled = uvg_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); - int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; + int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + color; - const double lambda = type ? state->c_lambda : state->lambda; + int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); + + int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift - needs_block_size_trafo_scale; - const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; - const double *err_scale = encoder->scaling_list.error_scale[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; + const double lambda = color ? 
state->c_lambda : state->lambda; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF; + + const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; + const double *err_scale = encoder->scaling_list.error_scale[log2_block_width][log2_block_height][scalinglist_type][qp_scaled%6]; double block_uncoded_cost = 0; @@ -1415,14 +1494,19 @@ void uvg_rdoq( memset(dest_coeff, 0, sizeof(coeff_t) * width * height); - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; + const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; - const uint32_t cg_width = (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); + const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); + const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); + + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; const uint32_t cg_size = 16; const int32_t shift = 4 >> 1; - const uint32_t num_blk_side = width >> shift; + const uint32_t num_blk_side = MAX(width >> shift, 1); double cost_coeffgroup_sig[ 64 ]; uint32_t sig_coeffgroup_flag[ 64 ]; @@ -1431,26 +1515,34 @@ void uvg_rdoq( int32_t temp_diag = -1; int32_t temp_sum = -1; - const uint32_t *scan = uvg_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; - int32_t cg_last_scanpos = -1; int32_t last_scanpos = -1; - uint32_t cg_num = width * height >> 4; + uint32_t cg_num = lfnst_idx > 0 ? 1 : width * height >> 4; + + double dTransShift = (double)transform_shift + (needs_block_size_trafo_scale ? -0.5 : 0.0); + // Compensate for scaling of bitcount in Lagrange cost function + double scale = CTX_FRAC_ONE_BIT; + // Compensate for scaling through forward transform + scale = scale * pow(2.0, -2.0 * dTransShift); + const double default_error_scale = scale / default_quant_coeff / default_quant_coeff; // Explicitly tell the only possible numbers of elements to be zeroed. // Hope the compiler is able to utilize this information. switch (cg_num) { case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); break; + case 2: FILL_ARRAY(sig_coeffgroup_flag, 0, 2); break; case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); break; + case 8: FILL_ARRAY(sig_coeffgroup_flag, 0, 8); break; case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break; + case 32: FILL_ARRAY(sig_coeffgroup_flag, 0, 32); break; case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break; - default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); + default: assert(0 && "There should be 1, 2, 4, 8, 16, 32 or 64 coefficient groups"); } - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[type ? 2 : 0]); - cabac_ctx_t *baseCtx = (type == 0) ? 
&(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]); - cabac_ctx_t* base_gt1_ctx = (type == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]); + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.sig_coeff_group_model[color ? 2 : 0]); + cabac_ctx_t *baseCtx = (color == 0) ? &(cabac->ctx.cu_sig_model_luma[0][0]) : &(cabac->ctx.cu_sig_model_chroma[0][0]); + cabac_ctx_t* base_gt1_ctx = (color == 0) ? &(cabac->ctx.cu_gtx_flag_model_luma[1][0]) : &(cabac->ctx.cu_gtx_flag_model_chroma[1][0]); struct { double coded_level_and_dist; @@ -1462,22 +1554,27 @@ void uvg_rdoq( //Find last cg and last scanpos const int max_lfnst_pos = ((height == 4 && width == 4) || (height == 8 && width == 8)) ? 7 : 15; - int32_t cg_scanpos; + int32_t cg_scanpos; + uint32_t max_scan_group_size = lfnst_idx > 0 ? max_lfnst_pos : cg_size - 1; for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) { - for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) + uint32_t cg_blkpos = scan_cg[cg_scanpos]; + uint32_t cg_pos_y = cg_blkpos / num_blk_side; + uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); + if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue; + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; - if (lfnst_idx > 0 && scanpos > max_lfnst_pos) break; + uint32_t blkpos = scan[scanpos]; - int32_t q = quant_coeff[blkpos]; + int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff; int32_t level_double = coef[blkpos]; level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; double err = (double)level_double; - cost_coeff0[scanpos] = err * err * err_scale[blkpos]; + cost_coeff0[scanpos] = err * err * (use_scaling_list ? err_scale[blkpos] : default_error_scale); dest_coeff[blkpos] = max_abs_level; if (max_abs_level > 0) { @@ -1507,43 +1604,45 @@ void uvg_rdoq( uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); FILL(rd_stats, 0); - for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + if (mts_idx != 0 && (cg_pos_y >= 4 || cg_pos_x >= 4)) continue; + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) { continue; } uint32_t blkpos = scan[scanpos]; - int32_t q = quant_coeff[blkpos]; - double temp = err_scale[blkpos]; + int32_t q = use_scaling_list ? quant_coeff[blkpos] : default_quant_coeff; + double temp = (use_scaling_list ? err_scale[blkpos] : default_error_scale); int32_t level_double = coef[blkpos]; level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1))); uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; dest_coeff[blkpos] = max_abs_level; double err = (double)level_double; - cost_coeff0[scanpos] = err * err * err_scale[blkpos]; + cost_coeff0[scanpos] = err * err * (use_scaling_list ? 
err_scale[blkpos] : default_error_scale); block_uncoded_cost += cost_coeff0[ scanpos ]; if (last_scanpos >= 0) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - (pos_y << log2_block_size); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - (pos_y << log2_block_width); //===== coefficient level estimation ===== int32_t level; uint16_t ctx_sig = 0; if (scanpos != last_scanpos) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, type, &temp_diag, &temp_sum); + // VVC document 9.3.4.2.8, context for sig_coeff_flag calculated here + ctx_sig = context_get_sig_ctx_idx_abs(dest_coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum, mts_idx); } if (temp_diag != -1) { - ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((type == 0) ? 15 : 5) : (type == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0); + ctx_set = (MIN(temp_sum, 4) + 1) + (!temp_diag ? ((color == 0) ? 15 : 5) : (color == 0) ? temp_diag < 3 ? 10 : (temp_diag < 10 ? 5 : 0) : 0); } else ctx_set = 0; if (reg_bins < 4) { - int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height); + int sumAll = templateAbsSum(dest_coeff, 0, pos_x, pos_y, width, height,mts_idx); go_rice_param = g_auiGoRiceParsCoeff[sumAll]; } @@ -1554,12 +1653,12 @@ void uvg_rdoq( if (scanpos == last_scanpos) { level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos], level_double, max_abs_level, 0, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, - reg_bins, q_bits, temp, 1, type); + reg_bins, q_bits, temp, 1, color); } else { level = uvg_get_coded_level(state, &cost_coeff[scanpos], &cost_coeff0[scanpos], &cost_sig[scanpos], level_double, max_abs_level, ctx_sig, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, - reg_bins, q_bits, temp, 0, type); + reg_bins, q_bits, temp, 0, color); if (encoder->cfg.signhide_enable) { int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1); int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0); @@ -1572,14 +1671,14 @@ void uvg_rdoq( if (encoder->cfg.signhide_enable) { sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8); if (level > 0) { - int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false); - sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; - sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; + int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false); + sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; + sh_rates.dec[blkpos] = uvg_get_ic_rate(state, level - 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; } else { // level == 0 if (reg_bins < 4) { - int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false); - sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, type, false) - rate_now; + int32_t rate_now = uvg_get_ic_rate(state, level, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false); + sh_rates.inc[blkpos] = uvg_get_ic_rate(state, level + 1, gt1_ctx, gt2_ctx, par_ctx, go_rice_param, reg_bins, color, false) - rate_now; } else { 
sh_rates.inc[blkpos] = CTX_ENTROPY_BITS(&base_gt1_ctx[gt1_ctx], 0); @@ -1595,7 +1694,7 @@ void uvg_rdoq( } else if (reg_bins >= 4) { reg_bins -= (level < 2 ? level : 3) + (scanpos != last_scanpos); - int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height); + int sumAll = templateAbsSum(coef, 4, pos_x, pos_y, width, height, mts_idx); go_rice_param = g_auiGoRiceParsCoeff[sumAll]; } } @@ -1620,7 +1719,7 @@ void uvg_rdoq( if( cg_scanpos ) { if (sig_coeffgroup_flag[cg_blkpos] == 0) { uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, cg_width); + cg_pos_y, cg_width, cg_height); cost_coeffgroup_sig[cg_scanpos] = lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); base_cost += cost_coeffgroup_sig[cg_scanpos] - rd_stats.sig_cost; } else { @@ -1636,7 +1735,7 @@ void uvg_rdoq( // add SigCoeffGroupFlag cost to total cost ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, cg_width); + cg_pos_y, cg_width, cg_height); cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1); base_cost += cost_coeffgroup_sig[cg_scanpos]; @@ -1656,7 +1755,7 @@ void uvg_rdoq( cost_coeffgroup_sig[cg_scanpos] = lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0); // reset coeffs to 0 in this block - for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for (int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; uint32_t blkpos = scan[scanpos]; if (dest_coeff[blkpos]){ @@ -1679,12 +1778,12 @@ void uvg_rdoq( int8_t found_last = 0; int32_t best_last_idx_p1 = 0; - if( block_type != CU_INTRA && !type ) { + if( block_type != CU_INTRA && !color ) { best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0); base_cost += lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1); } else { cabac_ctx_t* base_cbf_model = NULL; - switch (type) { + switch (color) { case COLOR_Y: base_cbf_model = cabac->ctx.qt_cbf_model_luma; break; @@ -1697,25 +1796,26 @@ void uvg_rdoq( default: assert(0); } - ctx_cbf = ( type != COLOR_V ? 0 : cbf_is_set(cbf, 5 - uvg_math_floor_log2(width), COLOR_U)); + // This cbf should work even with non-square blocks + ctx_cbf = ( color != COLOR_V ? 
0 : cbf_is_set(cbf, COLOR_U)); best_cost = block_uncoded_cost + lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); base_cost += lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); } - calc_last_bits(state, width, height, type, last_x_bits, last_y_bits); + calc_last_bits(state, width, height, color, last_x_bits, last_y_bits); for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { uint32_t cg_blkpos = scan_cg[cg_scanpos]; base_cost -= cost_coeffgroup_sig[cg_scanpos]; if (sig_coeffgroup_flag[ cg_blkpos ]) { - for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + for ( int32_t scanpos_in_cg = max_scan_group_size; scanpos_in_cg >= 0; scanpos_in_cg--) { int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) continue; uint32_t blkpos = scan[scanpos]; if( dest_coeff[ blkpos ] ) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + uint32_t pos_y = blkpos >> log2_block_width; + uint32_t pos_x = blkpos - ( pos_y << log2_block_width ); double cost_last = get_rate_last(lambda, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; @@ -1739,11 +1839,23 @@ void uvg_rdoq( } // end for uint32_t abs_sum = 0; - for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { - int32_t blkPos = scan[scanpos]; - int32_t level = dest_coeff[blkPos]; - abs_sum += level; - dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); + if(!mts_idx || (width < 32 && height < 32)) { + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t level = dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level); + } + } + else { + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t blk_x = blkPos & (width - 1); + int32_t blk_y = blkPos >> log2_block_width; + int32_t level = blk_x >= 16 || blk_y >= 16 ? 0 : dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( level < 0 ) ? 
-level : level); + } } //===== clean uncoded coefficients ===== for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) { @@ -1751,7 +1863,7 @@ void uvg_rdoq( } if (encoder->cfg.signhide_enable && abs_sum >= 2) { - uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, type); + uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale); } } diff --git a/src/rdo.h b/src/rdo.h index 7f325cfd..2ba0c2a9 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -44,6 +44,8 @@ #include "global.h" // IWYU pragma: keep #include "search_inter.h" +#define QUANT_SHIFT 14 +#define IQUANT_SHIFT 6 extern const uint32_t uvg_g_go_rice_range[5]; extern const uint32_t uvg_g_go_rice_prefix_len[5]; @@ -60,9 +62,8 @@ void uvg_rdoq( int8_t type, int8_t scan_mode, int8_t block_type, - int8_t tr_depth, uint16_t cbf, - uint8_t lfnst_idx); + uint8_t lfnst_idx, uint8_t mts_idx); int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_coeff, int32_t width, @@ -73,10 +74,11 @@ double uvg_get_coeff_cost( const encoder_state_t * const state, const coeff_t *coeff, cu_info_t* cur_tu, - int32_t width, + const cu_loc_t* const cu_loc, color_t color, int8_t scan_mode, - int8_t tr_skip); + int8_t tr_skip, + int coeff_order); int32_t uvg_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_gt1, uint16_t ctx_num_gt2, uint16_t ctx_num_par, uint16_t abs_go_rice, uint32_t reg_bins, int8_t type, int use_limited_prefix_length); diff --git a/src/scalinglist.c b/src/scalinglist.c index 5c32ac4c..01edfa27 100644 --- a/src/scalinglist.c +++ b/src/scalinglist.c @@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] = 24, 25, 28, 33, 41, 54, 71, 91 }; -const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564}; -const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72}; +const int16_t uvg_g_quant_scales[2][6] = { + {26214, 23302, 20560, 18396, 16384, 14564}, + { 18396,16384,14564,13107,11651,10280 } +}; +const int16_t uvg_g_inv_quant_scales[2][6] = { + {40, 45, 51, 57, 64, 72}, + { 57,64,72,80,90,102 } +}; /** @@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp]; int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp]; - // Encoder list - uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio, + // Encoder list TODO: the sqrt adjusted lists + uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio, MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable); // Decoder list - scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio, + scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio, MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable); diff --git a/src/search.c b/src/search.c index cb9fc1d1..c353a914 100644 --- a/src/search.c +++ b/src/search.c @@ -36,11 +36,14 @@ #include #include "cabac.h" +#include "cu.h" #include "encoder.h" #include "encode_coding_tree.h" +#include "filter.h" #include "imagelist.h" #include "inter.h" #include "intra.h" +#include "rate_control.h" #include "uvg266.h" #include "rdo.h" #include "search_inter.h" @@ -62,92 +65,247 @@ static const int 
INTRA_THRESHOLD = 8; -static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to) +static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu_loc, enum uvg_tree_type + tree_type) { - for (int y = y_local; y < y_local + width; y += SCU_WIDTH) { - for (int x = x_local; x < x_local + width; x += SCU_WIDTH) { + const int y_limit = (cu_loc->local_y + cu_loc->height); + const int x_limit = (cu_loc->local_x + cu_loc->width); + for (int y = cu_loc->local_y ; y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x ; x < x_limit; x += SCU_WIDTH) { *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); } } } -static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, enum uvg_tree_type - tree_type) + +static INLINE void initialize_partial_work_tree( + const encoder_state_t* const state, + lcu_t* from, + lcu_t *to, + const cu_loc_t * const cu_loc, + const cu_loc_t* const + chroma_loc, + const enum uvg_tree_type tree_type) { + + const int y_limit = MIN(LCU_WIDTH, state->tile->frame->height - cu_loc->y / 64 * 64); + const int x_limit = MIN(LCU_WIDTH, state->tile->frame->width - cu_loc->x / 64 * 64); + + if (cu_loc->local_x == 0) { + to->left_ref = from->left_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + else { + if(tree_type != UVG_CHROMA_T) { + uvg_pixels_blit(from->rec.y, to->rec.y, cu_loc->local_x, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); + } + if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + uvg_pixels_blit(from->rec.u, to->rec.u, chroma_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(from->rec.v, to->rec.v, chroma_loc->local_x / 2, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + } + } + + if (cu_loc->local_y == 0) { + to->top_ref = from->top_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + else { + if (tree_type != UVG_CHROMA_T) { + uvg_pixels_blit(&from->rec.y[cu_loc->local_x], &to->rec.y[cu_loc->local_x], + LCU_WIDTH - cu_loc->local_x, cu_loc->local_y, + LCU_WIDTH, LCU_WIDTH); + } + if (tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + uvg_pixels_blit(&from->rec.u[chroma_loc->local_x / 2], &to->rec.u[chroma_loc->local_x / 2], + LCU_WIDTH_C - chroma_loc->local_x / 2, chroma_loc->local_y / 2, + LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->rec.v[chroma_loc->local_x / 2], &to->rec.v[chroma_loc->local_x / 2], + LCU_WIDTH_C - chroma_loc->local_x / 2, chroma_loc->local_y / 2, + LCU_WIDTH_C, LCU_WIDTH_C); + } + } + + if (tree_type == UVG_CHROMA_T) { + // These are needed for CCLM + uvg_pixels_blit(from->rec.y, to->rec.y, MIN(cu_loc->local_x + cu_loc->width * 2, LCU_WIDTH), MIN(cu_loc->local_y + cu_loc->height * 2, LCU_WIDTH), LCU_WIDTH, LCU_WIDTH); + } + + to->ref.chroma_format = from->ref.chroma_format; + to->rec.chroma_format = from->rec.chroma_format; + + if (tree_type != UVG_CHROMA_T) { + const int offset = cu_loc->local_x + cu_loc->local_y * LCU_WIDTH; + uvg_pixels_blit(&from->ref.y[offset], &to->ref.y[offset], cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH); + } + + if(tree_type != UVG_LUMA_T && from->ref.chroma_format != UVG_CSP_400) { + const int offset = chroma_loc->local_x / 2 + chroma_loc->local_y / 2 * LCU_WIDTH_C; + uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, 
LCU_WIDTH_C); + } + if((chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x) && tree_type == UVG_BOTH_T) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } + + } + + const int y_start = (cu_loc->local_y) - 4; + const int x_start = (cu_loc->local_x) - 4; + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); + } + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); + } + + for (int y = cu_loc->local_y; y < y_limit; y += SCU_WIDTH) { + for (int x = cu_loc->local_x ; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } + + if((chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x) && tree_type == UVG_BOTH_T) { + const int y_start = (chroma_loc->local_y) - 4; + const int x_start = (chroma_loc->local_x) - 4; + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x_start, y) = *LCU_GET_CU_AT_PX(from, x_start, y); + } + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + *LCU_GET_CU_AT_PX(to, x, y_start) = *LCU_GET_CU_AT_PX(from, x, y_start); + } + + for(int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + if(x >= cu_loc->local_x && y >= cu_loc->local_y) continue; + *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y); + } + } + + if (chroma_loc->local_x == 0) { + to->left_ref = from->left_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + if (chroma_loc->local_y == 0) { + to->top_ref = from->top_ref; + *LCU_GET_TOP_RIGHT_CU(to) = *LCU_GET_TOP_RIGHT_CU(from); + } + if (x_limit != LCU_WIDTH) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + } + } + } + else { + if (x_limit != LCU_WIDTH) { + for (int y = y_start; y < y_limit; y += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x_limit, y), 0, sizeof(cu_info_t)); + } + } + if (y_limit != LCU_WIDTH) { + for (int x = x_start; x < x_limit; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y_limit), 0, sizeof(cu_info_t)); + } + } + } +} + +static INLINE void copy_cu_pixels( + lcu_t *from, + lcu_t *to, + const cu_loc_t* const cu_loc, + enum uvg_tree_type + tree_type) { + const int x_local = cu_loc->local_x; + const int y_local = cu_loc->local_y; const int luma_index = x_local + y_local * LCU_WIDTH; - const int chroma_index = tree_type == UVG_CHROMA_T ?
x_local + y_local * LCU_WIDTH_C : (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; + const int chroma_index = (x_local / 2) + (y_local / 2) * LCU_WIDTH_C; if(tree_type != UVG_CHROMA_T) { uvg_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index], - width, width, LCU_WIDTH, LCU_WIDTH); + cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH); } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { uvg_pixels_blit(&from->rec.u[chroma_index], &to->rec.u[chroma_index], - width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); uvg_pixels_blit(&from->rec.v[chroma_index], &to->rec.v[chroma_index], - width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } -static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint, enum +// ISP_TODO: this needs to work with the new CU coefficient order +static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to, bool joint, enum uvg_tree_type tree_type) { if (tree_type != UVG_CHROMA_T) { - const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local); - copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width); + //const int luma_z = xy_to_zorder(LCU_WIDTH, cu_loc->x, cu_loc->y); + const int idx = (cu_loc->x % LCU_WIDTH) + ((cu_loc->y % LCU_WIDTH) * LCU_WIDTH); + copy_coeffs(&from->coeff.y[idx], &to->coeff.y[idx], cu_loc->width, cu_loc->height, LCU_WIDTH); + } if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { - const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> (tree_type != UVG_CHROMA_T), y_local >> (tree_type != UVG_CHROMA_T)); - copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1); - copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1); + //const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T)); + const int chroma_x = (cu_loc->x >> 1); + const int chroma_y = (cu_loc->y >> 1); + + const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C); + copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); + copy_coeffs(&from->coeff.v[idx], &to->coeff.v[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); if (joint) { - copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1); + copy_coeffs(&from->coeff.joint_uv[idx], &to->coeff.joint_uv[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C); } } } + +static void lcu_fill_chroma_cu_info(lcu_t* lcu, const cu_loc_t* const cu_loc); /** * Copy all non-reference CU data from next level to current level.
*/ -static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint, enum - uvg_tree_type tree_type) +static void work_tree_copy_up( + lcu_t *from, + lcu_t* to, + bool joint, + enum + uvg_tree_type tree_type, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc) { - const int width = LCU_WIDTH >> depth; - copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]); - copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type); - copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint, tree_type); + copy_cu_info (from, to, cu_loc, tree_type); + copy_cu_pixels(from, to, cu_loc, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); + copy_cu_coeffs(cu_loc, from, to, joint, cu_loc != chroma_loc && tree_type == UVG_LUMA_T ? UVG_LUMA_T : tree_type); + if (chroma_loc && tree_type != UVG_LUMA_T) { + copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); + copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); + + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += 4) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += 4) { + cu_info_t* to_cu = LCU_GET_CU_AT_PX(to, x, y); + cu_info_t* from_cu = LCU_GET_CU_AT_PX(from, x, y); + to_cu->intra.mode_chroma = from_cu->intra.mode_chroma; + to_cu->joint_cb_cr = from_cu->joint_cb_cr; + to_cu->cr_lfnst_idx = from_cu->cr_lfnst_idx; + to_cu->chroma_deblocking = from_cu->chroma_deblocking; + to_cu->log2_chroma_width = from_cu->log2_chroma_width; + to_cu->log2_chroma_height = from_cu->log2_chroma_height; + + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_U); + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_V); + } + } + } } -/** - * Copy all non-reference CU data from current level to all lower levels. - */ -static void work_tree_copy_down(int x_local, int y_local, int depth, lcu_t *work_tree, enum uvg_tree_type - tree_type) -{ - const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> 1; - for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) { - copy_cu_info (x_local, y_local, width, &work_tree[depth], &work_tree[i]); - copy_cu_pixels(x_local, y_local, LCU_WIDTH >> depth, &work_tree[depth], &work_tree[i], tree_type); - } -} - -void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type - tree_type) -{ - const int x_local = SUB_SCU(x_px); - const int y_local = SUB_SCU(y_px); - const unsigned width = (tree_type != UVG_CHROMA_T ? LCU_WIDTH : LCU_WIDTH_C) >> depth; - - for (unsigned y = 0; y < width; y += SCU_WIDTH) { - for (unsigned x = 0; x < width; x += SCU_WIDTH) { - LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth; - } - } -} - static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, int height, const cu_info_t *cu) { // Set mode in every CU covered by part_mode in this depth. 
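/* A standalone sketch (not the uvg266 API) of the SCU-grid fill pattern the
 * loop below implements: every 4x4 sub-unit covered by the CU receives a copy
 * of the CU-level parameters, so a later lookup at any covered pixel finds the
 * same info. Struct fields and names here are hypothetical. */
#define SCU 4              /* sub-unit granularity in pixels (SCU_WIDTH) */
#define GRID (64 / SCU)    /* SCUs per 64-pixel LCU row */

typedef struct { int qp; int mode; } scu_info;

static void fill_region(scu_info grid[GRID][GRID],
                        int x_px, int y_px, int w_px, int h_px, scu_info v)
{
  for (int y = y_px; y < y_px + h_px; y += SCU)
    for (int x = x_px; x < x_px + w_px; x += SCU)
      grid[y / SCU][x / SCU] = v;   /* replicate per covered SCU */
}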
@@ -155,21 +313,29 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in for (int x = x_local; x < x_local + width; x += SCU_WIDTH) { cu_info_t *to = LCU_GET_CU_AT_PX(lcu, x, y); to->type = cu->type; - to->depth = cu->depth; - to->part_size = cu->part_size; to->qp = cu->qp; + to->split_tree = cu->split_tree; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; + to->cr_lfnst_idx = cu->cr_lfnst_idx; + to->joint_cb_cr = cu->joint_cb_cr; to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos; to->violates_lfnst_constrained_luma = cu->violates_lfnst_constrained_luma; to->violates_lfnst_constrained_chroma = cu->violates_lfnst_constrained_chroma; + to->log2_height = cu->log2_height; + to->log2_width = cu->log2_width; + + to->log2_chroma_height = cu->log2_chroma_height; + to->log2_chroma_width = cu->log2_chroma_width; + if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; to->intra.mode_chroma = cu->intra.mode_chroma; to->intra.multi_ref_idx = cu->intra.multi_ref_idx; to->intra.mip_flag = cu->intra.mip_flag; to->intra.mip_is_transposed = cu->intra.mip_is_transposed; + to->intra.isp_mode = cu->intra.isp_mode; } else { to->skipped = cu->skipped; to->merged = cu->merged; @@ -180,74 +346,105 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } -static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width, uint8_t type) +static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) { - const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; - const int num_pu = uvg_part_mode_num_parts[part_mode]; + // The bottom right cu will always have the chroma info + cu_info_t *bottom_right = LCU_GET_CU_AT_PX( + lcu, + cu_loc->local_x + cu_loc->width - 1, + cu_loc->local_y + cu_loc->height - 1); + if(bottom_right->type != CU_INTRA) return; - for (int i = 0; i < num_pu; ++i) { - const int x_pu = PU_GET_X(part_mode, cu_width, x_local, i); - const int y_pu = PU_GET_Y(part_mode, cu_width, y_local, i); - const int width_pu = PU_GET_W(part_mode, cu_width, i); - const int height_pu = PU_GET_H(part_mode, cu_width, i); - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - pu->type = type; - lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu); + for(int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += 4 ) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += 4) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); + cu->intra.mode_chroma = bottom_right->intra.mode_chroma; + cu->joint_cb_cr = bottom_right->joint_cb_cr; + cu->cr_lfnst_idx = bottom_right->cr_lfnst_idx; + cu->log2_chroma_height = bottom_right->log2_chroma_height; + cu->log2_chroma_width = bottom_right->log2_chroma_width; + cu->type = bottom_right->type; + cu->tr_skip |= bottom_right->tr_skip & 6; + } } } -static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, const cu_info_t *cur_cu) -{ - const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth; - const uint32_t mask = ~((width >> tr_split)-1); +static void lcu_fill_chroma_cbfs(lcu_t *lcu, const cu_loc_t * const chroma_loc, enum uvg_tree_type tree_type) +{ + int8_t height = chroma_loc->height; + int8_t width = chroma_loc->width; + uint32_t x_local = chroma_loc->local_x; + uint32_t y_local = chroma_loc->local_y; + const int offset = ~((TR_MAX_WIDTH) - 1); // Set coeff flags in every CU covered by part_mode in this depth. 
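/* The cbf propagation below maps each SCU to the top-left SCU of its
 * transform block with a power-of-two mask, x & ~(TR_MAX_WIDTH - 1), which is
 * what the `offset` value above holds. A small self-contained demonstration
 * of that rounding, assuming TR_MAX_WIDTH == 64 as in VVC: */
#include <assert.h>

static unsigned tu_origin(unsigned coord)
{
  const unsigned tr_max_width = 64;       /* must be a power of two */
  return coord & ~(tr_max_width - 1);     /* clear low bits: round down */
}

static void tu_origin_demo(void)
{
  assert(tu_origin(0)   == 0);
  assert(tu_origin(60)  == 0);    /* still inside the first 64-wide TU */
  assert(tu_origin(64)  == 64);
  assert(tu_origin(100) == 64);   /* second TU's top-left column */
}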
- for (uint32_t y = y_local; y < y_local + width; y += SCU_WIDTH) { - for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) { + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { // Use TU top-left CU to propagate coeff flags - cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & mask, y & mask); - cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x, y); + cu_info_t* cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & offset), y_local + (y & offset)); + cu_info_t* cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); if (cu_from != cu_to) { - // Chroma and luma coeff data is needed for deblocking - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); } } } + +} + +static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu, enum + uvg_tree_type tree_type) +{ + // Set coeff flags in every CU covered by part_mode in this depth. + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { + // Use TU top-left CU to propagate coeff flags + cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & ~(TR_MAX_WIDTH - 1)), y_local + (y & ~(TR_MAX_WIDTH - 1))); + cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); + if (cu_from != cu_to) { + // Chroma and luma coeff data is needed for deblocking + if(tree_type != UVG_CHROMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); + if(tree_type != UVG_LUMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); + if (tree_type != UVG_LUMA_T)cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); + } + } + } } //Calculates cost for all zero coeffs -static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y, +static double cu_zero_coeff_cost( + const encoder_state_t *state, + lcu_t *work_tree, + const cu_loc_t* const cu_loc, const int depth) { - int x_local = SUB_SCU(x); - int y_local = SUB_SCU(y); - int cu_width = LCU_WIDTH >> depth; lcu_t *const lcu = &work_tree[depth]; + const int y_local = cu_loc->local_y; + const int x_local = cu_loc->local_x; + const int luma_index = y_local * LCU_WIDTH + x_local; const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); double ssd = 0.0; ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], - LCU_WIDTH, LCU_WIDTH, cu_width + LCU_WIDTH, LCU_WIDTH, cu_loc->width, cu_loc->height ); - if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { + if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) { ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height ); ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd( &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], - LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width, cu_loc->chroma_height ); } // Save the pixels at a lower level of the working tree. 
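/* The zero-coefficient cost computed above, sketched standalone: with every
 * residual coefficient dropped there is no rate term, only distortion, an SSD
 * per plane weighted the way UVG_LUMA_MULT / UVG_CHROMA_MULT weight luma and
 * chroma in the encoder. The weight values below are illustrative only. */
static double plane_ssd(const unsigned char *a, const unsigned char *b,
                        int stride, int w, int h)
{
  double sum = 0.0;
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x) {
      const double d = (double)a[y * stride + x] - (double)b[y * stride + x];
      sum += d * d;
    }
  return sum;
}

static double zero_coeff_cost(const unsigned char *ref[3],
                              const unsigned char *rec[3],
                              int luma_stride, int chroma_stride,
                              int w, int h)
{
  const double luma_mult = 0.8, chroma_mult = 1.5;  /* illustrative */
  return luma_mult   * plane_ssd(ref[0], rec[0], luma_stride,   w,     h)
       + chroma_mult * plane_ssd(ref[1], rec[1], chroma_stride, w / 2, h / 2)
       + chroma_mult * plane_ssd(ref[2], rec[2], chroma_stride, w / 2, h / 2);
}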
- copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1], UVG_BOTH_T); + copy_cu_pixels(lcu, &work_tree[depth + 1], cu_loc, UVG_BOTH_T); return ssd; } @@ -261,7 +458,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, const int stride = state->tile->frame->rec->stride; const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA); - for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) { + for (int y_ = 0; y_ < height && y_ * 2 + y < state->tile->frame->height; y_++) { for (int x_ = 0; x_ < width; x_++) { int s = 4; s += y_rec[2 * x_] * 2; @@ -281,7 +478,7 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, if((y + height * 2) % 64 == 0) { int line = y / 64 * stride2 / 2; y_rec -= LCU_WIDTH; - for (int i = 0; i < width; ++i) { + for (int i = 0; i < width && i + x / 2 < stride2 / 2; ++i) { int s = 2; s += y_rec[i * 2] * 2; s += y_rec[i * 2 + 1]; @@ -301,71 +498,117 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, * Takes into account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. */ -double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu) +double uvg_cu_rd_cost_luma( + const encoder_state_t *const state, + const cu_loc_t* const cu_loc, + const cu_info_t *const pred_cu, + lcu_t *const lcu, + uint8_t isp_cbf) { - const int width = LCU_WIDTH >> depth; const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - + // cur_cu is used for TU parameters. - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; double tr_tree_bits = 0; - // Check that lcu is not in - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); + // Check that lcu is not in - const uint8_t tr_depth = tr_cu->tr_depth - depth; - - if (tr_depth > 0) { - int offset = width / 2; + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; + // Recursively process sub-CUs. + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += uvg_cu_rd_cost_luma(state, &split_cu_loc[i], pred_cu, lcu, isp_cbf); + } return sum + tr_tree_bits * state->lambda; } + const bool is_not_isp = pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP; // Add transform_tree cbf_luma bit cost. 
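/* The recursion above picks an implicit transform split when the CU exceeds
 * the maximum transform size: quad-split when both dimensions are too large,
 * otherwise a binary split of only the offending dimension. As a standalone
 * sketch (local enum, not the encoder's split_type): */
enum tu_split { TU_NONE, TU_QT, TU_BT_VER, TU_BT_HOR };

static enum tu_split implicit_tu_split(int width, int height, int tr_max)
{
  if (width > tr_max && height > tr_max) return TU_QT;     /* four sub-TUs */
  if (width > tr_max)                    return TU_BT_VER; /* halve width */
  if (height > tr_max)                   return TU_BT_HOR; /* halve height */
  return TU_NONE;                                          /* fits in one TU */
}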
- const int is_tr_split = tr_cu->tr_depth - tr_cu->depth; - int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y); - if (pred_cu->type == CU_INTRA || - is_tr_split || - cbf_is_set(tr_cu->cbf, depth, COLOR_U) || - cbf_is_set(tr_cu->cbf, depth, COLOR_V)) - { - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + if (is_not_isp) { + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int is_set = cbf_is_set(pred_cu->cbf, COLOR_Y); + if (pred_cu->type == CU_INTRA || + !PU_IS_TU(pred_cu) || + cbf_is_set(tr_cu->cbf, COLOR_U) || + cbf_is_set(tr_cu->cbf, COLOR_V)) + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); - CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search"); + } + + if (is_set && state->encoder_control->cfg.trskip_enable + && cu_loc->width <= (1 << state->encoder_control->cfg.trskip_max_size) + && cu_loc->height <= (1 << state->encoder_control->cfg.trskip_max_size)) { + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + } } - - if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + else { + // TODO: 8x4 CUs + const int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, pred_cu->intra.isp_mode, true); + int luma_ctx = 2; + const int split_limit_minus_one = split_limit - 1; + for (int i = 0; i < split_limit; i++) { + if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { + const int flag = (isp_cbf >> i) & 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, tr_tree_bits, "cbf_y_search"); + luma_ctx = 2 + flag; + } + } } // SSD between reconstruction and original int ssd = 0; if (!state->encoder_control->cfg.lossless) { - int index = y_px * LCU_WIDTH + x_px; + int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x; ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width); + cu_loc->width, cu_loc->height); } if (!skip_residual_coding) { - int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); - const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + int8_t luma_scan_mode = SCAN_DIAG; + if (is_not_isp) { + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, width, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP); + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + } + else { + int split_type = pred_cu->intra.isp_mode; + int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, split_type, true); + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height, i, split_type, true); + const int part_x = split_loc.x; + const int part_y = split_loc.y; + + // TODO: maybe just pass the cu_loc_t to these functions + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + const coeff_t* coeffs = lcu->coeff.y; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, 
COEFF_ORDER_CU); + } + } } double bits = tr_tree_bits + coeff_bits; @@ -373,57 +616,58 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state, } -double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t *const pred_cu, - lcu_t *const lcu) +double uvg_cu_rd_cost_chroma( + const encoder_state_t *const state, + cu_info_t *const pred_cu, + lcu_t *const lcu, + const cu_loc_t * const cu_loc) { - const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 }; - const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; - cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + const vector2d_t lcu_px = { (cu_loc->local_x) / 2, (cu_loc->local_y) / 2 }; + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); - + double tr_tree_bits = 0; double coeff_bits = 0; + + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, COLOR_U); + int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, COLOR_V); - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); - - if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) { - // For MAX_PU_DEPTH calculate chroma for previous depth for the first - // block and return 0 cost for all others. - return 0; - } - int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U); - int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V); - - // See luma for why the second condition - if (!skip_residual_coding) { - const int tr_depth = depth - pred_cu->depth; - cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; - cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - cabac->cur_ctx = ctx; - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); - } - ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); - } - } - - - if (tr_cu->tr_depth > depth) { - int offset = LCU_WIDTH >> (depth + 1); + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; + // Recursively process sub-CUs. 
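/* The u_is_set / v_is_set derivation used above, as a plain-int sketch:
 * under joint Cb-Cr coding the two low bits of joint_cb_cr stand in for the
 * (cbf_cb, cbf_cr) pair; otherwise the ordinary coded-block flags apply. */
static void chroma_cbfs(int joint_cb_cr, int cbf_u, int cbf_v,
                        int *u_is_set, int *v_is_set)
{
  if (joint_cb_cr) {
    *u_is_set = (joint_cb_cr & 2) >> 1;  /* bit 1 carries the Cb flag */
    *v_is_set = joint_cb_cr & 1;         /* bit 0 carries the Cr flag */
  } else {
    *u_is_set = cbf_u;
    *v_is_set = cbf_v;
  }
}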
+ enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } - sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc[i]); + } return sum + tr_tree_bits * state->lambda; } + + if (!skip_residual_coding) { + cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); + cabac->cur_ctx = ctx; + CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search"); + + ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]); + CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search"); + + } + + if (state->encoder_control->cfg.jccr) { int cbf_mask = u_is_set * 2 + v_is_set - 1; @@ -441,23 +685,26 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + cu_loc->chroma_width, cu_loc->chroma_height); int ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width); + cu_loc->chroma_width, cu_loc->chroma_height); ssd = ssd_u + ssd_v; } if (!skip_residual_coding) { int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + + // We need the rounded & shifted coordinates for the chroma coeff calculation + cu_loc_t chroma_loc; + uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, cu_loc->width, cu_loc->height); if((pred_cu->joint_cb_cr & 3) == 0){ - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, width, 2, scan_order, 0); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); } else { - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0); + coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &chroma_loc, 2, scan_order, 0, COEFF_ORDER_CU); } } @@ -470,82 +717,104 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, static double cu_rd_cost_tr_split_accurate( const encoder_state_t* const state, - const int x_px, - const int y_px, - const int depth, const cu_info_t* const pred_cu, lcu_t* const lcu, - enum uvg_tree_type tree_type) { - const int width = LCU_WIDTH >> depth; - + enum uvg_tree_type tree_type, + uint8_t isp_cbf, + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + bool has_chroma) { + const int width = cu_loc->width; + const int height = cu_loc->height; // TODO: height for non-square blocks + const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0); // cur_cu is used 
for TU parameters. - cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); + cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y); double coeff_bits = 0; - double tr_tree_bits = 0; - - // Check that lcu is not in - assert(x_px >= 0 && x_px < LCU_WIDTH); - assert(y_px >= 0 && y_px < LCU_WIDTH); - - const uint8_t tr_depth = tr_cu->tr_depth - depth; - - const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U); - const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V); + double luma_bits = 0; + double chroma_bits = 0; + + const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, COLOR_U); + const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, COLOR_V); cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; { - int cbf = cbf_is_set_any(pred_cu->cbf, depth); + int cbf = cbf_is_set_any(tr_cu->cbf); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (pred_cu->type != CU_INTRA && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); + if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, luma_bits, "rqt_root_cbf"); } } - - bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8)) && tree_type != UVG_LUMA_T; - if( !skip_residual_coding && has_chroma) { - if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb"); - } - if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr"); - } - } - - if (tr_depth > 0) { - int offset = LCU_WIDTH >> (depth + 1); + + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { double sum = 0; - - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type); - sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type); - return sum + tr_tree_bits * state->lambda; + enum split_type split; + if(cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; + } else if(cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } else { + split = BT_HOR_SPLIT; + } + + cu_loc_t split_cu_loc[4]; + const int split_count= uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + cu_loc_t split_chroma_cu_loc[4]; + if (chroma_loc) { + uvg_get_split_locs(chroma_loc, split, split_chroma_cu_loc, NULL); + } + for (int i = 0; i < split_count; ++i) { + sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc[i], chroma_loc ? 
&split_chroma_cu_loc[i] : NULL, has_chroma); + } + return sum + luma_bits * state->lambda; } - const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T; + has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && has_chroma && tree_type != UVG_LUMA_T; + if (!skip_residual_coding && has_chroma) { + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, chroma_bits, "cbf_cb"); + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, chroma_bits, "cbf_cr"); + } + + const int cb_flag_y = cbf_is_set(tr_cu->cbf, COLOR_Y) && tree_type != UVG_CHROMA_T; + + const bool is_isp = !(pred_cu->type != CU_INTRA || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP); // Add transform_tree cbf_luma bit cost. - const int is_tr_split = depth - tr_cu->depth; - if ((pred_cu->type == CU_INTRA || - is_tr_split || - cb_flag_u || - cb_flag_v) + if (!is_isp) { + const int is_tr_split = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; + if ((pred_cu->type == CU_INTRA || + is_tr_split || + cb_flag_u || + cb_flag_v) && !skip_residual_coding && tree_type != UVG_CHROMA_T) - { - cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); + { + cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[0]); - CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search"); + CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, luma_bits, "cbf_y_search"); + } + } + else { + // TODO: 8x4 CUs + const int split_limit = uvg_get_isp_split_num(width, height, pred_cu->intra.isp_mode, true); + int luma_ctx = 2; + const int split_limit_minus_one = split_limit - 1; + for (int i = 0; i < split_limit; i++) { + if (i != split_limit_minus_one || isp_cbf != 1 << split_limit_minus_one) { + const int flag = (isp_cbf >> i) & 1; + CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_luma[luma_ctx]), flag, luma_bits, "cbf_y_search"); + luma_ctx = 2 + flag; + } + } } if (cb_flag_y || cb_flag_u || cb_flag_v) { // TODO qp_delta_sign_flag if ((cb_flag_u || cb_flag_v) && has_chroma && state->encoder_control->cfg.jccr) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, chroma_bits, "tu_joint_cbcr_residual_flag"); } } @@ -553,40 +822,66 @@ static double cu_rd_cost_tr_split_accurate( // SSD between reconstruction and original unsigned luma_ssd = 0; if (!state->encoder_control->cfg.lossless && tree_type != UVG_CHROMA_T) { - int index = y_px * LCU_WIDTH + x_px; + int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y; luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width); + width, height); } // Chroma transform skip enable/disable is non-normative, so we need to count the chroma // tr-skip bits even when we are never using it. 
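/* The eligibility test computed below, sketched as a predicate: transform
 * skip must be enabled and both block dimensions must fit in
 * 1 << trskip_max_size; with rectangular blocks the height check is no
 * longer implied by the width check, and ISP blocks are excluded. */
#include <stdbool.h>

static bool can_use_transform_skip(bool trskip_enable, int trskip_max_size,
                                   int width, int height, bool is_isp)
{
  const int max_size = 1 << trskip_max_size;  /* e.g. 1 << 2 == 4 */
  return trskip_enable && width <= max_size && height <= max_size && !is_isp;
}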
- const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size); + const bool can_use_tr_skip = state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && !is_isp; - if(cb_flag_y){ + if(cb_flag_y || is_isp){ if (can_use_tr_skip) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, luma_bits, "transform_skip_flag"); } - int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); - const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + int8_t luma_scan_mode = SCAN_DIAG; + if (pred_cu->type != CU_INTRA || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)]; + const coeff_t* coeffs = lcu->coeff.y; - coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1); + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + } + else { + int split_type = pred_cu->intra.isp_mode; + int split_limit = uvg_get_isp_split_num(width, height, split_type, true); + + for (int i = 0; i < split_limit; ++i) { + cu_loc_t split_loc; + uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true); + const int part_x = split_loc.x; + const int part_y = split_loc.y; + + // TODO: maybe just pass the cu_loc_t to these functions + //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, part_x, part_y)]; + const coeff_t* coeffs = lcu->coeff.y; + + coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &split_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU); + } + } } - if(depth == 4 || tree_type == UVG_LUMA_T) { - if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, COLOR_Y, lcu)) { + const bool is_local_sep_tree = (cu_loc->width != chroma_loc->width || cu_loc->height != chroma_loc->height) && state->encoder_control->chroma_format != UVG_CSP_400; + + if(is_local_sep_tree || tree_type == UVG_LUMA_T) { + + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? 
UVG_LUMA_T : tree_type, COLOR_Y, cu_loc, lcu)) { const int lfnst_idx = tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[1], lfnst_idx != 0, - tr_tree_bits, + luma_bits, "lfnst_idx"); if (lfnst_idx > 0) { CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[2], lfnst_idx == 2, - tr_tree_bits, + luma_bits, "lfnst_idx"); } } @@ -595,103 +890,106 @@ static double cu_rd_cost_tr_split_accurate( unsigned chroma_ssd = 0; if(has_chroma) { - const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3 }; - const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1)); - int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); - const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); + cu_loc_t temp_chroma_loc; + const vector2d_t lcu_px = { chroma_loc->local_x >> 1, chroma_loc->local_y >> 1}; + uvg_cu_loc_ctor(&temp_chroma_loc, lcu_px.x, lcu_px.y, chroma_loc->width, chroma_loc->height); + const int chroma_width = chroma_loc->chroma_width; + const int chroma_height = chroma_loc->chroma_height; + int8_t scan_order = SCAN_DIAG; + //const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); - const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size); + const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable + && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size) + && chroma_height <= (1 << state->encoder_control->cfg.trskip_max_size); if(pred_cu->joint_cb_cr == 0) { if (!state->encoder_control->cfg.lossless) { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[1]; unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[2]; chroma_ssd = ssd_u + ssd_v; } if(chroma_can_use_tr_skip && cb_flag_u) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, chroma_bits, "transform_skip_flag"); } if(chroma_can_use_tr_skip && cb_flag_v) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, chroma_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], tr_cu, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2); - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], tr_cu, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &temp_chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU); } else { { int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * 
state->chroma_weights[3]; int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], - LCU_WIDTH_C, LCU_WIDTH_C, - chroma_width); + LCU_WIDTH_C, LCU_WIDTH_C, chroma_width, chroma_height) * state->chroma_weights[3]; chroma_ssd = ssd_u_joint + ssd_v_joint; } if (chroma_can_use_tr_skip) { - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, chroma_bits, "transform_skip_flag"); } - coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], tr_cu, chroma_width, COLOR_U, scan_order, 0); + chroma_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &temp_chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU); } } - if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) { - const int lfnst_idx = (depth != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx; + const bool is_chroma_tree = is_local_sep_tree || tree_type == UVG_CHROMA_T; + if (uvg_is_lfnst_allowed(state, tr_cu, is_local_sep_tree ? UVG_CHROMA_T : tree_type, is_chroma_tree ? COLOR_UV : COLOR_Y, is_chroma_tree ? chroma_loc : cu_loc, lcu) && tree_type != UVG_LUMA_T) { + const int lfnst_idx = is_chroma_tree ? tr_cu->cr_lfnst_idx : tr_cu->lfnst_idx; CABAC_FBITS_UPDATE( cabac, - &cabac->ctx.lfnst_idx_model[tr_cu->depth == 4 || tree_type != UVG_BOTH_T], + &cabac->ctx.lfnst_idx_model[is_chroma_tree], lfnst_idx != 0, - tr_tree_bits, + luma_bits, "lfnst_idx"); if (lfnst_idx > 0) { CABAC_FBITS_UPDATE( cabac, &cabac->ctx.lfnst_idx_model[2], lfnst_idx == 2, - tr_tree_bits, + luma_bits, "lfnst_idx"); } } tr_cu->lfnst_last_scan_pos = false; tr_cu->violates_lfnst_constrained_luma = false; tr_cu->violates_lfnst_constrained_chroma = false; - if (uvg_is_mts_allowed(state, tr_cu) && tree_type != UVG_CHROMA_T) { + if (uvg_is_mts_allowed(state, tr_cu, cu_loc) && tree_type != UVG_CHROMA_T) { bool symbol = tr_cu->tr_idx != 0; int ctx_idx = 0; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, tr_tree_bits, "mts_idx"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, luma_bits, "mts_idx"); ctx_idx++; for (int i = 0; i < 3 && symbol; i++, ctx_idx++) { symbol = tr_cu->tr_idx > i + MTS_DST7_DST7 ? 1 : 0; - CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, tr_tree_bits, "mts_idx"); + CABAC_FBITS_UPDATE(cabac, &cabac->ctx.mts_idx_model[ctx_idx], symbol, luma_bits, "mts_idx"); } tr_cu->mts_last_scan_pos = false; tr_cu->violates_mts_coeff_constraint = false; } - double bits = tr_tree_bits + coeff_bits; - return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + bits * state->lambda; + double bits = luma_bits + coeff_bits; + return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + (bits + chroma_bits) * state->lambda; } // Return estimate of bits used to code prediction mode of cur_cu. 
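The value returned by cu_rd_cost_tr_split_accurate above is a standard Lagrangian rate-distortion cost: luma and chroma squared error are weighted separately (UVG_LUMA_MULT, UVG_CHROMA_MULT) and the total bit count is traded off through lambda. A minimal, self-contained sketch of that combination, using placeholder weights rather than the encoder's actual constants:

#include <stdio.h>

/* Sketch only: luma_mult/chroma_mult stand in for UVG_LUMA_MULT and
 * UVG_CHROMA_MULT; their values here are illustrative. */
static double rd_cost(double luma_ssd, double chroma_ssd, double bits, double lambda)
{
    const double luma_mult = 0.8;
    const double chroma_mult = 1.5;
    return luma_ssd * luma_mult + chroma_ssd * chroma_mult + bits * lambda;
}

int main(void)
{
    /* A mode with less distortion but more bits can still lose once the
     * rate term is weighted by lambda. */
    printf("%f\n", rd_cost(1000.0, 200.0, 96.0, 40.0)); /* 4940.0 */
    printf("%f\n", rd_cost(1400.0, 250.0, 40.0, 40.0)); /* 3095.0 */
    return 0;
}

With lambda = 40 the second mode wins despite its higher distortion, because it saves 56 bits.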
-static double calc_mode_bits(const encoder_state_t *state, - const lcu_t *lcu, - const cu_info_t * cur_cu, - int x, int y, int depth) +static double calc_mode_bits( + const encoder_state_t *state, + const lcu_t *lcu, + const cu_info_t * cur_cu, + const cu_loc_t* const cu_loc) { assert(cur_cu->type == CU_INTRA); - double mode_bits = uvg_luma_mode_bits(state, cur_cu, x, y, depth, lcu); + double mode_bits = uvg_luma_mode_bits(state, cur_cu, cu_loc, lcu); - if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { + if (((cu_loc->width == 4 && cu_loc->x % 8 && cu_loc->y % 8) || (cu_loc->width != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) { mode_bits += uvg_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode); } @@ -768,6 +1066,134 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) } +static void mark_deblocking(const cu_loc_t* const cu_loc, const cu_loc_t* const chroma_loc, lcu_t* lcu, enum uvg_tree_type tree_type, bool has_chroma, const bool is_separate_tree, int x_local, int y_local) +{ + if(tree_type != UVG_CHROMA_T) { + if(cu_loc->x) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += TR_MAX_WIDTH) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->luma_deblocking |= EDGE_VER; + if(!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(cu_loc->width == 64) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->luma_deblocking |= EDGE_VER; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if(cu_loc->y) { + for (int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += TR_MAX_WIDTH) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->luma_deblocking |= EDGE_HOR; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (cu_loc->height == 64) { + for (int x = cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->luma_deblocking |= EDGE_HOR; + if (!is_separate_tree && tree_type == UVG_BOTH_T) LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; + } + } + + if(is_separate_tree && has_chroma) { + if (chroma_loc->x) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += TR_MAX_WIDTH) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(cu_loc->width == 64) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if (chroma_loc->y) { + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += TR_MAX_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (cu_loc->height == 64) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + 
LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; + } + } + } + } + else { + + if (chroma_loc->x) { + for (int x = x_local; x < x_local + chroma_loc->width; x += TR_MAX_WIDTH) { + for (int y = y_local; y < y_local + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_VER; + } + } + } + else if(chroma_loc->width == 64) { + for (int y = y_local; y < y_local + chroma_loc->height; y += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, TR_MAX_WIDTH, y)->chroma_deblocking |= EDGE_VER; + } + } + + if(chroma_loc->y) { + for (int y = y_local; y < y_local + chroma_loc->height; y += TR_MAX_WIDTH) { + for (int x = x_local; x < x_local + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, y)->chroma_deblocking |= EDGE_HOR; + } + } + } + else if (chroma_loc->height == 64) { + for (int x = x_local; x < x_local + chroma_loc->width; x += SCU_WIDTH) { + LCU_GET_CU_AT_PX(lcu, x, TR_MAX_WIDTH)->chroma_deblocking |= EDGE_HOR; + } + } + } +} + +static bool check_for_early_termination(const int cu_width, const int cu_height, const cu_info_t* const cur_cu, int x_local, int y_local, const + bool* improved, + int cbf, + lcu_t* split_lcu, + int split_type, + const bool* can_split) +{ + // The best non-split mode has no residual and the same-direction BT split did not improve, so do not try TT + // 3.11 + if ( + !cbf && ((!improved[BT_VER_SPLIT] && split_type == TT_VER_SPLIT) || + (!improved[BT_HOR_SPLIT] && split_type == TT_HOR_SPLIT))) + return true; + + + // 3.8 + if (split_type == TT_HOR_SPLIT && can_split[BT_HOR_SPLIT]) { + bool can_skip = true; + for (int x_scu = x_local; x_scu < x_local + cu_width; x_scu += 4) { + can_skip &= + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_scu, y_local)->log2_height == cur_cu->log2_height - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_HOR_SPLIT - 1], x_scu, y_local + cu_height / 2)->log2_height == cur_cu->log2_height - 1; + } + if (can_skip) return true; + } + if (split_type == TT_VER_SPLIT && can_split[BT_VER_SPLIT]) { + bool can_skip = true; + for (int y_scu = y_local; y_scu < y_local + cu_height; y_scu += 4) { + can_skip &= + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local, y_scu)->log2_width == cur_cu->log2_width - 1 && + LCU_GET_CU_AT_PX(&split_lcu[BT_VER_SPLIT - 1], x_local + cu_width / 2, y_scu)->log2_width == cur_cu->log2_width - 1; + } + if (can_skip) return true; + } + return false; +} + /** * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode. * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH. @@ -780,17 +1206,24 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) */ static double search_cu( encoder_state_t* const state, - int x, - int y, - int depth, - lcu_t* work_tree, - enum uvg_tree_type - tree_type) + const cu_loc_t* const cu_loc, + const cu_loc_t* const chroma_loc, + lcu_t* lcu, + enum uvg_tree_type tree_type, + const split_tree_t split_tree, + bool has_chroma) { + const int depth = split_tree.current_depth; const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; - const int cu_width = tree_type != UVG_CHROMA_T ? 
LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int luma_width = LCU_WIDTH >> depth; + const int cu_width = cu_loc->width; + const int cu_height = cu_loc->height; + const int x = cu_loc->x; + const int y = cu_loc->y; + const int luma_width = cu_loc->width; + const int luma_height = cu_loc->height; + + const bool is_separate_tree = chroma_loc == NULL || cu_loc->height != chroma_loc->height || cu_loc->width != chroma_loc->width; assert(cu_width >= 4); double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; @@ -799,7 +1232,7 @@ static double search_cu( cabac_data_t pre_search_cabac; memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; cu_info_t hmvp_lut[MAX_NUM_HMVP_CANDS]; @@ -815,11 +1248,9 @@ static double search_cu( int32_t min; int32_t max; } pu_depth_inter, pu_depth_intra; - - lcu_t *const lcu = &work_tree[depth]; - - int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); - int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); + + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); int32_t frame_width = frame->width; int32_t frame_height = frame->height; @@ -841,55 +1272,51 @@ static double search_cu( pu_depth_intra.min = ctrl->cfg.pu_depth_intra.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.min[gop_layer] : ctrl->cfg.pu_depth_intra.min[0]; pu_depth_intra.max = ctrl->cfg.pu_depth_intra.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.max[gop_layer] : ctrl->cfg.pu_depth_intra.max[0]; } - if(tree_type == UVG_CHROMA_T) { - pu_depth_intra.max = CLIP(1, 3, pu_depth_intra.max); - pu_depth_intra.min = CLIP(1, 3, pu_depth_intra.min); - } + pu_depth_inter.min = ctrl->cfg.pu_depth_inter.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.min[gop_layer] : ctrl->cfg.pu_depth_inter.min[0]; pu_depth_inter.max = ctrl->cfg.pu_depth_inter.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.max[gop_layer] : ctrl->cfg.pu_depth_inter.max[0]; cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + memset(cur_cu, 0, sizeof(cu_info_t)); // Assign correct depth - cur_cu->depth = (depth > MAX_DEPTH) ? MAX_DEPTH : depth; - cur_cu->tr_depth = (depth > 0) ? depth : 1; cur_cu->type = CU_NOTSET; - cur_cu->part_size = SIZE_2Nx2N; cur_cu->qp = state->qp; - cur_cu->bdpcmMode = 0; - cur_cu->tr_idx = 0; - cur_cu->violates_mts_coeff_constraint = 0; - cur_cu->mts_last_scan_pos = 0; - cur_cu->violates_lfnst_constrained_luma = 0; - cur_cu->violates_lfnst_constrained_chroma = 0; - cur_cu->lfnst_last_scan_pos = 0; - cur_cu->lfnst_idx = 0; - cur_cu->joint_cb_cr = 0; + cur_cu->split_tree = split_tree.split_tree; + cur_cu->log2_width = uvg_g_convert_to_log2[cu_width]; + cur_cu->log2_height = uvg_g_convert_to_log2[cu_height]; + if(chroma_loc) { + cur_cu->log2_chroma_height = uvg_g_convert_to_log2[chroma_loc->chroma_height]; + cur_cu->log2_chroma_width = uvg_g_convert_to_log2[chroma_loc->chroma_width]; + } + + intra_search_data_t intra_search = {0}; + + const bool completely_inside = x + luma_width <= frame_width && y + luma_height <= frame_height; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. 
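Note the split_tree bookkeeping introduced above: the new split_tree field records the whole chain of split decisions in one word, three bits per depth level, which is why the recursion later in this function builds new_split.split_tree as split_tree.split_tree | split_type << (current_depth * 3). An illustrative sketch of that packing; the enumerator values are inferred from the indexing in this patch (QT_SPLIT = 1 through TT_VER_SPLIT = 5) and the helper names are not from the codebase:

#include <stdint.h>

enum split_type_sketch { NO_SPLIT = 0, QT_SPLIT = 1, BT_HOR_SPLIT = 2, BT_VER_SPLIT = 3, TT_HOR_SPLIT = 4, TT_VER_SPLIT = 5 };

/* Pack one 3-bit split decision per depth; a 32-bit word holds ten levels. */
static uint32_t set_split_at(uint32_t tree, int depth, enum split_type_sketch s)
{
    return tree | ((uint32_t)s << (depth * 3));
}

/* Mirrors what a reader such as GET_SPLITDATA(cu, depth) would extract. */
static enum split_type_sketch get_split_at(uint32_t tree, int depth)
{
    return (enum split_type_sketch)((tree >> (depth * 3)) & 7u);
}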
- if ( x + luma_width <= frame_width && y + luma_width <= frame_height) + if ( completely_inside) { int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = state->frame->slicetype != UVG_SLICE_I && - depth <= MAX_DEPTH && + split_tree.current_depth <= MAX_DEPTH && ( - WITHIN(depth, pu_depth_inter.min, pu_depth_inter.max) || + WITHIN(split_tree.current_depth, pu_depth_inter.min, pu_depth_inter.max) || // When the split was forced because the CTU is partially outside the // frame, we permit inter coding even if pu_depth_inter would // otherwise forbid it. (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame_width || (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame_height - ); + ) && cu_loc->width == cu_loc->height; // Don't allow non square inter CUs for now if (can_use_inter) { double mode_cost; double mode_bitcost; uvg_search_cu_inter(state, - x, y, - depth, - lcu, - &mode_cost, &mode_bitcost); + cu_loc, lcu, + &mode_cost, + &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; inter_bitcost = mode_bitcost; @@ -907,7 +1334,7 @@ static double search_cu( int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(split_tree.current_depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. @@ -915,15 +1342,11 @@ static double search_cu( (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame_height) && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I); - intra_search_data_t intra_search; intra_search.cost = 0; if (can_use_intra && !skip_intra) { intra_search.pred_cu = *cur_cu; if(tree_type != UVG_CHROMA_T) { - intra_search.pred_cu.joint_cb_cr = 4; - uvg_search_cu_intra(state, x, y, depth, &intra_search, - lcu, - tree_type); + uvg_search_cu_intra(state, &intra_search, lcu, is_separate_tree ? 
UVG_LUMA_T : tree_type, cu_loc); } #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting @@ -936,72 +1359,80 @@ static double search_cu( } #endif if (state->encoder_control->cfg.cclm && tree_type != UVG_CHROMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, tree_type, true, false); + if(intra_search.pred_cu.intra.isp_mode == ISP_MODE_NO_ISP) { + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + &intra_search.pred_cu, lcu, + tree_type, + true, + false); + } + else { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + 0, + &intra_search, + lcu, + NULL + ); + memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); + } downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } double intra_cost = intra_search.cost; if (intra_cost < cost && tree_type != UVG_LUMA_T) { int8_t intra_mode = intra_search.pred_cu.intra.mode; - - // TODO: This heavily relies to square CUs - if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + + if ((has_chroma || tree_type == UVG_CHROMA_T) + && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.joint_cb_cr = 0; - // There is almost no benefit to doing the chroma mode search for - // rd2. Possibly because the luma mode search already takes chroma - // into account, so there is less of a chanse of luma mode being - // really bad for chroma. - if(tree_type == UVG_CHROMA_T) { - intra_search.pred_cu.intra = uvg_get_co_located_luma_cu(x, y, luma_width, luma_width, NULL, state->tile->frame->cu_array, UVG_CHROMA_T)->intra; - intra_mode = intra_search.pred_cu.intra.mode; + if(tree_type == UVG_CHROMA_T || is_separate_tree) { + intra_mode = uvg_get_co_located_luma_mode( + chroma_loc, cu_loc, &intra_search.pred_cu, is_separate_tree ? lcu : NULL, + tree_type == UVG_CHROMA_T ? 
state->tile->frame->cu_array : NULL, + UVG_CHROMA_T); + state->collocated_luma_mode = intra_mode; intra_search.pred_cu.type = CU_INTRA; + } else if (intra_search.pred_cu.intra.mip_flag) { + intra_mode = 0; } - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode_chroma = intra_mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) { - uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search, tree_type); - - if (intra_search.pred_cu.joint_cb_cr == 0) { - intra_search.pred_cu.joint_cb_cr = 4; - } - + uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, intra_mode, tree_type, is_separate_tree); } else if (!intra_search.pred_cu.intra.mip_flag) { - intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode; + intra_search.pred_cu.intra.mode_chroma = intra_mode; } else { intra_search.pred_cu.intra.mode_chroma = 0; } - - if(tree_type != UVG_CHROMA_T && ctrl->cfg.rdo >= 2) { - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, - tree_type, false, true); - intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu); + state->quant_blocks[2].needs_init = true; + uvg_intra_recon_cu(state, + &intra_search, chroma_loc, + &intra_search.pred_cu, lcu, + is_separate_tree ? UVG_CHROMA_T : tree_type, + false, + true); + if(tree_type != UVG_CHROMA_T) { + intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, chroma_loc); } else { intra_cost = intra_search.cost; } - intra_search.pred_cu.intra.mode = intra_mode; intra_search.pred_cu.violates_lfnst_constrained_chroma = false; intra_search.pred_cu.lfnst_last_scan_pos = false; } else { intra_search.pred_cu.intra.mode_chroma = intra_mode; } - intra_search.pred_cu.intra.mode = intra_mode; - if(tree_type == UVG_CHROMA_T) { - uvg_lcu_fill_trdepth(lcu, x_local, y_local, depth, depth, tree_type); - } } if (intra_cost < cost) { cost = intra_cost; @@ -1023,8 +1454,7 @@ static double search_cu( double mode_cost; double mode_bitcost; uvg_search_cu_ibc(state, - x, y, - depth, + cu_loc, lcu, &mode_cost, &mode_bitcost); if (mode_cost < cost) { @@ -1041,30 +1471,82 @@ static double search_cu( // Reconstruct best mode because we need the reconstructed pixels for // mode search of adjacent CUs. 
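The reconstruction below also has to handle Intra Sub-Partitions, driven by uvg_get_isp_split_num() and uvg_get_isp_split_loc(): ISP divides an intra luma CU into equally sized rows or columns that are predicted and transformed one after another. A rough sketch of the geometry, assuming the VVC rule that 4x8 and 8x4 CUs yield two sub-partitions and larger CUs four (helper names here are hypothetical):

static int isp_split_num_sketch(int w, int h)
{
    return (w * h <= 32) ? 2 : 4; /* 4x8 and 8x4 have only 32 samples */
}

static void isp_split_loc_sketch(int x, int y, int w, int h, int idx, int vertical,
                                 int *sx, int *sy, int *sw, int *sh)
{
    const int n = isp_split_num_sketch(w, h);
    if (vertical) { *sw = w / n; *sh = h; *sx = x + idx * *sw; *sy = y; }
    else          { *sw = w; *sh = h / n; *sx = x; *sy = y + idx * *sh; }
}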
if (cur_cu->type == CU_INTRA) { - assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); bool recon_chroma = true; - bool recon_luma = tree_type != UVG_CHROMA_T; - if ((depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + bool recon_luma = tree_type != UVG_CHROMA_T && cur_cu->intra.isp_mode == ISP_MODE_NO_ISP; + if (is_separate_tree || !has_chroma || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T || cu_loc->chroma_height % 4 == 2) { recon_chroma = false; } - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - NULL, - lcu, tree_type,recon_luma,recon_chroma); - if(depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { - intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - NULL, - lcu, - tree_type,false,true); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + if (!state->encoder_control->cfg.cclm && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + 0, + &intra_search, + lcu, + NULL + ); } - if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + else { + uvg_intra_recon_cu(state, + &intra_search, cu_loc, + NULL, lcu, + tree_type, + recon_luma, recon_chroma); + } + + + if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) + || tree_type == UVG_CHROMA_T) { + intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; + if(tree_type != UVG_CHROMA_T) { + lcu_fill_chroma_cu_info( + lcu, + chroma_loc); + } + uvg_intra_recon_cu(state, + &intra_search, chroma_loc, + NULL, lcu, + UVG_CHROMA_T, + false, + true); + lcu_fill_chroma_cbfs( + lcu, + chroma_loc, + tree_type); + } else { + assert(cur_cu->cr_lfnst_idx == 0 && "If we don't have separate tree chroma lfnst index must be 0"); + } + + // Set isp split cbfs here + const int split_type = intra_search.pred_cu.intra.isp_mode; + const int split_num = split_type == ISP_MODE_NO_ISP || tree_type == UVG_CHROMA_T ? 
0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); + + const int cbf_cb = cbf_is_set(cur_cu->cbf, COLOR_U); + const int cbf_cr = cbf_is_set(cur_cu->cbf, COLOR_V); + const int jccr = cur_cu->joint_cb_cr; + for (int i = 0; i < split_num; ++i) { + cu_loc_t isp_loc; + uvg_get_isp_split_loc(&isp_loc, x, y, cu_width, cu_height, i, split_type, true); + // Fetching from CU array does not work for dimensions less than 4 + // Fetch proper x, y coords for isp blocks + int tmp_x = isp_loc.x; + int tmp_y = isp_loc.y; + uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y, MAX(cu_width, cu_height)); + cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); + bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; + cbf_clear(&split_cu->cbf, COLOR_Y); + cbf_clear(&split_cu->cbf, COLOR_U); + cbf_clear(&split_cu->cbf, COLOR_V); + if (cur_cbf) { + cbf_set(&split_cu->cbf, COLOR_Y); + } + if(cbf_cb) cbf_set(&split_cu->cbf, COLOR_U); + if(cbf_cr) cbf_set(&split_cu->cbf, COLOR_V); + split_cu->joint_cb_cr = jccr; + } + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); } else if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { @@ -1075,35 +1557,28 @@ static double search_cu( if (cur_cu->inter.mv_dir & 1) uvg_round_precision(INTERNAL_MV_PREC, 2, &cur_cu->inter.mv[0][0], &cur_cu->inter.mv[0][1]); if (cur_cu->inter.mv_dir & 2) uvg_round_precision(INTERNAL_MV_PREC, 2, &cur_cu->inter.mv[1][0], &cur_cu->inter.mv[1][1]); } - // Reset transform depth because intra messes with them. - // This will no longer be necessary if the transform depths are not shared. - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type); const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - uvg_inter_recon_cu(state, lcu, x, y, cu_width, true, has_chroma); + uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); - if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable && false) { //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; + // inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda; } - + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, cu_width, cu_height); uvg_quantize_lcu_residual(state, true, has_chroma && !cur_cu->joint_cb_cr, - cur_cu->joint_cb_cr, x, y, - depth, + cur_cu->joint_cb_cr, &loc, NULL, lcu, false, - tree_type); + tree_type); - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf); - if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged && !cbf) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU @@ -1113,132 +1588,268 @@ static double search_cu( inter_bitcost += cur_cu->merge_idx; } } - lcu_fill_inter(lcu, x_local, y_local, cu_width, cur_cu->type); - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu, UVG_BOTH_T); } } + + // The cabac functions assume chroma locations whereas the search uses luma locations + // for the chroma tree, therefore we need to shift the chroma coordinates here for + // passing to the bit cost calculating 
functions. + cu_loc_t separate_tree_chroma_loc = *cu_loc; + separate_tree_chroma_loc.y >>= 1; + separate_tree_chroma_loc.x >>= 1; + separate_tree_chroma_loc.width >>= 1; + separate_tree_chroma_loc.height >>= 1; if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; + + bits += uvg_mock_encode_coding_unit( + state, + cabac, + cu_loc, + is_separate_tree && !has_chroma ? NULL : chroma_loc, + lcu, + cur_cu, + tree_type, + split_tree); - if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { - bits += uvg_mock_encode_coding_unit( - state, - cabac, - x, y, depth, - lcu, - cur_cu, - tree_type); - } - else { - assert(0); - } cost = bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, intra_search.best_isp_cbfs, cu_loc, chroma_loc, has_chroma); + //fprintf(stderr, "%4d %4d %2d %2d %d %d %f\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree, cost); - if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { - cost = inter_zero_coeff_cost; + //if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { + // cost = inter_zero_coeff_cost; - // Restore saved pixels from lower level of the working tree. - copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu, tree_type); + // // Restore saved pixels from lower level of the working tree. + // copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { - cur_cu->merged = 0; - cur_cu->skipped = 1; - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); - } + // if (cur_cu->merged) { + // cur_cu->merged = 0; + // cur_cu->skipped = 1; + // lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + // } - if (cur_cu->tr_depth != depth) { - // Reset transform depth since there are no coefficients. This - // ensures that CBF is cleared for the whole area of the CU. - uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type); - } - - cur_cu->cbf = 0; - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); - } + // cur_cu->cbf = 0; + // lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + //} cabac->update = 0; + + mark_deblocking( + cu_loc, + chroma_loc, + lcu, + tree_type, + has_chroma, + is_separate_tree, + x_local, + y_local); + if (cur_cu->type == CU_INTRA && cur_cu->intra.isp_mode != ISP_MODE_NO_ISP && tree_type != UVG_CHROMA_T) { + const int split_num = uvg_get_isp_split_num( cu_width, cu_height, cur_cu->intra.isp_mode,true); + for (int i = 1; i < split_num; i++) { + cu_loc_t isp_loc; + uvg_get_isp_split_loc( + &isp_loc, + x, + y, + cu_width, + cu_height, + i, + cur_cu->intra.isp_mode, + true); + if (isp_loc.x % 4 || isp_loc.y % 4) continue; + mark_deblocking( + &isp_loc, + chroma_loc, + lcu, + UVG_LUMA_T, + false, + false, + isp_loc.local_x, + isp_loc.local_y); + } + } } bool can_split_cu = // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. 
cur_cu->type == CU_NOTSET || - (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || + (split_tree.current_depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I)) || (state->frame->slicetype != UVG_SLICE_I && - depth < pu_depth_inter.max); + split_tree.current_depth < pu_depth_inter.max); if(state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %9d %d", x, y, split_tree.split_tree, tree_type); fwrite(&state->search_cabac.ctx, 1, sizeof(state->search_cabac.ctx), state->encoder_control->cabac_debug_file); } - // Recursively split all the way to max search depth. - if (can_split_cu) { - int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); - double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + bool can_split[6]; + bool is_implicit = uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); + + const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; + const int max_btd = state->encoder_control->cfg.max_btt_depth[slice_type]; + int minimum_split_amount; + switch (slice_type) { + case 0: minimum_split_amount = pu_depth_intra.min - split_tree.current_depth; break; + case 1: minimum_split_amount = MIN(pu_depth_intra.min, pu_depth_inter.min) - split_tree.current_depth; break; + case 2: minimum_split_amount = pu_depth_intra.min - split_tree.current_depth; break; + default: + assert(0 && "Incorrect_slice_type"); + } + if(minimum_split_amount > max_btd && !is_implicit && can_split[1]) { + // If the minimum search depth cannot be reached within the allowed number of MTT splits, + // disable the MTT split types here so that the quadtree split can still reach it. + can_split[2] = can_split[3] = can_split[4] = can_split[5] = false; + } + + can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; + + bool improved[6] = {false}; + + // If skip mode was selected for the block, skip further search. + // Skip mode means there are no coefficients in the block, so splitting + // might not give any better results but takes more time to do. + // It is ok to interrupt the search as soon as it is known that + // the split costs at least as much as not splitting. + int cbf = cbf_is_set_any(cur_cu->cbf); + + // 3.13 + if ((cu_height < 32 || cu_width < 32) && cur_cu->type != CU_NOTSET && !cbf && split_tree.mtt_depth > 1 && tree_type != UVG_CHROMA_T) { + can_split_cu = false; + } + + if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF || true)) { + lcu_t * split_lcu = MALLOC(lcu_t, 5); + enum split_type best_split = 0; + double best_split_cost = MAX_DOUBLE; cabac_data_t post_seach_cabac; + cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); - memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); + // Recursively split all the way to max search depth. 
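The split loop below leans on several numbered speed-up heuristics. For instance, the "3.9" check prunes a split type as soon as the bits needed merely to signal the split, weighted by lambda, eat up the headroom left by an optimistic distortion bound (the best cost so far divided by 1.075, or 1.1 above QP 30). A numeric illustration of that test:

#include <stdbool.h>

/* Prune when the split signalling alone cannot be amortized ("3.9"). */
static bool prune_split_sketch(double split_bits, double lambda, double best_cost, int qp)
{
    const double factor = qp > 30 ? 1.1 : 1.075;
    return split_bits * lambda + best_cost / factor > best_cost;
}
/* With best_cost = 5000, lambda = 40, qp = 32 the split is rejected once its
   flags cost more than 5000 * (1 - 1/1.1) / 40, i.e. about 11.4 bits. */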
+ for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { + if (!can_split[split_type]) + continue; + split_tree_t new_split = { + split_tree.split_tree | split_type << (split_tree.current_depth * 3), + split_tree.current_depth + 1, + split_tree.mtt_depth + (split_type != QT_SPLIT), + split_tree.implicit_mtt_depth + (split_type != QT_SPLIT && is_implicit), + 0 + }; + + if (completely_inside && check_for_early_termination( + cu_width, + cu_height, + cur_cu, + x_local, + y_local, + improved, + cbf, + split_lcu, + split_type, + can_split)) { + can_split[split_type] = false; + continue; + } + + double split_cost = 0.0; + memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); - double split_bits = 0; + double split_bits = 0; - if (depth < MAX_DEPTH) { + if (cur_cu->log2_height + cur_cu->log2_width > 4) { - state->search_cabac.update = 1; - // Add cost of cu_split_flag. - const cu_info_t* left_cu = NULL, * above_cu = NULL; - if (x) { - if (x_local || tree_type != UVG_CHROMA_T) { - left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + state->search_cabac.update = 1; + // Add cost of cu_split_flag. + const cu_info_t* left_cu = NULL, * above_cu = NULL; + if (x) { + if (x_local || tree_type != UVG_CHROMA_T) { + left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); + } + else { + left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x - 1, y); + } } - else { - left_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, (x >> 1) - 1, y >> 1); + if (y) { + if (y_local || tree_type != UVG_CHROMA_T) { + above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); + } + else { + above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x, y - 1); + } + } + split_tree_t count_tree = split_tree; + count_tree.split_tree = split_tree.split_tree | split_type << (split_tree.current_depth * 3); + uvg_write_split_flag( + state, + &state->search_cabac, + left_cu, + above_cu, + cu_loc, + count_tree, + tree_type, + &is_implicit, + &split_bits + ); + } + + // 3.9 + const double factor = state->qp > 30 ? 1.1 : 1.075; + if (split_bits * state->lambda + cost / factor > cost) { + can_split[split_type] = false; + continue; + } + + + state->search_cabac.update = 0; + split_cost += split_bits * state->lambda; + + // 3.7 + bool stop_to_qt = false; + + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + separate_chroma |= !has_chroma; + initialize_partial_work_tree(state, lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? chroma_loc : cu_loc, tree_type); + for (int split = 0; split < splits; ++split) { + new_split.part_index = split; + split_cost += search_cu(state, + &new_cu_loc[split], separate_chroma ? 
chroma_loc : &new_cu_loc[split], + &split_lcu[split_type -1], + tree_type, new_split, + !separate_chroma || (split == splits - 1 && has_chroma)); + // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma + + if (split_type == QT_SPLIT && completely_inside) { + const cu_info_t * const t = LCU_GET_CU_AT_PX( + &split_lcu[0], + new_cu_loc[split].local_x, + new_cu_loc[split].local_y); + stop_to_qt |= GET_SPLITDATA(t, depth + 1) == QT_SPLIT; + } + + if (split_cost > cost || split_cost > best_split_cost) { + can_split[split_type] = false; + break; } } - if (y) { - if (y_local || tree_type != UVG_CHROMA_T) { - above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); - } - else { - above_cu = uvg_cu_array_at_const(state->tile->frame->chroma_cu_array, x >> 1, (y >> 1) - 1); - } + + improved[split_type] = cost > split_cost; + + if (split_cost < best_split_cost) { + best_split_cost = split_cost; + best_split = split_type; + memcpy(&best_split_cabac, &state->search_cabac, sizeof(cabac_data_t)); } - uvg_write_split_flag( - state, - &state->search_cabac, - left_cu, - above_cu, - 1, - depth, - cu_width, - x >> (tree_type == UVG_CHROMA_T), - y >> (tree_type == UVG_CHROMA_T), - tree_type, - &split_bits); - } - - state->search_cabac.update = 0; - split_cost += split_bits * state->lambda; - - // If skip mode was selected for the block, skip further search. - // Skip mode means there's no coefficients in the block, so splitting - // might not give any better results but takes more time to do. - // It is ok to interrupt the search as soon as it is known that - // the split costs at least as much as not splitting. - if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - if (split_cost < cost) split_cost += search_cu(state, x, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type); - } else { - split_cost = INT_MAX; + if (stop_to_qt) break; } // If no search is not performed for this depth, try just the best mode @@ -1253,59 +1864,66 @@ static double search_cu( && tree_type == UVG_BOTH_T) { - cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&split_lcu[best_split - 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. - if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { + if (cu_d1->type == CU_INTRA && (cu_d1->log2_height + 1 == cur_cu->log2_height || cu_d1->log2_width + 1 == cur_cu->log2_width)) { cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; + bool is_implicit = false; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, - 0, depth, cu_width, x, y, tree_type, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &is_implicit, &bits); cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; - cur_cu->part_size = SIZE_2Nx2N; + if (cur_cu->intra.mode_chroma > 79) { + cur_cu->intra.mode_chroma = cur_cu->intra.mode; + } // Disable MRL in this case cur_cu->intra.multi_ref_idx = 0; cur_cu->lfnst_idx = 0; cur_cu->cr_lfnst_idx = 0; - - uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth, tree_type); - lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); intra_search_data_t proxy; FILL(proxy, 0); proxy.pred_cu = *cur_cu; uvg_intra_recon_cu(state, - x, y, - depth, - &proxy, + &proxy, cu_loc, NULL, lcu, - tree_type, true, state->encoder_control->chroma_format == UVG_CSP_400); + tree_type, + true, + state->encoder_control->chroma_format != UVG_CSP_400); - double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits; + double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits; cost += mode_bits * state->lambda; - cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type); + cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc, chroma_loc, has_chroma); + + mark_deblocking(cu_loc, chroma_loc, lcu, tree_type, has_chroma, is_separate_tree, x_local, y_local); memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac)); } } - if (split_cost < cost) { + if (best_split_cost < cost) { // Copy split modes to this depth. - cost = split_cost; - work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr, tree_type); + cost = best_split_cost; + memcpy(&state->search_cabac, &best_split_cabac, sizeof(best_split_cabac)); + work_tree_copy_up(&split_lcu[best_split -1], lcu, state->encoder_control->cfg.jccr, tree_type, cu_loc, is_separate_tree && !has_chroma ? NULL : chroma_loc); + downsample_cclm_rec( + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] + ); #if UVG_DEBUG //debug_split = 1; #endif @@ -1313,9 +1931,8 @@ static double search_cu( // Copy this CU's mode all the way down for use in adjacent CUs mode // search. memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac)); - work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type); downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); if (state->frame->slicetype != UVG_SLICE_I) { @@ -1329,21 +1946,21 @@ static double search_cu( } // Add candidate when in inter slice or ibc is enabled if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { - uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); } } else { downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } - } else if (depth >= 0 && depth < MAX_PU_DEPTH) { + FREE_POINTER(split_lcu); + } else if (cur_cu->log2_height + cur_cu->log2_width > 4) { // Need to copy modes down since the lower level of the work tree is used // when searching SMP and AMP blocks. 
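The downsample_cclm_rec() calls in this region keep a half-resolution copy of the luma reconstruction up to date for CCLM chroma prediction; with the rectangular blocks introduced by this patch the call now passes cu_width / 2 and cu_height / 2 separately. The actual CCLM downsampling filters are defined by the VVC spec; a plain 2x2 rounded average conveys the idea:

#include <stdint.h>
typedef uint16_t pixel_sketch; /* stand-in for the encoder's pixel type */

static void downsample_2x2_avg(const pixel_sketch *src, int src_stride,
                               pixel_sketch *dst, int dst_stride,
                               int dst_w, int dst_h)
{
    for (int y = 0; y < dst_h; y++) {
        for (int x = 0; x < dst_w; x++) {
            const pixel_sketch *p = src + 2 * y * src_stride + 2 * x;
            /* Rounded average of the 2x2 luma neighbourhood. */
            dst[y * dst_stride + x] = (p[0] + p[1] + p[src_stride] + p[src_stride + 1] + 2) >> 2;
        }
    }
}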
- work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type); if(tree_type != UVG_CHROMA_T) { downsample_cclm_rec( - state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] + state, x, y, cu_width / 2, cu_height / 2, lcu->rec.y, lcu->left_ref.y[64] ); } @@ -1358,7 +1975,7 @@ static double search_cu( } // Add candidate when in inter slice or ibc is enabled if(state->frame->slicetype != UVG_SLICE_I || state->encoder_control->cfg.ibc) { - uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu); + uvg_hmvp_add_mv(state, x, y, cu_width, cu_height, cur_cu); } } @@ -1493,10 +2110,9 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i // Copy non-reference CUs to picture. uvg_cu_array_copy_from_lcu( tree_type != UVG_CHROMA_T ? state->tile->frame->cu_array : state->tile->frame->chroma_cu_array, - tree_type != UVG_CHROMA_T ? x_px : x_px / 2, - tree_type != UVG_CHROMA_T ? y_px : y_px / 2, - lcu, - tree_type); + x_px, + y_px, + lcu); // Copy pixels to picture. { @@ -1540,30 +2156,34 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // will use these as temporary storage for predictions before making // a decision on which to use, and they get updated during the search // process. - lcu_t work_tree[MAX_PU_DEPTH + 1]; - init_lcu_t(state, x, y, &work_tree[0], hor_buf, ver_buf); - for (int depth = 1; depth <= MAX_PU_DEPTH; ++depth) { - work_tree[depth] = work_tree[0]; - } + lcu_t work_tree; + init_lcu_t(state, x, y, &work_tree, hor_buf, ver_buf); // If the ML depth prediction is enabled, // generate the depth prediction interval // for the current lcu constraint_t* constr = state->constraint; if (constr->ml_intra_depth_ctu) { - uvg_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree[0].ref.y, state->qp); + uvg_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree.ref.y, state->qp); } int tree_type = state->frame->slicetype == UVG_SLICE_I - && state->encoder_control->cfg.dual_tree ? UVG_LUMA_T : UVG_BOTH_T; + && state->encoder_control->cfg.dual_tree + ? UVG_LUMA_T + : UVG_BOTH_T; + + cu_loc_t start; + uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0, 0, 0, 0 }; // Start search from depth 0. double cost = search_cu( - state, - x, - y, - 0, - work_tree, - tree_type); + state, + &start, + &start, + &work_tree, + tree_type, + split_tree, + tree_type == UVG_BOTH_T); // Save squared cost for rate control. if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { @@ -1572,29 +2192,28 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con // The best decisions through out the LCU got propagated back to depth 0, // so copy those back to the frame. - copy_lcu_to_cu_data(state, x, y, &work_tree[0], tree_type); + copy_lcu_to_cu_data(state, x, y, &work_tree, tree_type); // Copy coeffs to encoder state. 
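copy_coeffs() below gains explicit width, height and stride arguments, since with MTT the coefficient blocks are no longer implicitly square. A sketch of such a strided 2D copy; the parameter order is an assumption based on the call sites:

#include <string.h>
#include <stdint.h>
typedef int16_t coeff_sketch; /* stand-in for coeff_t */

static void copy_coeffs_sketch(const coeff_sketch *src, coeff_sketch *dst,
                               int width, int height, int stride)
{
    for (int j = 0; j < height; j++) {
        /* One row at a time, so rectangular sub-blocks inside an
           LCU-wide buffer are copied correctly. */
        memcpy(dst + j * stride, src + j * stride, width * sizeof(coeff_sketch));
    }
}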
- copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH); + copy_coeffs(work_tree.coeff.y, coeff->y, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH); if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( - state, - x, - y, - 0, - work_tree, - UVG_CHROMA_T); + state, &start, + &start, + &work_tree, UVG_CHROMA_T, + split_tree, + true); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; } - copy_lcu_to_cu_data(state, x, y, &work_tree[0], UVG_CHROMA_T); + copy_lcu_to_cu_data(state, x, y, &work_tree, UVG_CHROMA_T); } - copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C); - copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.u, coeff->u, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.v, coeff->v, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); if (state->encoder_control->cfg.jccr) { - copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C); + copy_coeffs(work_tree.coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C, LCU_WIDTH_C, LCU_WIDTH_C); } } diff --git a/src/search.h b/src/search.h index 7566fb96..809a4635 100644 --- a/src/search.h +++ b/src/search.h @@ -84,19 +84,17 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map); void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff); -double uvg_cu_rd_cost_luma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - const cu_info_t *const pred_cu, - lcu_t *const lcu); -double uvg_cu_rd_cost_chroma(const encoder_state_t *const state, - const int x_px, const int y_px, const int depth, - cu_info_t *const pred_cu, - lcu_t *const lcu); +double uvg_cu_rd_cost_luma( + const encoder_state_t *const state, + const cu_loc_t* const cu_loc, + const cu_info_t *const pred_cu, + lcu_t *const lcu, + uint8_t isp_cbf); +double uvg_cu_rd_cost_chroma( + const encoder_state_t *const state, + cu_info_t *const pred_cu, + lcu_t *const lcu, + const cu_loc_t * const); -void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type - tree_type); - -void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); -void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu); #endif diff --git a/src/search_ibc.c b/src/search_ibc.c index 44f9ac50..2d80ec28 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -75,7 +75,8 @@ typedef struct { * \brief Possible optimized SAD implementation for the width, leave as * NULL for arbitrary-width blocks */ - optimized_sad_func_ptr_t optimized_sad; + optimized_sad_func_ptr_t optimized_sad_y; + optimized_sad_func_ptr_t optimized_sad_uv; lcu_t *lcu; @@ -109,8 +110,10 @@ static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x, } -static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { + const uint32_t x = loc->x; + const uint32_t y = loc->y; const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -132,9 +135,11 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu 
cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; + uint32_t width = loc->width; + uint32_t height = loc->height; cost = uvg_satd_any_size(width, width, @@ -162,10 +167,15 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu } -static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_sad(ibc_search_info_t *info, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - + const uint32_t x = loc->x; + const uint32_t y = loc->y; + lcu_t *lcu = info->lcu; + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const encoder_state_t* state = info->state; + cu_info_t cu_backup = *cur_cu; uint32_t cost = MAX_INT; @@ -173,6 +183,8 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s const int y_scu = SUB_SCU(y); const uint32_t offset = x_scu + y_scu * LCU_WIDTH; const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + const uint32_t width = loc->width; + const uint32_t height = loc->height; cur_cu->type = CU_IBC; cur_cu->inter.mv_dir = 1; @@ -183,23 +195,26 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; - if (optimized_sad != NULL) { - cost = optimized_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { - cost += optimized_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); - cost += optimized_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); - } + if (info->optimized_sad_y != NULL) { + cost = info->optimized_sad_y(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width, LCU_WIDTH, state->tile->frame->source->stride); } else { cost = uvg_reg_sad(lcu->rec.y + offset, &state->tile->frame->source->y[y * state->tile->frame->source->stride + x], width,width, LCU_WIDTH, state->tile->frame->source->stride); - if(state->encoder_control->chroma_format != UVG_CSP_400) { + } + + // ToDo: Enable chroma cost calculation + /* if (state->encoder_control->chroma_format != UVG_CSP_400) { + if (info->optimized_sad_uv != NULL) { + cost += info->optimized_sad_uv(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + cost += info->optimized_sad_uv(lcu->rec.v + 
offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); + } else { cost += uvg_reg_sad(lcu->rec.u + offset_c, &state->tile->frame->source->u[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); cost += uvg_reg_sad(lcu->rec.v + offset_c, &state->tile->frame->source->v[(y / 2) * state->tile->frame->source->stride / 2 + x / 2], width / 2, width / 2, LCU_WIDTH_C, state->tile->frame->source->stride / 2); } - } + }*/ return cost; } @@ -235,8 +250,11 @@ static bool check_mv_cost(ibc_search_info_t *info, double bitcost = 0; double cost = MAX_DOUBLE; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height); - cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y); + + cost = calculate_ibc_cost_sad(info, &loc, x, y); if (cost >= *best_cost) return false; @@ -246,7 +264,7 @@ static bool check_mv_cost(ibc_search_info_t *info, info->mv_cand, NULL, 0, - NULL, + 0, &bitcost ); @@ -782,63 +800,47 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_ibc(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - unit_stats_map_t *amvp, - unit_stats_map_t *merge, - ibc_search_info_t *info) +static void search_pu_ibc( + encoder_state_t * const state, + const cu_loc_t * const cu_loc, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + ibc_search_info_t *info) { - const uvg_config *cfg = &state->encoder_control->cfg; - const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); - - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. - const bool merge_b1 = i_pu == 0 || width <= height; - + const uvg_config *cfg = &state->encoder_control->cfg; + const videoframe_t * const frame = state->tile->frame; + const int width_cu = cu_loc->width; + const int height_cu= cu_loc->height; lcu_t *lcu = info->lcu; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); - cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; - cur_pu->qp = state->qp; - cur_pu->inter.mv_dir = 1; + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_IBC; + cur_pu->qp = state->qp; + cur_pu->inter.mv_dir = 1; // Default to candidate 0 CU_SET_MV_CAND(cur_pu, 0, 0); - + FILL(*info, 0); - info->state = state; - info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; - info->mvd_cost_func = cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); - info->lcu = lcu; + info->state = state; + info->pic = frame->source; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; + info->mvd_cost_func = + cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; + info->optimized_sad_y = uvg_get_optimized_sad(width_cu); + info->optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width); + info->lcu = lcu; // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu); @@ -853,7 +855,7 @@ static void search_pu_ibc(encoder_state_t * const state, #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting // them universally degrades quality so this block is disabled by default - const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL)], 0); #else const double no_skip_flag = 0; #endif @@ -875,7 +877,7 @@ static void search_pu_ibc(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, info->lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_IBC; merge->unit[merge->size].merge_idx = merge_idx; @@ -883,11 +885,11 @@ static void search_pu_ibc(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(width_cu, height_cu, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -909,7 +911,7 @@ static void search_pu_ibc(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -919,6 +921,7 @@ static void search_pu_ibc(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. 
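Throughout the patch, cbf_is_set() and cbf_is_set_any() drop their depth argument, reflecting that coded-block flags are now tracked as a flat per-plane mask on the CU rather than per transform depth, which is exactly what the merge/skip decision below inspects. A toy model of that bookkeeping (names and bit layout are illustrative, not the codebase's):

#include <stdbool.h>
#include <stdint.h>

enum color_sketch { CBF_Y = 0, CBF_U = 1, CBF_V = 2 };

/* One bit per colour plane; "any" reduces to a non-zero test. */
static void cbf_set_sketch(uint8_t *cbf, enum color_sketch c) { *cbf |= (uint8_t)(1u << c); }
static void cbf_clear_sketch(uint8_t *cbf, enum color_sketch c) { *cbf &= (uint8_t)~(1u << c); }
static bool cbf_is_set_sketch(uint8_t cbf, enum color_sketch c) { return (cbf >> c) & 1u; }
static bool cbf_is_set_any_sketch(uint8_t cbf) { return cbf != 0; }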
@@ -927,19 +930,18 @@ static void search_pu_ibc(encoder_state_t * const state, cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { + cu_loc, cur_pu, lcu, true, UVG_BOTH_T); + if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_IBC; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; @@ -964,15 +966,12 @@ static void search_pu_ibc(encoder_state_t * const state, // Do the motion search - uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, + uvg_inter_get_mv_cand(info->state, info->mv_cand, cur_pu, lcu, - NULL); + 0, + cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1003,9 +1002,7 @@ static void search_pu_ibc(encoder_state_t * const state, best_cost = calculate_ibc_cost_satd( info->state, lcu, - info->origin.x, - info->origin.y, - info->width, + cu_loc, (best_mv.x >> INTERNAL_MV_PREC), (best_mv.y >> INTERNAL_MV_PREC)); best_cost += best_bits * info->state->lambda; @@ -1052,16 +1049,16 @@ static void search_pu_ibc(encoder_state_t * const state, }; - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (state->encoder_control->cfg.rdo >= 2) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); } if(cfg->rdo < 2) { int predmode_ctx; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -1077,33 +1074,29 @@ static void search_pu_ibc(encoder_state_t * const state, #include "threads.h" static int uvg_search_hash_cu_ibc(encoder_state_t* const state, - int x, int y, int depth, + const cu_loc_t* cu_loc, lcu_t* lcu, double* inter_cost, double* inter_bitcost) { - const int x_cu = x; - const int y_cu = y; + const int x_cu = cu_loc->x; + const int y_cu = cu_loc->y; const int part_mode = SIZE_2Nx2N; const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; 
- const int width_cu = LCU_WIDTH >> depth; - const int width = PU_GET_W(part_mode, width_cu, 0); - const int height = PU_GET_H(part_mode, width_cu, 0); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; const bool merge_a1 = true; const bool merge_b1 = true; ibc_search_info_t info; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(x_cu); + const int y_local = SUB_SCU(y_cu); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1113,24 +1106,20 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.state = state; info.pic = frame->source; - info.origin.x = x; - info.origin.y = y; - info.width = width; - info.height = height; + info.origin.x = cu_loc->x; + info.origin.y = cu_loc->y; + info.width = width_cu; + info.height = height_cu; info.mvd_cost_func = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info.optimized_sad = uvg_get_optimized_sad(width); + info.optimized_sad_y = uvg_get_optimized_sad(width_cu); + info.optimized_sad_uv = uvg_get_optimized_sad(cu_loc->chroma_width); info.lcu = lcu; // Search for merge mode candidates info.num_merge_cand = uvg_inter_get_merge_cand( state, - x, - y, - width, - height, - merge_a1, - merge_b1, + cu_loc, info.merge_cand, lcu); @@ -1145,17 +1134,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, static int evaluations = 0; static int hits = 0; - - UVG_CLOCK_T hashmap_start_temp; - UVG_CLOCK_T hashmap_end_temp; - - UVG_CLOCK_T hashmap_start_real_time; UVG_CLOCK_T hashmap_end_real_time; UVG_GET_TIME(&hashmap_start_real_time); - int xx = x; - int yy = y; + int xx = x_cu; + int yy = y_cu; int best_mv_x = INT_MAX>>2; int best_mv_y = INT_MAX>>2; @@ -1185,12 +1169,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, int pos_y = result->value & 0xffff; int mv_x = pos_x - xx; int mv_y = pos_y - yy; - if (pos_x <= xx - width && pos_y <= yy - height) { + if (pos_x <= xx - width_cu && pos_y <= yy - height_cu) { valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y); if (valid_mv) { bool full_block = true; // Is the full block covered by the IBC? 
- for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) { - for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) { + for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width_cu; offset_x+=UVG_HASHMAP_BLOCKSIZE) { + for (int offset_y = 0; offset_y < height_cu; offset_y += UVG_HASHMAP_BLOCKSIZE) { uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[ ((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE]; @@ -1211,7 +1195,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, if (full_block) { double cost = ibc_cost, bits = ibc_bitcost; vector2d_t mv = { best_mv_x, best_mv_y}; - cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, NULL, &bits); + cost = calc_ibc_mvd_cost(state, mv_x, mv_y,INTERNAL_MV_PREC,info.mv_cand, info.merge_cand, info.num_merge_cand, 0, &bits); //double cost = get_ibc_mvd_coding_cost(state, &state->cabac, mv_x,mv_y) * state->lambda_sqrt; //cost += bool better_mv = cost < ibc_cost; @@ -1220,7 +1204,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, best_mv_y = mv_y; ibc_cost = cost; ibc_bitcost = bits; - fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); + fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x_cu,y_cu, width_cu,height_cu, mv_x, mv_y); found_block = true; //break; } @@ -1238,7 +1222,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, //if (x > state->tile->frame->width-64 && y > state->tile->frame->height-64) //fprintf(stderr, "Hashmap time: %f (crc: %f, search: %f) Evaluations: %d Hits: %d, hashed in this block: %d\n", time_spent,crc_time, search_time, evaluations, hits,hashes_found); - if (!found_block) return; + if (!found_block) return 0; *inter_cost = ibc_cost; *inter_bitcost = ibc_bitcost; @@ -1267,18 +1251,16 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, cur_pu->skipped = merged; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); ibc_cost += ibc_flag * state->lambda; ibc_bitcost += ibc_flag; uvg_inter_recon_cu( state, lcu, - x, - y, - CU_WIDTH_FROM_DEPTH(depth), true, - state->encoder_control->chroma_format != UVG_CSP_400); + state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range( @@ -1286,7 +1268,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); } - + return 1; } @@ -1305,17 +1287,18 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, * \param inter_bitcost Return inter bitcost */ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) + const cu_loc_t * const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; + // Quick hashmap search /* uvg_search_hash_cu_ibc( state, - x, y, depth, + cu_loc, lcu, inter_cost, inter_bitcost); @@ -1330,8 +1313,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, info.lcu = lcu; search_pu_ibc(state, - x, y, depth, - SIZE_2Nx2N, 0, + cu_loc, amvp, &merge, &info); @@ -1374,14 +1356,14 @@ void uvg_search_cu_ibc(encoder_state_t * const state, return; } - 
const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; cur_pu->type = CU_IBC; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_ibc.h b/src/search_ibc.h index 14ce3b6f..b3c4e544 100644 --- a/src/search_ibc.h +++ b/src/search_ibc.h @@ -46,7 +46,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, + const cu_loc_t * const cu_loc, lcu_t *lcu, double *inter_cost, double* inter_bitcost); diff --git a/src/search_inter.c b/src/search_inter.c index 6508995f..76c7fc36 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1293,8 +1293,8 @@ static void apply_mv_scaling(int32_t current_poc, /** * \brief Perform inter search for a single reference frame. */ -static void search_pu_inter_ref(inter_search_info_t *info, - int depth, +static void search_pu_inter_ref( + inter_search_info_t *info, lcu_t *lcu, cu_info_t *cur_cu, unit_stats_map_t *amvp) @@ -1327,15 +1327,15 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); + uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); + info->mv_cand, + cur_cu, + lcu, + ref_list, + &cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1498,11 +1498,13 @@ static void search_pu_inter_ref(inter_search_info_t *info, /** * \brief Search bipred modes for a PU. */ -static void search_pu_inter_bipred(inter_search_info_t *info, - int depth, - lcu_t *lcu, - unit_stats_map_t *amvp_bipred) +static void search_pu_inter_bipred( + inter_search_info_t *info, + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; const videoframe_t * const frame = info->state->tile->frame; @@ -1551,7 +1553,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, &cu_loc); } // Don't try merge candidates that don't satisfy mv constraints. 
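This signature change is the pattern the whole patch follows: the loose (x, y, width, height, depth) argument packs are replaced by a single cu_loc_t built once with uvg_cu_loc_ctor(). A rough sketch of what the call sites in this diff imply about the struct, assuming 4:2:0 subsampling and 64x64 LCUs; the real definition lives in cu.h and may differ in detail:

/* Sketch of cu_loc_t; field names follow the accesses visible in this
 * diff (x, y, local_x, local_y, width, height, chroma_width,
 * chroma_height). */
typedef struct {
  int x, y;                         /* luma position in the picture */
  int local_x, local_y;             /* position within the containing LCU */
  int width, height;                /* luma block dimensions */
  int chroma_width, chroma_height;  /* half the luma size in 4:2:0 */
} cu_loc_sketch_t;

static void cu_loc_ctor_sketch(cu_loc_sketch_t *loc,
                               int x, int y, int width, int height)
{
  loc->x = x;
  loc->y = y;
  loc->local_x = x & 63;            /* SUB_SCU() equivalent for 64x64 LCUs */
  loc->local_y = y & 63;
  loc->width = width;
  loc->height = height;
  loc->chroma_width = width >> 1;
  loc->chroma_height = height >> 1;
}

Passing one pointer instead of four or five scalars is what lets the later hunks delete the depth and part_mode parameters without rewriting every call site twice.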
@@ -1564,13 +1566,11 @@ static void search_pu_inter_bipred(inter_search_info_t *info, uvg_inter_recon_bipred(info->state, ref->images[ref_LX[0][merge_cand[i].ref[0]]], ref->images[ref_LX[1][merge_cand[j].ref[1]]], - x, y, - width, - height, mv, lcu, true, - false); + false, + &cu_loc); const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride]; @@ -1666,11 +1666,9 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, +static void search_pu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, lcu_t *lcu, unit_stats_map_t *amvp, unit_stats_map_t *merge, @@ -1678,25 +1676,14 @@ static void search_pu_inter(encoder_state_t * const state, { const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. - const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_NOTSET; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1707,19 +1694,17 @@ static void search_pu_inter(encoder_state_t * const state, info->state = state; info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); + info->optimized_sad = uvg_get_optimized_sad(width_cu); // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu ); @@ -1754,7 +1739,7 @@ static void search_pu_inter(encoder_state_t * const state, // If bipred is not enabled, do not try candidates with mv_dir == 3. // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. 
if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !(cu_loc->width + cu_loc->height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); @@ -1768,7 +1753,7 @@ static void search_pu_inter(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_INTER; merge->unit[merge->size].merge_idx = merge_idx; @@ -1776,11 +1761,11 @@ static void search_pu_inter(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(cu_loc->width, cu_loc->height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -1802,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -1812,6 +1797,8 @@ static void search_pu_inter(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. 
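Note that the bipred size restriction appears in two spellings: mv_dir == 3 merge candidates are rejected above with cu_loc->width + cu_loc->height > 12, while the can_use_bipred check in the next hunk uses width + height >= 16. Because block dimensions are dyadic (4, 8, 16, ...), the possible sums are 8, 12, 16, ..., so the two tests accept exactly the same blocks and both bar 4x4, 4x8 and 8x4 from bipred, as the standard requires. A small self-checking sketch of that equivalence (helper names are illustrative):

#include <assert.h>

static int bipred_ok_gt12(int w, int h) { return w + h > 12; }  /* merge-loop form */
static int bipred_ok_ge16(int w, int h) { return w + h >= 16; } /* can_use_bipred form */

int main(void)
{
  const int sizes[] = { 4, 8, 16, 32, 64 };
  for (int i = 0; i < 5; i++) {
    for (int j = 0; j < 5; j++) {
      /* Dyadic sums land on 8, 12, 16, ... so the two forms agree. */
      assert(bipred_ok_gt12(sizes[i], sizes[j]) == bipred_ok_ge16(sizes[i], sizes[j]));
    }
  }
  assert(!bipred_ok_gt12(4, 8) && !bipred_ok_gt12(8, 4) && bipred_ok_gt12(8, 8));
  return 0;
}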
@@ -1824,22 +1811,22 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); - if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, cur_pu, lcu, true, UVG_BOTH_T); + + if (cbf_is_set(cur_pu->cbf, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, + cu_loc, cur_pu, lcu, true, - UVG_BOTH_T); - if (!cbf_is_set_any(cur_pu->cbf, depth)) { + UVG_BOTH_T); + if (!cbf_is_set_any(cur_pu->cbf)) { cur_pu->type = CU_INTER; cur_pu->merge_idx = merge_idx; cur_pu->skipped = true; @@ -1871,7 +1858,7 @@ static void search_pu_inter(encoder_state_t * const state, info->ref_idx = ref_idx; info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); + search_pu_inter_ref(info, lcu, cur_pu, amvp); } assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); @@ -1936,14 +1923,11 @@ static void search_pu_inter(encoder_state_t * const state, info->ref = ref->images[info->ref_idx]; uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - unipred_pu, - lcu, - list); + info->mv_cand, + unipred_pu, + lcu, + list, + cu_loc); double frac_cost = MAX_DOUBLE; double frac_bits = MAX_INT; @@ -1964,8 +1948,8 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); + if (state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, unipred_pu, lcu, &frac_cost, &frac_bits, cu_loc); } amvp[list].cost[key] = frac_cost; @@ -1987,15 +1971,15 @@ static void search_pu_inter(encoder_state_t * const state, amvp[list].size = n_best; } - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + if (state->encoder_control->cfg.rdo >= 2 && cfg->fme_level == 0) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); + if (amvp[1].size) uvg_cu_cost_inter_rd2(state, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]], cu_loc); } // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == UVG_SLICE_B && cfg->bipred - && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred + 
&& cu_loc->width + cu_loc->height >= 16; // 4x8 and 8x4 PBs are restricted to unipred if (can_use_bipred) { @@ -2026,25 +2010,23 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, cu_loc); } uvg_inter_recon_bipred(info->state, - ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], - ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], - x, y, - width, - height, - mv, - lcu, - true, - false); + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], + mv, lcu, + true, + false, + cu_loc + ); - const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; + const uvg_pixel *src = &lcu->ref.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; best_bipred_cost = - uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); + uvg_satd_any_size(cu_loc->width, cu_loc->height, rec, LCU_WIDTH, src, LCU_WIDTH); double bitcost[2] = { 0, 0 }; @@ -2091,17 +2073,17 @@ static void search_pu_inter(encoder_state_t * const state, } // TODO: this probably should have a separate command line option - if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + if (cfg->rdo >= 3) search_pu_inter_bipred(info, lcu, &amvp[2]); assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); uvg_sort_keys_by_cost(&amvp[2]); - if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]], cu_loc); } } if(cfg->rdo < 2) { int predmode_ctx; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -2135,22 +2117,19 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - cu_info_t* cur_cu, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost){ +void uvg_cu_cost_inter_rd2( + encoder_state_t * const state, + cu_info_t* cur_cu, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost, + const cu_loc_t* const cu_loc){ - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, UVG_BOTH_T); + const int x_px = SUB_SCU(cu_loc->x); + const int y_px = SUB_SCU(cu_loc->y); + const int width = cu_loc->width; + const int height = cu_loc->height; - const int x_px = SUB_SCU(x); - const int y_px = SUB_SCU(y); - const int width = LCU_WIDTH >> depth; cabac_data_t 
cabac_copy; memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); cabac_data_t* cabac = &state->search_cabac; @@ -2160,31 +2139,43 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, *cur_pu = *cur_cu; const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); + uvg_inter_recon_cu(state, lcu, true, reconstruct_chroma, cu_loc); int index = y_px * LCU_WIDTH + x_px; double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], LCU_WIDTH, LCU_WIDTH, - width) * UVG_LUMA_MULT; + width, height) * UVG_LUMA_MULT; if (reconstruct_chroma) { int index = y_px / 2 * LCU_WIDTH_C + x_px / 2; double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], LCU_WIDTH_C, LCU_WIDTH_C, - width / 2); + cu_loc->chroma_width, cu_loc->chroma_height); double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], LCU_WIDTH_C, LCU_WIDTH_C, - width / 2); + cu_loc->chroma_width, cu_loc->chroma_height); ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT; } double no_cbf_bits; double bits = 0; - const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL); + + int8_t depth = 0; + int8_t mtt_depth = 0; + uint32_t splits = cur_cu->split_tree; + while (splits & 7) { + if ((splits & 7) != QT_SPLIT) { + mtt_depth++; + } + depth++; + splits >>= 3; + } + const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth, 0}; + if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; - bits += uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); } else { - no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree); bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1); } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; @@ -2194,20 +2185,20 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, state->encoder_control->cfg.chroma_trskip_enable; double chroma_cost = 0; - if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && cur_cu->depth == cur_cu->tr_depth && reconstruct_chroma) { + if((state->encoder_control->cfg.jccr || can_use_chroma_tr_skip) && PU_IS_TU(cur_cu) && reconstruct_chroma) { uvg_quantize_lcu_residual(state, true, false, - false, x, y, - depth, + false, + cu_loc, cur_cu, lcu, - false, - UVG_BOTH_T); + false, + UVG_BOTH_T); ALIGNED(64) uvg_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) uvg_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C]; - uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, width, LCU_WIDTH_C, width); - uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, width, LCU_WIDTH_C, width); + uvg_pixels_blit(&lcu->ref.u[index], u_pred, width, height, LCU_WIDTH_C, width); + uvg_pixels_blit(&lcu->ref.v[index], v_pred, width, height, LCU_WIDTH_C, width); ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; @@ -2216,6 +2207,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, u_pred, 
u_resi, width, + height, LCU_WIDTH_C, width); uvg_generate_residual( @@ -2223,19 +2215,17 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, v_pred, v_resi, width, + height, LCU_WIDTH_C, width); uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, - depth, lcu, &cabac_copy, - width, - width, + cu_loc, index, - 0, cur_cu, u_pred, v_pred, @@ -2243,41 +2233,41 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, v_resi, &chorma_ts_out, UVG_BOTH_T); - cbf_clear(&cur_cu->cbf, depth, COLOR_U); - cbf_clear(&cur_cu->cbf, depth, COLOR_V); + cbf_clear(&cur_cu->cbf, COLOR_U); + cbf_clear(&cur_cu->cbf, COLOR_V); if (chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) { cur_cu->joint_cb_cr = 0; cur_cu->tr_skip |= (chorma_ts_out.best_u_index == CHROMA_TS) << COLOR_U; cur_cu->tr_skip |= (chorma_ts_out.best_v_index == CHROMA_TS) << COLOR_V; - if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_U); - if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, depth, COLOR_V); + if(chorma_ts_out.best_u_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_U); + if(chorma_ts_out.best_v_index != NO_RESIDUAL) cbf_set(&cur_cu->cbf, COLOR_V); chroma_cost += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost; } else { cur_cu->joint_cb_cr = chorma_ts_out.best_combined_index; - if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, depth, COLOR_U); - if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, depth, COLOR_V); + if (chorma_ts_out.best_combined_index & 2) cbf_set(&cur_cu->cbf, COLOR_U); + if (chorma_ts_out.best_combined_index & 1) cbf_set(&cur_cu->cbf, COLOR_V); chroma_cost += chorma_ts_out.best_combined_cost; } } else { uvg_quantize_lcu_residual(state, true, reconstruct_chroma, - reconstruct_chroma && state->encoder_control->cfg.jccr, x, y, - depth, + reconstruct_chroma && state->encoder_control->cfg.jccr, + cu_loc, cur_cu, lcu, - false, - UVG_BOTH_T); + false, + UVG_BOTH_T); } - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf); if(cbf) { - *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu); + *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0); if (reconstruct_chroma) { - if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) { - *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu); + if (!PU_IS_TU(cur_cu) || !state->encoder_control->cfg.jccr) { + *inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc); } else { *inter_cost += chroma_cost; @@ -2297,7 +2287,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged) { cur_cu->skipped = 1; } *inter_cost = no_cbf_cost; @@ -2321,11 +2311,12 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2338,12 +2329,8 @@ void uvg_search_cu_inter(encoder_state_t * const state, inter_search_info_t info; search_pu_inter(state, - x, y, depth, - 
SIZE_2Nx2N, 0, - lcu, - amvp, - &merge, - &info); + cu_loc, lcu, amvp, + &merge, &info); // Early Skip CU decision if (merge.size == 1 && merge.unit[0].skipped) { @@ -2385,13 +2372,14 @@ void uvg_search_cu_inter(encoder_state_t * const state, return; } - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_inter.h b/src/search_inter.h index d76dd927..cdabd15a 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -73,11 +73,12 @@ typedef double uvg_mvd_cost_func(const encoder_state_t *state, int32_t ref_idx, double *bitcost); -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost); +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost); @@ -85,12 +86,13 @@ unsigned uvg_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); -void uvg_cu_cost_inter_rd2(encoder_state_t* const state, - int x, int y, int depth, +void uvg_cu_cost_inter_rd2( + encoder_state_t* const state, cu_info_t* cur_cu, lcu_t* lcu, double* inter_cost, - double* inter_bitcost); + double* inter_bitcost, + const cu_loc_t* const cu_loc); int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx); diff --git a/src/search_intra.c b/src/search_intra.c index 226c40c3..a644ed9c 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -49,6 +49,7 @@ #include "strategies/strategies-picture.h" #include "videoframe.h" #include "strategies/strategies-quant.h" +#include "uvg_math.h" // Normalize SAD for comparison against SATD to estimate transform skip @@ -129,17 +130,31 @@ static INLINE uint8_t select_best_mode_index(const int8_t *modes, const double * * * \return */ -static void get_cost_dual(encoder_state_t * const state, - const pred_buffer preds, const uvg_pixel *orig_block, - cost_pixel_nxn_multi_func *satd_twin_func, - cost_pixel_nxn_multi_func *sad_twin_func, - int width, double *costs_out) +static void get_cost_dual( + encoder_state_t * const state, + const pred_buffer preds, + const uvg_pixel *orig_block, + cost_pixel_nxn_multi_func *satd_twin_func, + cost_pixel_nxn_multi_func *sad_twin_func, + int width, + int height, + double *costs_out) { #define PARALLEL_BLKS 2 unsigned satd_costs[PARALLEL_BLKS] = { 0 }; - satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); + if (satd_twin_func != NULL) { + satd_twin_func(preds, orig_block, PARALLEL_BLKS, satd_costs); + } else { + satd_costs[0] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[0], width); + satd_costs[1] = uvg_satd_any_size_vtm(width, height, orig_block, width, preds[1], width); + } unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 }; - sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); + if (sad_twin_func != NULL) { + sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs); + } else { + 
unsigned_sad_costs[0] = uvg_reg_sad(preds[0], orig_block, width, height, width, width); + unsigned_sad_costs[1] = uvg_reg_sad(preds[1], orig_block, width, height, width, width); + } costs_out[0] = (double)MIN(satd_costs[0], unsigned_sad_costs[0] * 2); costs_out[1] = (double)MIN(satd_costs[1], unsigned_sad_costs[1] * 2); @@ -189,27 +204,29 @@ static void get_cost_dual(encoder_state_t * const state, * \param lcu_px Position of the top left pixel of current CU within current LCU. */ static void derive_mts_constraints(cu_info_t *const pred_cu, - lcu_t *const lcu, const int depth, + lcu_t *const lcu, const int width, const int height, const vector2d_t lcu_px) { - const int width = LCU_WIDTH >> depth; - int8_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + int8_t scan_idx = SCAN_DIAG; int32_t i; // ToDo: large block support in VVC? uint32_t sig_coeffgroup_flag[32 * 32] = { 0 }; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] - + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t *scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_idx]; - const coeff_t* coeff = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, lcu_px.x, lcu_px.y)]; + const uint32_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_block_height = uvg_g_convert_to_log2[height]; + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_block_width, log2_block_height); + const uint32_t * const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_idx, log2_block_width, log2_block_height); + + coeff_t coeff_y[TR_MAX_WIDTH * TR_MAX_WIDTH]; + uvg_get_sub_coeff(coeff_y, lcu->coeff.y, lcu_px.x, lcu_px.y, width, height, LCU_WIDTH); signed scan_cg_last = -1; signed scan_pos_last = -1; - for (int i = 0; i < width * width; i++) { - if (coeff[scan[i]]) { + for (int i = 0; i < width * height; i++) { + if (coeff_y[scan[i]]) { scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } @@ -247,6 +264,7 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, } + /** * \brief Perform search for best intra transform split configuration. * @@ -262,51 +280,42 @@ static void derive_mts_constraints(cu_info_t *const pred_cu, */ static double search_intra_trdepth( encoder_state_t * const state, - int x_px, - int y_px, - int depth, - int max_depth, + const cu_loc_t* const cu_loc, double cost_treshold, intra_search_data_t *const search_data, lcu_t *const lcu, enum uvg_tree_type tree_type) { - assert(depth >= 0 && depth <= MAX_PU_DEPTH); - - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - const int width_c = width > TR_MIN_WIDTH ? 
width / 2 : width; - - const int offset = width / 2; - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; + const uint8_t width = cu_loc->width; + const uint8_t height = cu_loc->height; // TODO: height for non-square blocks + const uint8_t width_c = cu_loc->chroma_width; + const uint8_t height_c = cu_loc->chroma_height; + + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; const bool reconstruct_chroma = false;// (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != UVG_CSP_400; cu_info_t* pred_cu = &search_data->pred_cu; - cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); - - struct { - uvg_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; - uvg_pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH]; - uvg_pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH]; - } nosplit_pixels; - uint16_t nosplit_cbf = 0; double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; - if (depth > 0) { - tr_cu->tr_depth = depth; - pred_cu->tr_depth = depth; + cabac_data_t cabac_data; + memcpy(&cabac_data, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + + if (width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH) { const bool mts_enabled = (state->encoder_control->cfg.mts == UVG_MTS_INTRA || state->encoder_control->cfg.mts == UVG_MTS_BOTH) - && tr_cu->depth == tr_cu->tr_depth; + && PU_IS_TU(pred_cu); nosplit_cost = 0.0; + const bool has_been_split = 1 << pred_cu->log2_width != cu_loc->width || + 1 << pred_cu->log2_height != cu_loc->height; - cbf_clear(&pred_cu->cbf, depth, COLOR_Y); + cbf_clear(&pred_cu->cbf, COLOR_Y); if (reconstruct_chroma) { - cbf_clear(&pred_cu->cbf, depth, COLOR_U); - cbf_clear(&pred_cu->cbf, depth, COLOR_V); + cbf_clear(&pred_cu->cbf, COLOR_U); + cbf_clear(&pred_cu->cbf, COLOR_V); } const int8_t chroma_mode = reconstruct_chroma ? (!pred_cu->intra.mip_flag ? pred_cu->intra.mode : 0) : -1; @@ -325,51 +334,53 @@ static double search_intra_trdepth( { trafo = 0; num_transforms = (mts_enabled ? MTS_TR_NUM : 1); + // Do not do MTS search if ISP mode is used + num_transforms = pred_cu->intra.isp_mode == ISP_MODE_NO_ISP ? num_transforms : 1; } const int mts_start = trafo; - //TODO: height - if (state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) { + if (state->encoder_control->cfg.trskip_enable + && width <= (1 << state->encoder_control->cfg.trskip_max_size) + && height <= (1 << state->encoder_control->cfg.trskip_max_size) + && PU_IS_TU(pred_cu) + && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; - pred_cu->joint_cb_cr = 4; const int max_tb_size = TR_MAX_WIDTH; // LFNST search params - int max_lfnst_idx = width > max_tb_size || height > max_tb_size ? - 0 : - 2; + int max_lfnst_idx = width > max_tb_size || height > max_tb_size ? 0 : 2; if(pred_cu->intra.mip_flag && (width < 16 || height < 16)) { max_lfnst_idx = 0; } - + int start_idx = 0; - int end_idx = state->encoder_control->cfg.lfnst && depth == pred_cu-> - tr_depth ? - max_lfnst_idx : - 0; - for (int i = start_idx; i < end_idx + 1; ++i) { + int end_lfnst_idx = state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && + uvg_can_use_isp_with_lfnst(width, height, pred_cu->intra.isp_mode, tree_type) ? 
max_lfnst_idx : 0; + for (int i = start_idx; i < end_lfnst_idx + 1; ++i) { search_data->lfnst_costs[i] = MAX_DOUBLE; } - - for (int lfnst_idx = start_idx; lfnst_idx <= end_idx; lfnst_idx++) { - // Initialize lfnst variables - pred_cu->lfnst_idx = lfnst_idx; - pred_cu->violates_lfnst_constrained_luma = false; - pred_cu->violates_lfnst_constrained_chroma = false; - pred_cu->lfnst_last_scan_pos = false; - - for (trafo = mts_start; trafo < num_transforms; trafo++) { + for (trafo = mts_start; trafo < num_transforms; trafo++) { + for (int lfnst_idx = start_idx; lfnst_idx <= end_lfnst_idx; lfnst_idx++) { + // Initialize lfnst variables + search_data->best_isp_cbfs = 0; pred_cu->tr_idx = trafo; pred_cu->tr_skip = trafo == MTS_SKIP; - bool constraints[2] = { false, false}; + pred_cu->lfnst_idx = lfnst_idx; + pred_cu->violates_lfnst_constrained_luma = false; + pred_cu->violates_lfnst_constrained_chroma = false; + pred_cu->lfnst_last_scan_pos = false; + + bool constraints[2] = {false, false}; if (mts_enabled) { pred_cu->mts_last_scan_pos = 0; pred_cu->violates_mts_coeff_constraint = 0; - if ((trafo == MTS_SKIP && width > (1 << state->encoder_control->cfg.trskip_max_size)) - || !state->encoder_control->cfg.trskip_enable) { + if (trafo == MTS_SKIP && ((width > (1 << state->encoder_control->cfg.trskip_max_size) + || (height > (1 << state->encoder_control->cfg.trskip_max_size))) + || !PU_IS_TU(pred_cu) + || !state->encoder_control->cfg.trskip_enable)) { continue; } } @@ -377,65 +388,80 @@ static double search_intra_trdepth( if (pred_cu->lfnst_idx > 0 && pred_cu->tr_idx > 0) { continue; } - - uvg_intra_recon_cu( - state, - x_px, - y_px, - depth, - search_data, - pred_cu, - lcu, - UVG_LUMA_T, - true, - false); - if (trafo != 0 && !cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) continue; + if (!has_been_split && (lfnst_idx != 0 || trafo != 0)) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); + state->search_cabac.update = 1; + } + double rd_cost; + if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP) { + rd_cost = uvg_recon_and_estimate_cost_isp( + state, + cu_loc, + cost_treshold, + search_data, + lcu, + &constraints[0] + ); + constraints[1] = search_data->best_isp_cbfs != 0; + } + else { + uvg_intra_recon_cu( + state, + search_data, + cu_loc, + pred_cu, + lcu, + UVG_LUMA_T, + true, + false + ); + } + if (pred_cu->intra.isp_mode != ISP_MODE_NO_ISP && search_data->best_isp_cbfs == 0) continue; + + if ((trafo != 0 || lfnst_idx != 0) && !cbf_is_set(pred_cu->cbf, COLOR_Y)) continue; - derive_mts_constraints(pred_cu, lcu, depth, lcu_px); + derive_mts_constraints(pred_cu, lcu, width, height, lcu_px); if (pred_cu->tr_idx > 1) { if (pred_cu->violates_mts_coeff_constraint || !pred_cu-> mts_last_scan_pos) { continue; } } - - const unsigned scan_offset = xy_to_zorder( - LCU_WIDTH, - lcu_px.x, - lcu_px.y); - - if (trafo != MTS_SKIP && end_idx != 0) { + + if (trafo != MTS_SKIP && end_lfnst_idx != 0 && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, - &lcu->coeff.y[scan_offset], + lcu->coeff.y, width, - height - ); + height, + &lcu_px, + COLOR_Y); } - if (!constraints[1] && cbf_is_set(pred_cu->cbf, depth, COLOR_Y)) { + if (!constraints[1] && cbf_is_set(pred_cu->cbf, COLOR_Y)) { //end_idx = 0; if (pred_cu->lfnst_idx > 0) { continue; } } - double rd_cost = uvg_cu_rd_cost_luma( - state, - lcu_px.x, - lcu_px.y, - depth, - pred_cu, - lcu); + + + if (pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) { + rd_cost = uvg_cu_rd_cost_luma( + state, + cu_loc, + pred_cu, + 
lcu, + search_data->best_isp_cbfs); + } double transform_bits = 0; - if (state->encoder_control->cfg.lfnst && depth == pred_cu->tr_depth && - trafo != MTS_SKIP) { - if (!constraints[0] && constraints[1]) { + if (state->encoder_control->cfg.lfnst && PU_IS_TU(pred_cu) && + trafo != MTS_SKIP && end_lfnst_idx != 0 && (cbf_is_set(pred_cu->cbf, COLOR_Y) || search_data->best_isp_cbfs != 0)) { + if ((!constraints[0] && (constraints[1] || pred_cu->intra.isp_mode != ISP_MODE_NO_ISP))) { transform_bits += CTX_ENTROPY_FBITS( - &state->search_cabac.ctx.lfnst_idx_model[tr_cu->depth == 4 || - tree_type == UVG_LUMA_T], + &state->search_cabac.ctx.lfnst_idx_model[tree_type == UVG_LUMA_T], lfnst_idx != 0); if (lfnst_idx > 0) { transform_bits += CTX_ENTROPY_FBITS( @@ -444,10 +470,14 @@ static double search_intra_trdepth( } } } - if (num_transforms > 2 && trafo != MTS_SKIP && width <= 32 - /*&& height <= 32*/ + if (num_transforms > 2 && trafo != MTS_SKIP + && (cbf_is_set(pred_cu->cbf, COLOR_Y) || search_data->best_isp_cbfs != 0) + && pred_cu->intra.isp_mode == ISP_MODE_NO_ISP + && lfnst_idx == 0 + && width <= 32 + && height <= 32 && !pred_cu->violates_mts_coeff_constraint && pred_cu-> - mts_last_scan_pos && lfnst_idx == 0) { + mts_last_scan_pos) { bool symbol = trafo != 0; int ctx_idx = 0; @@ -464,7 +494,7 @@ static double search_intra_trdepth( } } - rd_cost += transform_bits * state->frame->lambda; + rd_cost += transform_bits * state->lambda; search_data->lfnst_costs[lfnst_idx] = MIN( search_data->lfnst_costs[lfnst_idx], @@ -480,30 +510,22 @@ static double search_intra_trdepth( if (reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently - const unsigned scan_offset = xy_to_zorder( - LCU_WIDTH_C, - lcu_px.x, - lcu_px.y); uvg_intra_recon_cu( state, - x_px, - y_px, - depth, search_data, + cu_loc, pred_cu, lcu, UVG_BOTH_T, false, - true); + true + ); best_rd_cost += uvg_cu_rd_cost_chroma( state, - lcu_px.x, - lcu_px.y, - depth, pred_cu, - lcu); + lcu, + cu_loc); pred_cu->intra.mode = luma_mode; // Check lfnst constraints for chroma @@ -513,24 +535,24 @@ static double search_intra_trdepth( pred_cu->lfnst_last_scan_pos}; uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, - &lcu->coeff.u[scan_offset], + lcu->coeff.u, width_c, - width_c - ); + height_c, + &lcu_px, + COLOR_U); if (constraints[0] || !constraints[1]) { best_lfnst_idx = 0; continue; } uvg_derive_lfnst_constraints( pred_cu, - depth, constraints, - &lcu->coeff.u[scan_offset], + lcu->coeff.u, width_c, - width_c - ); + height_c, + &lcu_px, + COLOR_U); if (constraints[0] || !constraints[1]) { best_lfnst_idx = 0; continue; @@ -542,14 +564,13 @@ static double search_intra_trdepth( if(reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, - x_px, y_px, - depth, search_data, - pred_cu, - lcu, - UVG_BOTH_T,false,true); - best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + search_data, cu_loc, + pred_cu, lcu, + UVG_BOTH_T, + false, + true); + best_rd_cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc); pred_cu->intra.mode = luma_mode; } pred_cu->tr_skip = best_tr_idx == MTS_SKIP; @@ -562,35 +583,10 @@ static double 
search_intra_trdepth( // Early stop condition for the recursive search. // If the cost of any 1/4th of the transform is already larger than the // whole transform, assume that splitting further is a bad idea. - if (nosplit_cost >= cost_treshold) { + if (nosplit_cost <= cost_treshold) { + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); return nosplit_cost; } - - nosplit_cbf = pred_cu->cbf; - - uvg_pixels_blit( - lcu->rec.y, - nosplit_pixels.y, - width, - width, - LCU_WIDTH, - width); - if (reconstruct_chroma) { - uvg_pixels_blit( - lcu->rec.u, - nosplit_pixels.u, - width_c, - width_c, - LCU_WIDTH_C, - width_c); - uvg_pixels_blit( - lcu->rec.v, - nosplit_pixels.v, - width_c, - width_c, - LCU_WIDTH_C, - width_c); - } } @@ -599,63 +595,32 @@ static double search_intra_trdepth( // - Maximum transform hierarchy depth is constrained by clipping // max_depth. // - Min transform size hasn't been reached (MAX_PU_DEPTH). - if (depth < max_depth && depth < MAX_PU_DEPTH) { + else { split_cost = 0; - split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); - if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } - if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; } - if (split_cost < nosplit_cost) { - split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu, tree_type); + else { + split = BT_HOR_SPLIT; } - double cbf_bits = 0.0; - - // Add cost of cbf chroma bits on transform tree. - // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true - // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0 - // if this and any previous transform block has no chroma coefficients. - // When searching the first block we don't actually know the real values, - // so this will code cbf as 0 and not code the cbf at all for descendants. 
- if (state->encoder_control->chroma_format != UVG_CSP_400) { - const uint8_t tr_depth = depth - pred_cu->depth; - cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; - - cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { - CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb"); - } - ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { - CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr"); - } + cu_loc_t split_cu_loc[4]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); + for (int i = 0; i < split_count; ++i) { + split_cost += search_intra_trdepth(state, &split_cu_loc[i], nosplit_cost, search_data, lcu, tree_type); } - - double bits = cbf_bits; - split_cost += bits * state->lambda; - } else { - assert(width <= TR_MAX_WIDTH); } + memcpy(&state->search_cabac, &cabac_data, sizeof(cabac_data)); - if (depth == 0 || split_cost < nosplit_cost) { + if (!PU_IS_TU(pred_cu) || split_cost < nosplit_cost) { return split_cost; } else { - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth, tree_type); - - pred_cu->cbf = nosplit_cbf; - - // We only restore the pixel data and not coefficients or cbf data. - // The only thing we really need are the border pixels.uvg_intra_get_dir_luma_predictor - uvg_pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); - if (reconstruct_chroma) { - uvg_pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C); - uvg_pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C); - } - return nosplit_cost; } } @@ -679,25 +644,31 @@ static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length) static int search_intra_chroma_rough( encoder_state_t * const state, - int x_px, - int y_px, - int depth, - const uvg_pixel *orig_u, - const uvg_pixel *orig_v, - int16_t origstride, - uvg_intra_references *refs_u, - uvg_intra_references *refs_v, intra_search_data_t* chroma_data, lcu_t* lcu, int8_t luma_mode, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - assert(depth != 4 || (x_px & 4 && y_px & 4)); + const int_fast8_t log2_width_c = uvg_g_convert_to_log2[cu_loc->chroma_width]; + const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y}; + const int width = 1 << log2_width_c; + const int height = width; // TODO: height for non-square blocks - const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); + const cu_loc_t loc = { luma_px.x, luma_px.y, width, height, width, height }; + + uvg_intra_references refs_u; + uvg_intra_build_reference(state, &loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0, 0); + + uvg_intra_references refs_v; + uvg_intra_build_reference(state, &loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0); + + vector2d_t lcu_cpx = { (cu_loc->local_x) / 2, (cu_loc->local_y) / 2 }; + uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; //cost_pixel_nxn_func *const sad_func = uvg_pixels_get_sad_func(width); - cu_loc_t loc = { x_px & ~7, y_px & ~7, width, width, width, width }; uvg_pixel _pred[32 * 
32 + SIMD_ALIGNMENT]; uvg_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); @@ -705,12 +676,12 @@ static int search_intra_chroma_rough( uvg_pixel _orig_block[32 * 32 + SIMD_ALIGNMENT]; uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig_u, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig_u, orig_block, width, height, LCU_WIDTH_C, width); int modes_count = (state->encoder_control->cfg.cclm ? 8 : 5); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_u, cu_loc, &loc, COLOR_U, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -725,11 +696,11 @@ static int search_intra_chroma_rough( } } - uvg_pixels_blit(orig_v, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig_v, orig_block, width, height, LCU_WIDTH_C, width); for (int i = 0; i < modes_count; ++i) { const int8_t mode_chroma = chroma_data[i].pred_cu.intra.mode_chroma; if (mode_chroma == luma_mode || mode_chroma == 0 || mode_chroma >= 81) continue; - uvg_intra_predict(state, refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu, tree_type); + uvg_intra_predict(state, &refs_v, cu_loc, &loc, COLOR_V, pred, &chroma_data[i], lcu); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); switch (width) { case 4: chroma_data[i].cost += uvg_satd_4x4(pred, orig_block); @@ -818,7 +789,7 @@ static int16_t search_intra_rough( uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); // Store original block for SAD computation - uvg_pixels_blit(orig, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig, orig_block, width, height, origstride, width); int8_t modes_selected = 0; // Note: get_cost and get_cost_dual may return negative costs. @@ -837,7 +808,7 @@ static int16_t search_intra_rough( // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. - cu_loc_t loc = { 0, 0, width, width, width, width }; + cu_loc_t loc = { 0, 0, width, height, width, height }; intra_search_data_t search_proxy; FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; @@ -984,8 +955,9 @@ static INLINE double count_bits( const double not_mpm_mode_bit, const double planar_mode_flag, const double not_planar_mode_flag, + const double not_isp_flag, int8_t mode - ) +) { int i = 0; int smaller_than_pred = 0; @@ -1007,7 +979,7 @@ static INLINE double count_bits( else { bits = not_mpm_mode_bit + 5 + (mode - smaller_than_pred > 2); } - bits += not_mrl + not_mip; + bits += not_mrl + not_mip + not_isp_flag; return bits; } @@ -1017,19 +989,19 @@ static uint8_t search_intra_rough( uvg_pixel *orig, int32_t origstride, uvg_intra_references *refs, - int log2_width, + int width, + int height, int8_t *intra_preds, intra_search_data_t* modes_out, cu_info_t* const pred_cu, uint8_t mip_ctx) { #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future? 
- assert(log2_width >= 2 && log2_width <= 5); - int_fast8_t width = 1 << log2_width; + assert(width >= 4 && width <= 32); // cost_pixel_nxn_func *satd_func = kvz_pixels_get_satd_func(width); // cost_pixel_nxn_func *sad_func = kvz_pixels_get_sad_func(width); - cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width); - cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width); + cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width, height); + cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width, height); bool mode_checked[UVG_NUM_INTRA_MODES] = {0}; double costs[UVG_NUM_INTRA_MODES]; @@ -1044,7 +1016,7 @@ static uint8_t search_intra_rough( uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); // Store original block for SAD computation - uvg_pixels_blit(orig, orig_block, width, width, origstride, width); + uvg_pixels_blit(orig, orig_block, width, height, origstride, width); int8_t modes_selected = 0; // Note: get_cost and get_cost_dual may return negative costs. @@ -1055,13 +1027,14 @@ static uint8_t search_intra_rough( int8_t mode; double cost; }; - + const double not_mrl = state->encoder_control->cfg.mrl && (cu_loc->y % LCU_WIDTH) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 0) : 0; const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1); const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0); const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 0); const double not_planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1); + const double not_isp_flag = state->encoder_control->cfg.isp && uvg_can_use_isp(width, height) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_subpart_model[0]), 0) : 0; const uint8_t mode_list_size = state->encoder_control->cfg.mip ? 6 : 3; struct mode_cost best_six_modes[6]; @@ -1070,17 +1043,16 @@ static uint8_t search_intra_rough( // Calculate SAD for evenly spaced modes to select the starting point for // the recursive search. 
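The comment above refers to the coarse-to-fine scan driven by intra_rough_search_levels: angular modes are first costed at a stride of 2^levels, after which the stride is halved around the best candidate found so far. A self-contained sketch of that control flow under those assumptions (the cost callback is a placeholder; the real code batches PARALLEL_BLKS predictions per call and keeps the six best modes rather than a single winner):

#include <stdint.h>

static int8_t rough_angular_scan(double (*cost_fn)(int8_t mode), int levels)
{
  int offset = 1 << levels;
  int8_t best = 2;
  double best_cost = cost_fn(2);

  /* Pass 1: cost every offset-th angular mode (modes 2..66). */
  for (int m = 2 + offset; m <= 66; m += offset) {
    double c = cost_fn((int8_t)m);
    if (c < best_cost) { best_cost = c; best = (int8_t)m; }
  }

  /* Pass 2: halve the stride and probe both neighbours of the best. */
  for (offset >>= 1; offset > 0; offset >>= 1) {
    for (int sign = -1; sign <= 1; sign += 2) {
      int m = best + sign * offset;
      if (m < 2 || m > 66) continue;
      double c = cost_fn((int8_t)m);
      if (c < best_cost) { best_cost = c; best = (int8_t)m; }
    }
  }
  return best;
}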
- cu_loc_t loc = { 0, 0, width, width, width, width }; intra_search_data_t search_proxy; FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels; search_proxy.pred_cu.intra.mode = 0; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[0], &search_proxy, NULL); search_proxy.pred_cu.intra.mode = 1; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[1], &search_proxy, NULL, UVG_LUMA_T); - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[1], &search_proxy, NULL); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs); mode_checked[0] = true; mode_checked[1] = true; costs[0] += count_bits( @@ -1092,7 +1064,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - 0) * state->lambda_sqrt; + not_isp_flag, 0) * state->lambda_sqrt; costs[1] += count_bits( state, intra_preds, @@ -1102,7 +1074,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - 1) * state->lambda_sqrt; + not_isp_flag, 1) * state->lambda_sqrt; if(costs[0] < costs[1]) { min_cost = costs[0]; max_cost = costs[1]; @@ -1129,12 +1101,12 @@ static uint8_t search_intra_rough( for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { search_proxy.pred_cu.intra.mode = mode + i*offset; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[i], &search_proxy, NULL); } } //TODO: add generic version of get cost multi - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for (int i = 0; i < PARALLEL_BLKS; ++i) { if (mode + i * offset <= 66) { costs_out[i] += count_bits( @@ -1146,7 +1118,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - mode + i * offset) * state->lambda_sqrt; + not_isp_flag, mode + i * offset) * state->lambda_sqrt; } } @@ -1201,12 +1173,12 @@ static uint8_t search_intra_rough( for (int block = 0; block < PARALLEL_BLKS; ++block) { search_proxy.pred_cu.intra.mode = modes_to_check[block + i]; - uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[block], &search_proxy, NULL, UVG_LUMA_T); + uvg_intra_predict(state, refs, cu_loc, cu_loc, COLOR_Y, preds[block], &search_proxy, NULL); } //TODO: add generic version of get cost multi - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for (int block = 0; block < PARALLEL_BLKS; ++block) { costs_out[block] += count_bits( state, @@ -1217,7 +1189,7 @@ static uint8_t search_intra_rough( not_mpm_mode_bit, planar_mode_flag, not_planar_mode_flag, - modes_to_check[block + i]) * state->lambda_sqrt; + not_isp_flag, modes_to_check[block + i]) * state->lambda_sqrt; } @@ -1270,8 +1242,12 @@ static void get_rough_cost_for_2n_modes( #define PARALLEL_BLKS 2 assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_2n_modes"); const int width = cu_loc->width; - cost_pixel_nxn_multi_func* satd_dual_func = uvg_pixels_get_satd_dual_func(width); - 
cost_pixel_nxn_multi_func* sad_dual_func = uvg_pixels_get_sad_dual_func(width); + const int height = cu_loc->height; + cost_pixel_nxn_multi_func* satd_dual_func; + cost_pixel_nxn_multi_func* sad_dual_func; + satd_dual_func = uvg_pixels_get_satd_dual_func(width, height); + sad_dual_func = uvg_pixels_get_sad_dual_func(width, height); + uvg_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT]; pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT); @@ -1279,7 +1255,7 @@ static void get_rough_cost_for_2n_modes( uvg_pixel _orig_block[MIN(LCU_WIDTH, 64) * MIN(LCU_WIDTH, 64) + SIMD_ALIGNMENT]; uvg_pixel* orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - uvg_pixels_blit(orig, orig_block, width, width, orig_stride, width); + uvg_pixels_blit(orig, orig_block, width, height, orig_stride, width); const double mrl = state->encoder_control->cfg.mrl && (cu_loc->y % LCU_WIDTH) ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 1) : 0; const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0; @@ -1288,9 +1264,9 @@ static void get_rough_cost_for_2n_modes( double bits[PARALLEL_BLKS] = { 0 }; for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) { for (int i = 0; i < PARALLEL_BLKS; ++i) { - uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL, UVG_LUMA_T); + uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL); } - get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out); + get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, height, costs_out); for(int i = 0; i < PARALLEL_BLKS; ++i) { uint8_t multi_ref_idx = search_data[mode + i].pred_cu.intra.multi_ref_idx; @@ -1345,28 +1321,58 @@ static void get_rough_cost_for_2n_modes( */ static int8_t search_intra_rdo( encoder_state_t * const state, - int x_px, - int y_px, - int depth, int modes_to_check, intra_search_data_t *search_data, lcu_t *lcu, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc) { - const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra); + const int width = cu_loc->width; + const int height = cu_loc->height; // TODO: height for non-square blocks for (int mode = 0; mode < modes_to_check; mode++) { - double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); - search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; - search_data[mode].bits = rdo_bitcost; - search_data[mode].cost = rdo_bitcost * state->lambda; + bool can_do_isp_search = search_data[mode].pred_cu.intra.mip_flag ? false : true; // Cannot use ISP with MIP + // can_do_isp_search = search_data[mode].pred_cu.intra.multi_ref_idx == 0 ? can_do_isp_search : false; // Cannot use ISP with MRL + const uint8_t mrl_idx = search_data[mode].pred_cu.intra.multi_ref_idx; + double best_isp_cost = MAX_DOUBLE; + double best_bits = MAX_DOUBLE; + int8_t best_isp_mode = 0; + int max_isp_modes = can_do_isp_search && uvg_can_use_isp(width, height) && state->encoder_control->cfg.isp ? 
NUM_ISP_MODES : 1; - double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu, tree_type); - search_data[mode].cost += mode_cost; - if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { - modes_to_check = mode + 1; - break; + // + uint8_t best_mts_mode_for_isp[NUM_ISP_MODES] = {0}; + uint8_t best_lfnst_mode_for_isp[NUM_ISP_MODES] = {0}; + for (int isp_mode = 0; isp_mode < max_isp_modes; ++isp_mode) { + + + search_data[mode].pred_cu.intra.isp_mode = isp_mode; + search_data[mode].pred_cu.intra.multi_ref_idx = isp_mode == ISP_MODE_NO_ISP ? mrl_idx : 0; + double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, cu_loc, lcu); + search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; + search_data[mode].bits = rdo_bitcost; + search_data[mode].cost = rdo_bitcost * state->lambda; + + double mode_cost = search_intra_trdepth(state, cu_loc, MAX_INT, &search_data[mode], lcu, tree_type); + best_mts_mode_for_isp[isp_mode] = search_data[mode].pred_cu.tr_idx; + best_lfnst_mode_for_isp[isp_mode] = search_data[mode].pred_cu.lfnst_idx; + search_data[mode].cost += mode_cost; + if (search_data[mode].cost < best_isp_cost) { + best_isp_cost = search_data[mode].cost; + best_isp_mode = isp_mode; + best_bits = search_data[mode].bits; + } + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf)) { + modes_to_check = mode + 1; + break; + } } + search_data[mode].cost = best_isp_cost; + search_data[mode].bits = best_bits; + search_data[mode].pred_cu.intra.isp_mode = best_isp_mode; + search_data[mode].pred_cu.intra.multi_ref_idx = best_isp_mode == ISP_MODE_NO_ISP ? mrl_idx : 0; + search_data[mode].pred_cu.tr_idx = best_mts_mode_for_isp[best_isp_mode]; + search_data[mode].pred_cu.tr_skip = best_mts_mode_for_isp[best_isp_mode] == MTS_SKIP; + search_data[mode].pred_cu.lfnst_idx = best_lfnst_mode_for_isp[best_isp_mode]; } // Update order according to new costs @@ -1384,7 +1390,9 @@ static int8_t search_intra_rdo( } -double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu) +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t* + const cu_loc, + const lcu_t* lcu) { cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; double mode_bits = 0; @@ -1393,8 +1401,8 @@ double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const c uvg_encode_intra_luma_coding_unit( state, &cabac_copy, cur_cu, - x, y, depth, lcu, &mode_bits - ); + cu_loc, lcu, &mode_bits + ); return mode_bits; } @@ -1436,20 +1444,20 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in int8_t uvg_search_intra_chroma_rdo( encoder_state_t * const state, - int x_px, - int y_px, - int depth, int8_t num_modes, lcu_t *const lcu, + const cu_loc_t* const cu_loc, intra_search_data_t* chroma_data, int8_t luma_mode, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + bool is_separate) { - const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4); - - + const bool reconstruct_chroma = true; + + const int chroma_width = cu_loc->chroma_width; + const int chroma_height = cu_loc->chroma_height; uvg_intra_references refs[2]; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7 }; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y }; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height, @@ -1457,29 
+1465,21 @@ int8_t uvg_search_intra_chroma_rdo( if (reconstruct_chroma) { - int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - uvg_intra_build_reference(log2_width, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0); - uvg_intra_build_reference(log2_width, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0); - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; cabac_data_t temp_cabac; memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); - int8_t width = 1 << log2_width; - int8_t height = 1 << log2_width; - const cu_loc_t loc = { x_px &~7, y_px & ~7, width, height, width, height}; - const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C; + + const int offset = ((cu_loc->local_x) >> 1) + ((cu_loc->local_y) >> 1)* LCU_WIDTH_C; int lfnst_modes_to_check[3]; - if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) { + if((is_separate || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && PU_IS_TU(&chroma_data->pred_cu) && chroma_height >= 4 && chroma_width >= 4) { for (int i = 0; i < 3; ++i) { lfnst_modes_to_check[i] = i; } } - else if(chroma_data->pred_cu.lfnst_idx) { - lfnst_modes_to_check[0] = chroma_data->pred_cu.lfnst_idx; - lfnst_modes_to_check[1] = -1; - lfnst_modes_to_check[2] = -1; - } else { lfnst_modes_to_check[0] = 0; lfnst_modes_to_check[1] = -1; @@ -1490,11 +1490,15 @@ int8_t uvg_search_intra_chroma_rdo( ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C]; ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C]; + double original_c_lambda = state->c_lambda; + state->quant_blocks[2].needs_init = true; + state->rate_estimator[1].needs_init = true; for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) { const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma; double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode); - chroma_data[mode_i].cost = mode_bits * state->lambda; + chroma_data[mode_i].cost = mode_bits * state->c_lambda; + chroma_data[mode_i].bits = mode_bits; cu_info_t* pred_cu = &chroma_data[mode_i].pred_cu; uint8_t best_lfnst_index = 0; for (int lfnst_i = 0; lfnst_i < 3; ++lfnst_i) { @@ -1502,58 +1506,58 @@ int8_t uvg_search_intra_chroma_rdo( if (lfnst == -1) { continue; } + state->c_lambda = original_c_lambda * (state->encoder_control->cfg.jccr && state->qp > 18 ? 
1.3 : 1.0); pred_cu->cr_lfnst_idx = lfnst; - chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->lambda; - if (pred_cu->tr_depth == pred_cu->depth) { + chroma_data[mode_i].lfnst_costs[lfnst] += mode_bits * state->c_lambda; + if (PU_IS_TU(pred_cu) && (tree_type != UVG_CHROMA_T || (pred_cu->log2_chroma_width < 5 && pred_cu->log2_chroma_height < 5))) { uvg_intra_predict( state, &refs[COLOR_U - 1], - &loc, + cu_loc, + cu_loc, COLOR_U, u_pred, &chroma_data[mode_i], - lcu, - tree_type); + lcu); uvg_intra_predict( state, &refs[COLOR_V - 1], - &loc, + cu_loc, + cu_loc, COLOR_V, v_pred, &chroma_data[mode_i], - lcu, - tree_type); + lcu); uvg_generate_residual( &lcu->ref.u[offset], u_pred, u_resi, - width, + chroma_width, + chroma_height, LCU_WIDTH_C, - width); + chroma_width); uvg_generate_residual( &lcu->ref.v[offset], v_pred, v_resi, - width, + chroma_width, + chroma_height, LCU_WIDTH_C, - width); + chroma_width); uvg_chorma_ts_out_t chorma_ts_out; uvg_chroma_transform_search( state, - depth, lcu, &temp_cabac, - width, - height, + cu_loc, offset, - mode, pred_cu, u_pred, v_pred, u_resi, v_resi, &chorma_ts_out, - tree_type); + is_separate ? UVG_CHROMA_T : tree_type); // LFNST constraint failed if(chorma_ts_out.best_u_index == -1 && chorma_ts_out.best_combined_index == -1) { @@ -1561,8 +1565,9 @@ int8_t uvg_search_intra_chroma_rdo( continue; } + double actual_cost = state->lambda * (chorma_ts_out.u_bits + chorma_ts_out.v_bits + mode_bits) + (chorma_ts_out.u_distortion + chorma_ts_out.v_distortion); if(chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost < chorma_ts_out.best_combined_cost) { - chroma_data[mode_i].lfnst_costs[lfnst] += chorma_ts_out.best_u_cost + chorma_ts_out.best_v_cost; + chroma_data[mode_i].lfnst_costs[lfnst] = actual_cost; if( chroma_data[mode_i].lfnst_costs[lfnst] < chroma_data[mode_i].lfnst_costs[best_lfnst_index] || lfnst_i == 0) { chroma_data[mode_i].pred_cu.joint_cb_cr = 0; @@ -1574,7 +1579,7 @@ int8_t uvg_search_intra_chroma_rdo( } } else { - chroma_data[mode_i].lfnst_costs[lfnst] += chorma_ts_out.best_combined_cost; + chroma_data[mode_i].lfnst_costs[lfnst] = actual_cost; if (chroma_data[mode_i].lfnst_costs[lfnst] < chroma_data[mode_i].lfnst_costs[best_lfnst_index] || lfnst_i == 0) { chroma_data[mode_i].pred_cu.joint_cb_cr = chorma_ts_out.best_combined_index; @@ -1583,17 +1588,18 @@ int8_t uvg_search_intra_chroma_rdo( chroma_data[mode_i].cost = chroma_data[mode_i].lfnst_costs[lfnst]; } } + } else { state->search_cabac.update = 1; - chroma_data[mode_i].cost = mode_bits * state->lambda; + chroma_data[mode_i].cost = mode_bits * state->c_lambda; uvg_intra_recon_cu(state, - x_px, y_px, - depth, &chroma_data[mode_i], - pred_cu, - lcu, - tree_type, false, true); - chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + &chroma_data[mode_i], cu_loc, + pred_cu, lcu, + tree_type, + false, + true); + chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc); memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); } } @@ -1602,6 +1608,7 @@ int8_t uvg_search_intra_chroma_rdo( } sort_modes(chroma_data, num_modes); + state->c_lambda = original_c_lambda; return chroma_data[0].pred_cu.intra.mode_chroma; } @@ -1612,26 +1619,25 @@ int8_t uvg_search_intra_chroma_rdo( int8_t uvg_search_cu_intra_chroma( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, - enum uvg_tree_type tree_type) + int8_t 
luma_mode, + enum uvg_tree_type tree_type, + bool is_separate) { - const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) }; const cu_info_t *cur_pu = &search_data->pred_cu; - int8_t intra_mode = !cur_pu->intra.mip_flag ? cur_pu->intra.mode : 0; - int8_t modes[8] = { 0, 50, 18, 1, intra_mode, 81, 82, 83 }; - uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5); + int8_t modes[8] = { 0, 50, 18, 1, luma_mode, 81, 82, 83 }; + uint8_t total_modes = (state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_pu, tree_type) ? 8 : 5); for(int i = 0; i < 4; i++) { - if (modes[i] == intra_mode) { + if (modes[i] == luma_mode) { modes[i] = 66; break; } } + // The number of modes to select for slower chroma search. Luma mode // is always one of the modes, so 2 means the final decision is made @@ -1648,9 +1654,9 @@ int8_t uvg_search_cu_intra_chroma( FILL(chroma_data, 0); for (int i = 0; i < num_modes; i++) { chroma_data[i].pred_cu = *cur_pu; - chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? intra_mode : modes[i]; + chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? luma_mode : modes[i]; chroma_data[i].cost = 0; - if(depth != 4 && tree_type == UVG_BOTH_T) { + if(!is_separate && tree_type == UVG_BOTH_T) { memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3); } } @@ -1659,34 +1665,15 @@ int8_t uvg_search_cu_intra_chroma( // num_modes is 0. if(state->encoder_control->cfg.cclm && 0){ - const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2); - const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; - const vector2d_t luma_px = { x_px & ~7, y_px & ~7}; + - uvg_intra_references refs_u; - uvg_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs_u, state->encoder_control->cfg.wpp, NULL, 0); - - uvg_intra_references refs_v; - uvg_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0); - - vector2d_t lcu_cpx = { (lcu_px.x & ~7) / 2, (lcu_px.y & ~7) / 2 }; - uvg_pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; - uvg_pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; - - num_modes = search_intra_chroma_rough(state, x_px, y_px, depth, - ref_u, - ref_v, - LCU_WIDTH_C, - &refs_u, - &refs_v, - chroma_data, - lcu, - intra_mode, - tree_type); + num_modes = search_intra_chroma_rough(state, chroma_data, lcu, luma_mode, + tree_type, + cu_loc); } if (num_modes > 1 || state->encoder_control->cfg.jccr) { - uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode, tree_type); + uvg_search_intra_chroma_rdo(state, num_modes, lcu, cu_loc, chroma_data, luma_mode, tree_type, is_separate); } else if(cur_pu->lfnst_idx) { chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx; @@ -1782,19 +1769,15 @@ static int select_candidates_for_further_search(const encoder_state_t * const st */ void uvg_search_cu_intra( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t *search_data, - enum uvg_tree_type tree_type) + int8_t
y_px }; + const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y }; + const int8_t log2_width = uvg_g_convert_to_log2[cu_loc->width]; + const int8_t log2_height = uvg_g_convert_to_log2[cu_loc->height]; + const vector2d_t luma_px = { cu_loc->x, cu_loc->y}; const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height }; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); @@ -1810,25 +1793,22 @@ void uvg_search_cu_intra( // Select left and top CUs if they are available. // Top CU is not available across LCU boundary. - if (x_px >= SCU_WIDTH) { - left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y+ cu_width-1); + if (cu_loc->x >= SCU_WIDTH) { + left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y+ cu_loc->height-1); } - if (y_px >= SCU_WIDTH && lcu_px.y > 0) { - above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x+ cu_width-1, lcu_px.y - 1); + if (cu_loc->y >= SCU_WIDTH && lcu_px.y > 0) { + above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x+ cu_loc->width-1, lcu_px.y - 1); } - int8_t num_cand = uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu); + int8_t num_cand = uvg_intra_get_dir_luma_predictor(cu_loc->x, cu_loc->y, candidate_modes, cur_cu, left_cu, above_cu); - if (depth > 0) { - uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0); + bool is_large = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH; + if (!is_large) { + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0, 0); } - - // The maximum number of possible MIP modes depend on block size & shape - int width = LCU_WIDTH >> depth; - int height = width; // TODO: proper height for non-square blocks. - + // This is needed for bit cost calculation and requires too many parameters to be // calculated inside the rough search functions - uint8_t mip_ctx = uvg_get_mip_flag_context(x_px, y_px, cu_width, cu_width, lcu, NULL); + uint8_t mip_ctx = uvg_get_mip_flag_context(cu_loc, lcu, NULL); // Find best intra mode for 2Nx2N. uvg_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; @@ -1839,24 +1819,25 @@ void uvg_search_cu_intra( temp_pred_cu.type = CU_INTRA; FILL(temp_pred_cu.intra, 0); // Find modes with multiple reference lines if in use. Do not use if CU in first row. - uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1; + uint8_t lines = state->encoder_control->cfg.mrl && lcu_px.y != 0 ?
MAX_REF_LINE_IDX : 1; uint8_t number_of_modes; uint8_t num_regular_modes; - bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4); + bool skip_rough_search = (is_large || state->encoder_control->cfg.rdo >= 4); if (!skip_rough_search) { num_regular_modes = number_of_modes = search_intra_rough( - state, - &cu_loc, - ref_pixels, - LCU_WIDTH, - refs, - log2_width, - candidate_modes, - search_data, - &temp_pred_cu, - mip_ctx); - // if(lines == 1) sort_modes(search_data, number_of_modes); + state, + cu_loc, + ref_pixels, + LCU_WIDTH, + refs, + cu_loc->width, + cu_loc->height, + candidate_modes, + search_data, + &temp_pred_cu, + mip_ctx); + // if(lines == 1) sort_modes(search_data, number_of_modes); } else { for (int8_t i = 0; i < UVG_NUM_INTRA_MODES; i++) { @@ -1870,7 +1851,7 @@ void uvg_search_cu_intra( } uint8_t num_mrl_modes = 0; - for(int line = 1; line < lines; ++line) { + for(int line = 1; line < lines && !is_large; ++line) { uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 }; if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0) { @@ -1878,7 +1859,7 @@ void uvg_search_cu_intra( // Copy extra ref lines, including ref line 1 and top left corner. for (int i = 0; i < MAX_REF_LINE_IDX; ++i) { - int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX; + int height = (cu_loc->height) * 2 + MAX_REF_LINE_IDX; height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist. height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX); uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)], @@ -1887,7 +1868,7 @@ void uvg_search_cu_intra( frame->rec->stride, 1); } } - uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line); + uvg_intra_build_reference(state, cu_loc, cu_loc, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line, 0); for(int i = 1; i < INTRA_MPM_COUNT; i++) { num_mrl_modes++; const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes; @@ -1899,7 +1880,7 @@ void uvg_search_cu_intra( } } if (!skip_rough_search && lines != 1) { - get_rough_cost_for_2n_modes(state, refs, &cu_loc, + get_rough_cost_for_2n_modes(state, refs, cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mrl_modes, mip_ctx); @@ -1912,11 +1893,11 @@ void uvg_search_cu_intra( int num_mip_modes = 0; if (state->encoder_control->cfg.mip) { // MIP is not allowed for 64 x 4 or 4 x 64 blocks - if (!((width == 64 && height == 4) || (width == 4 && height == 64))) { - num_mip_modes = NUM_MIP_MODES_FULL(width, height); + if (!((cu_loc->height == 64 && cu_loc->width== 4) || (cu_loc->height== 4 && cu_loc->width == 64))) { + num_mip_modes = NUM_MIP_MODES_FULL(cu_loc->width, cu_loc->height); for (int transpose = 0; transpose < 2; transpose++) { - const int half_mip_modes = NUM_MIP_MODES_HALF(width, height); + const int half_mip_modes = num_mip_modes / 2; for (int i = 0; i < half_mip_modes; ++i) { const int index = i + number_of_modes + transpose * half_mip_modes; search_data[index].pred_cu = temp_pred_cu; @@ -1928,7 +1909,7 @@ void uvg_search_cu_intra( } } if (!skip_rough_search) { - get_rough_cost_for_2n_modes(state, refs, &cu_loc, + get_rough_cost_for_2n_modes(state, refs, cu_loc, ref_pixels, LCU_WIDTH, search_data + number_of_modes, num_mip_modes, mip_ctx); @@ -1937,9 +1918,6 @@ void uvg_search_cu_intra( 
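On the MIP hunk just above: NUM_MIP_MODES_FULL() and the transpose halving follow VVC's three matrix-intra-prediction size classes, where every matrix can also be applied transposed. A sketch of the count rule as assumed here (the real macro is defined elsewhere in the tree; this helper is illustrative only):

/* VVC MIP size classes and mode counts, transpose included:
 *   4x4             -> 16 matrices -> 32 modes
 *   4x8, 8x4, 8x8   ->  8 matrices -> 16 modes
 *   all other sizes ->  6 matrices -> 12 modes
 * 64x4 and 4x64 are excluded outright, as the code above checks. */
static int mip_modes_full(int width, int height)
{
  if (width == 4 && height == 4) return 32;
  if (width == 4 || height == 4 || (width == 8 && height == 8)) return 16;
  return 12;
}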
number_of_modes += num_mip_modes; } - - // Set transform depth to current depth, meaning no transform splits. - uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth, tree_type); // Refine results with slower search or get some results if rough search was skipped. const int32_t rdo_level = state->encoder_control->cfg.rdo; if (rdo_level >= 2 || skip_rough_search) { @@ -1956,7 +1934,7 @@ void uvg_search_cu_intra( {2, 3, 3, 3, 3, 2}, // 64x4, 64x8, 64x16, 64x32, 64x64, 64x128, {2, 2, 2, 2, 2, 3}, // 128x4, 128x8, 128x16, 128x32, 128x64, 128x128, }; - number_of_modes_to_search = g_aucIntraModeNumFast_UseMPM_2D[7- depth - 3][7 - depth - 3]; + number_of_modes_to_search = g_aucIntraModeNumFast_UseMPM_2D[log2_width - 2][log2_height - 2]; } else { // Check only the predicted modes. number_of_modes_to_search = 0; @@ -1968,8 +1946,8 @@ void uvg_search_cu_intra( search_data, num_regular_modes, num_mip_modes, - width, - height + cu_loc->width, + cu_loc->height ); } } @@ -1991,16 +1969,16 @@ void uvg_search_cu_intra( number_of_modes_to_search++; } } - + + state->quant_blocks[0].needs_init = 1; + state->rate_estimator[0].needs_init = 1; search_intra_rdo( state, - x_px, - y_px, - depth, number_of_modes_to_search, search_data, lcu, - tree_type); + tree_type, + cu_loc); search_data[0].pred_cu.mts_last_scan_pos = false; search_data[0].pred_cu.violates_mts_coeff_constraint = false; } diff --git a/src/search_intra.h b/src/search_intra.h index 36470e63..ebcec26e 100644 --- a/src/search_intra.h +++ b/src/search_intra.h @@ -43,27 +43,27 @@ #include "global.h" // IWYU pragma: keep #include "intra.h" -double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu); +double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, const cu_loc_t* + const cu_loc, + const lcu_t* lcu); double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode); int8_t uvg_search_cu_intra_chroma( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, + const cu_loc_t* const cu_loc, lcu_t *lcu, intra_search_data_t* best_cclm, - enum uvg_tree_type tree_type); + int8_t luma_mode, + enum uvg_tree_type tree_type, + bool is_separate); void uvg_search_cu_intra( encoder_state_t * const state, - const int x_px, - const int y_px, - const int depth, intra_search_data_t* search_data, lcu_t *lcu, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + const cu_loc_t* const cu_loc); #endif // SEARCH_INTRA_H_ diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index b695273b..081b1b25 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -52,10 +52,17 @@ extern const int16_t uvg_g_dct_8_t[8][8]; extern const int16_t uvg_g_dct_16_t[16][16]; extern const int16_t uvg_g_dct_32_t[32][32]; -#if COMPILE_INTEL_AVX2 +#define COMPILE_INTEL_AVX2 1 + +#if COMPILE_INTEL_AVX2 #include "uvg266.h" #if UVG_BIT_DEPTH == 8 #include +#include "strategies/avx2/dct_avx2_tables.h" +#define MAX_LOG2_TR_DYNAMIC_RANGE 15 +#define TRANSFORM_MATRIX_SHIFT 6 +#define INVERSE_SHIFT_1ST (TRANSFORM_MATRIX_SHIFT + 1) +#define INVERSE_SHIFT_2ND (TRANSFORM_MATRIX_SHIFT + MAX_LOG2_TR_DYNAMIC_RANGE - 1 - UVG_BIT_DEPTH) /* * \file @@ -73,6 +80,583 @@ static INLINE __m256i truncate_avx2(__m256i v, __m256i debias, int32_t shift) return _mm256_srai_epi32(truncable, shift); } + +// TODO: find avx2 solution for transpose +// TODO: attempt to make a generic transpose for avx2. 
Needs some extra logic for different widths and heights. +// TODO: make a few solutions for exact sizes and see if some pattern emerges... +static void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const int height) { + const int sample_num = width * height; + const int vectors = sample_num / 16; + + int16_t* d_ptr = dst; + if (vectors == 0) { + return; + } + else if (vectors == 1) { + + } + else { + // Reserve enough storage for max transform size 32x32 + __m256i v_16b_result[64]; + __m256i v_32b_result[64]; + __m256i v_64b_result[64]; + __m256i v_128b_result[64]; + + // Handle two source vectors at a time + for (int i = 0; i < vectors; i += 2) { + __m256i v_src_0 = _mm256_load_si256((const __m256i*)src); + __m256i v_src_1 = _mm256_load_si256((const __m256i*)(src + 16)); + + v_16b_result[i] = _mm256_unpacklo_epi16(v_src_0, v_src_1); + v_16b_result[i + 1] = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + src += 32; + } + + // 32 bit shuffle pass + int loop_idx = 0; + for (int i = 0; i < vectors; i += 2) { + const int idx_a = loop_idx; + const int idx_b = loop_idx + 2; + + v_32b_result[i] = _mm256_unpacklo_epi32(v_16b_result[idx_a], v_16b_result[idx_b]); + v_32b_result[i + 1] = _mm256_unpackhi_epi32(v_16b_result[idx_a], v_16b_result[idx_b]); + loop_idx++; + } + + // 64 bit shuffle pass + loop_idx = 0; + for (int i = 0; i < vectors; i += 2) { + const int idx_a = loop_idx; + const int idx_b = loop_idx + 4; + + v_64b_result[i] = _mm256_unpacklo_epi32(v_32b_result[idx_a], v_32b_result[idx_b]); + v_64b_result[i + 1] = _mm256_unpackhi_epi32(v_32b_result[idx_a], v_32b_result[idx_b]); + loop_idx++; + } + + // Final 128 bit shuffle pass + for (int i = 0; i < vectors; i += 2) { + const int idx_a = 0; + const int idx_b = 0; + + v_128b_result[i] = _mm256_unpacklo_epi32(v_64b_result[idx_a], v_64b_result[idx_b]); + v_128b_result[i + 1] = _mm256_unpackhi_epi32(v_64b_result[idx_a], v_64b_result[idx_b]); + } + + // Store loop + for (int i = 0; i < vectors; ++i) { + _mm256_store_si256((__m256i*)dst, v_128b_result[i]); + dst += 16; + } + } +} + +static void transpose_generic(const int16_t* src, int16_t* dst, const int width, const int height) +{ + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dst[x * height + y] = src[y * width + x]; + } + } +} + + +typedef void (transpose_func)(const __m256i* src, __m256i* dst); + + +static void transpose_2x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x32_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[3] = 
_mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_2x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x16_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + + // const __m256i v_shuffle = _mm256_set_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + // 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + + __m256i v_src_tmp[4]; + v_src_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_src_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_src_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_src_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_4x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp[8]; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_shuffle_epi8(src[i], v_shuffle); + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[i] = _mm256_shuffle_epi32(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp[6], v_tmp[7]); + + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp[6], v_tmp[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + dst[5] = 
_mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); +} +static void transpose_4x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x16_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + __m256i v_tmp128[8]; + + v_tmp128[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_tmp128[1] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_tmp128[2] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_tmp128[3] = _mm256_permute2x128_si256(src[1], src[5], 0x31); + v_tmp128[4] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_tmp128[5] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + v_tmp128[6] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_tmp128[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(v_tmp128[6], v_tmp128[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(v_tmp128[6], v_tmp128[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + dst[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[1] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[3] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[4] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[5] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[6] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + dst[7] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); +} +static void transpose_8x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + + const __m256i v_shuffle = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + for (int i = 0; i < 8; ++i) { + const int offset = i * 2; + v_tmp16_lo[i] = _mm256_unpacklo_epi16(src[offset], src[offset + 1]); + v_tmp16_hi[i] = _mm256_unpackhi_epi16(src[offset], src[offset + 1]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp32_lo[i + 0] = 
_mm256_unpacklo_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_lo[i + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_lo[i + 2] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_lo[i + 3] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + + v_tmp32_hi[i + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_hi[i + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_hi[i + 2] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_hi[i + 3] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + } + + for (int i = 0; i < 8; ++i) { + v_tmp64_lo[i] = _mm256_permute4x64_epi64(v_tmp64_lo[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp64_hi[i] = _mm256_permute4x64_epi64(v_tmp64_hi[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + dst[0] = _mm256_shuffle_epi8(v_tmp64_lo[0], v_shuffle); + dst[1] = _mm256_shuffle_epi8(v_tmp64_lo[4], v_shuffle); + dst[2] = _mm256_shuffle_epi8(v_tmp64_hi[0], v_shuffle); + dst[3] = _mm256_shuffle_epi8(v_tmp64_hi[4], v_shuffle); + + dst[4] = _mm256_shuffle_epi8(v_tmp64_lo[2], v_shuffle); + dst[5] = _mm256_shuffle_epi8(v_tmp64_lo[6], v_shuffle); + dst[6] = _mm256_shuffle_epi8(v_tmp64_hi[2], v_shuffle); + dst[7] = _mm256_shuffle_epi8(v_tmp64_hi[6], v_shuffle); + + dst[8] = _mm256_shuffle_epi8(v_tmp64_lo[1], v_shuffle); + dst[9] = _mm256_shuffle_epi8(v_tmp64_lo[5], v_shuffle); + dst[10] = _mm256_shuffle_epi8(v_tmp64_hi[1], v_shuffle); + dst[11] = _mm256_shuffle_epi8(v_tmp64_hi[5], v_shuffle); + + dst[12] = _mm256_shuffle_epi8(v_tmp64_lo[3], v_shuffle); + dst[13] = _mm256_shuffle_epi8(v_tmp64_lo[7], v_shuffle); + dst[14] = _mm256_shuffle_epi8(v_tmp64_hi[3], v_shuffle); + dst[15] = _mm256_shuffle_epi8(v_tmp64_hi[7], v_shuffle); +} +static void transpose_8x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + __m256i v_tmp32_lo[2]; + __m256i v_tmp32_hi[2]; + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 
0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); +} +static void transpose_16x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[6], src[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[6], src[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); +} + +static void transpose_16x16_avx2_stride(const int16_t* src, int16_t* dst, const int src_stride, const int dst_stride) { + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 2) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 8; d += 4, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 
0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + + v_tmp64_lo[d + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + v_tmp64_hi[d + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + _mm256_storeu_si256((__m256i*)(dst + 0 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 1 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 4 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 5 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 6 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 7 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x20)); + + _mm256_storeu_si256((__m256i*)(dst + 8 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 9 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 10 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 11 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 12 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 13 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 14 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 15 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x31)); +} + +static void transpose_16x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 16); +} + +static void transpose_16x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 16, (int16_t*)dst + 16, 16, 32); + +} +static void transpose_16x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_32x2_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo0 = _mm256_unpacklo_epi16(src[0], src[2]); + __m256i v_tmp16_lo1 = _mm256_unpacklo_epi16(src[1], src[3]); + __m256i v_tmp16_hi0 = _mm256_unpackhi_epi16(src[0], src[2]); + __m256i v_tmp16_hi1 = _mm256_unpackhi_epi16(src[1], src[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x20); + dst[3] = 
_mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x31); +} +static void transpose_32x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[2]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[1], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[6]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[5], src[7]); + + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[2]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[1], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[6]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[5], src[7]); + + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x31); + + dst[4] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[5] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x20); + dst[6] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x31); +} +static void transpose_32x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_tmp16_lo[d + 0] = _mm256_unpacklo_epi16(src[s + 0], src[s + 2]); + v_tmp16_lo[d + 1] = _mm256_unpacklo_epi16(src[s + 1], src[s + 3]); + + v_tmp16_hi[d + 0] = _mm256_unpackhi_epi16(src[s + 0], src[s + 2]); + v_tmp16_hi[d + 1] = _mm256_unpackhi_epi16(src[s + 1], src[s + 3]); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_lo[d + 4] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_lo[d + 5] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_hi[d + 4] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_hi[d + 5] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_lo[d + 4] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 5] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + + v_tmp64_hi[d + 0] = 
_mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 4] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 5] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x31); + + dst[8] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[9] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x20); + dst[10] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[11] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x20); + + dst[12] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[13] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x31); + dst[14] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); + dst[15] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x31); +} +static void transpose_32x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 16); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 16, 32, 16); +} +static void transpose_32x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 32, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32, (int16_t *)dst + 16, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32 + 16, (int16_t *)dst + 16 * 32 + 16, 32, 32); +} +static void transpose_32x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x32_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x64_avx2(const __m256i* src, __m256i* dst){} + + + +static transpose_func* transpose_func_table[6][6] = { + { transpose_2x2_avx2, transpose_4x2_avx2, transpose_8x2_avx2, transpose_16x2_avx2, transpose_32x2_avx2, transpose_64x2_avx2}, + { transpose_2x4_avx2, transpose_4x4_avx2, transpose_8x4_avx2, transpose_16x4_avx2, transpose_32x4_avx2, transpose_64x4_avx2}, + { transpose_2x8_avx2, transpose_4x8_avx2, transpose_8x8_avx2, transpose_16x8_avx2, transpose_32x8_avx2, transpose_64x8_avx2}, + {transpose_2x16_avx2, transpose_4x16_avx2, transpose_8x16_avx2, transpose_16x16_avx2, transpose_32x16_avx2, transpose_64x16_avx2}, + {transpose_2x32_avx2, transpose_4x32_avx2, transpose_8x32_avx2, transpose_16x32_avx2, transpose_32x32_avx2, transpose_64x32_avx2}, + {transpose_2x64_avx2, transpose_4x64_avx2, transpose_8x64_avx2, transpose_16x64_avx2, transpose_32x64_avx2, transpose_64x64_avx2}, +}; + + +// Dispatcher 
function for avx2 transposes. This calls the proper subfunction +static void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height) +{ + // No need to transpose something of width or height 1 + const int w_log2_minus1 = uvg_g_convert_to_log2[width] - 1; + const int h_log2_minus1 = uvg_g_convert_to_log2[height] - 1; + + transpose_func* func = transpose_func_table[h_log2_minus1][w_log2_minus1]; + func(src, dst); +} + + // 4x4 matrix multiplication with value clipping. // Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses, // destination for the result and the shift value for clipping. @@ -945,12 +1529,6 @@ ITRANSFORM(dct, 32); /*****************************************************/ // DST-7 -#define DEFINE_DST7_P4_MATRIX(a,b,c,d) { \ - { a, b, c, d},\ - { c, c, 0, -c},\ - { d, -a, -c, b},\ - { b, -d, c, -a},\ -} #define DEFINE_DST7_P4_MATRIX_T(a,b,c,d) { \ { a, c, d, b},\ @@ -959,17 +1537,6 @@ ITRANSFORM(dct, 32); { d, -c, b, -a},\ } -#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{\ - { a, b, c, d, e, f, g, h},\ - { c, f, h, e, b, -a, -d, -g},\ - { e, g, b, -c, -h, -d, a, f},\ - { g, c, -d, -f, a, h, b, -e},\ - { h, -a, -g, b, f, -c, -e, d},\ - { f, -e, -a, g, -d, -b, h, -c},\ - { d, -h, e, -a, -c, g, -f, b},\ - { b, -d, f, -h, g, -e, c, -a},\ -} #define DEFINE_DST7_P8_MATRIX_T(a,b,c,d,e,f,g,h) \ {\ @@ -983,25 +1550,6 @@ ITRANSFORM(dct, 32); { h, -g, f, -e, d, -c, b, -a,},\ }\ -#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o}, \ - { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n}, \ - { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m}, \ - { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l}, \ - { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k}, \ - { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j}, \ - { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i}, \ - { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h}, \ - { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g}, \ - { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f}, \ - { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e}, \ - { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d}, \ - { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c}, \ - { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b}, \ - { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a}, \ -} #define DEFINE_DST7_P16_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ { \ @@ -1024,43 +1572,6 @@ ITRANSFORM(dct, 32); } - -#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E}, \ - {e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D}, \ - {g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C}, \ - {i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B}, \ - {k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A}, \ - {m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z}, \ - {o, D, t, e, 
-j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y}, \ - {q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x}, \ - {s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w}, \ - {u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v}, \ - {w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u}, \ - {y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t}, \ - {A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s}, \ - {C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r}, \ - {E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q}, \ - {F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p}, \ - {D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o}, \ - {B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n}, \ - {z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m}, \ - {x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l}, \ - {v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k}, \ - {t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j}, \ - {r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i}, \ - {p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h}, \ - {n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g}, \ - {l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f}, \ - {j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e}, \ - {h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d}, \ - {f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c}, \ - {d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b}, \ - {b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a}, \ -} - #define DEFINE_DST7_P32_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ { \ {a, c, e, g, i, k, m, o, q, s, u, w, y, A, C, E, F, D, B, z, x, v, t, r, p, n, l, j, h, f, d, b,},\ @@ -1097,85 +1608,6 @@ ITRANSFORM(dct, 32); {F, -E, D, -C, B, -A, z, -y, x, -w, v, -u, t, -s, r, -q, p, -o, n, -m, l, -k, j, -i, h, -g, f, -e, d, -c, b, -a,},\ } -// DCT-8 -#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ -{ \ - {a, b, c, d}, \ - {b, 0, -b, -b}, \ - {c, -b, -d, a}, \ - {d, -b, a, -c}, \ -} - -#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{ \ - {a, b, c, d, e, f, g, h}, \ - {b, e, h, -g, -d, -a, -c, -f}, 
\ - {c, h, -e, -a, -f, g, b, d}, \ - {d, -g, -a, -h, c, e, -f, -b}, \ - {e, -d, -f, c, g, -b, -h, a}, \ - {f, -a, g, e, -b, h, d, -c}, \ - {g, -c, b, -f, -h, d, -a, e}, \ - {h, -f, d, -b, a, -c, e, -g}, \ -} - -#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - {b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n}, \ - {c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l}, \ - {d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j}, \ - {e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h}, \ - {f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f}, \ - {g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d}, \ - {h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b}, \ - {i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a}, \ - {j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c}, \ - {k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e}, \ - {l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g}, \ - {m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i}, \ - {n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k}, \ - {o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m}, \ - {p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o}, \ -} - - -#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D}, \ - {c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B}, \ - {d, k, r, y, F, -A, -t, -m, -f, -b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z}, \ - {e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x}, \ - {f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v}, \ - {g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t}, \ - {h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r}, \ - {i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p}, \ - {j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n}, \ - {k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l}, \ - {l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j}, \ - {m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h}, \ - {n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f}, \ - {o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d}, \ - {p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b}, \ - {q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a}, \ - {r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c}, \ - {s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, 
-g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e}, \ - {t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g}, \ - {u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i}, \ - {v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k}, \ - {w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m}, \ - {x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o}, \ - {y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q}, \ - {z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s}, \ - {A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u}, \ - {B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w}, \ - {C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y}, \ - {D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A}, \ - {E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C}, \ - {F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E}, \ -} - - // DST-7 ALIGNED(64) const int16_t uvg_g_dst7_4[4][4] = DEFINE_DST7_P4_MATRIX(29, 55, 74, 84); ALIGNED(64) const int16_t uvg_g_dst7_8[8][8] = DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86); @@ -1309,22 +1741,7 @@ static void mts_dct_16x16_avx2(const int16_t* input, int16_t* output, tr_type_t const int skip_line = lfnst_idx ? 8 : 0; const int skip_line2 = lfnst_idx ? 8 : 0; - if (skip_line) - { - const int reduced_line = 8, cutoff = 8; - int16_t* dst2 = output + reduced_line; - for (int j = 0; j < cutoff; j++) - { - memset(dst2, 0, sizeof(int16_t) * skip_line); - dst2 += 16; - } - } - if (skip_line2) - { - int16_t* dst2 = output + 16 * 8; - memset(dst2, 0, sizeof(int16_t) * 16 * skip_line2); - } } /**********/ @@ -1512,21 +1929,7 @@ static void mul_clip_matrix_32x32_mts_avx2(const int16_t* left, _mm256_store_si256(dst_v + dst_base + 1, h23); } - if (skip_line) - { - int16_t* dst2 = dst + reduced_line; - for (j = 0; j < cutoff; j++) - { - memset(dst2, 0, sizeof(int16_t) * skip_line); - dst2 += 32; - } - } - if (skip_line2) - { - int16_t* dst2 = dst + 32 * cutoff; - memset(dst2, 0, sizeof(int16_t) * 32 * skip_line2); - } } static void mts_dct_32x32_avx2(const int16_t* input, int16_t* output, tr_type_t type_hor, tr_type_t type_ver, uint8_t bitdepth, uint8_t lfnst_idx) @@ -1576,41 +1979,6017 @@ static tr_func* idct_table[5] = { mts_idct_4x4_avx2, mts_idct_8x8_avx2, mts_idct_16x16_avx2, mts_idct_32x32_avx2, NULL/*fastInverseDCT2_B64*/ }; +typedef void (dct_full_pass)(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver); + + +// ********************************************** +// New tailored functions for each size combination +// ********************************************** + +static void fast_forward_tr_2xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
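(1 << (shift - 1)) : 0; // ISP_TODO: drop the (shift > 0) check if shift is always greater than zero
+  // What the two madds below compute, as a hedged scalar sketch for one line j,
+  // assuming the 2-point DCT2 basis {{a, a}, {a, -a}} (a == 64 in VVC) and
+  // ignoring the packed register layout:
+  //
+  //   even = a * (src[2 * j] + src[2 * j + 1]);  // first  output row
+  //   odd  = a * (src[2 * j] - src[2 * j + 1]);  // second output row
+  //   dst[0 * line + j] = (even + add) >> shift;
+  //   dst[1 * line + j] = (odd  + add) >> shift;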
+  const __m256i debias = _mm256_set1_epi32(add);
+  __m256i v_coeff_0 = _mm256_load_si256((__m256i*)coeff);
+  __m256i v_coeff_1 = _mm256_load_si256((__m256i*)(coeff + 16));
+  __m256i* v_dst_ptr = dst;
+
+  const int reduced_line = line - skip_line;
+  // Handle 8 lines at a time (16 samples, 2 samples per line)
+  for (int j = 0; j < reduced_line; j += 8) {
+    // src vector: [00 01 02 03 04 05 06 07|08 09 10 11 12 13 14 15]
+    __m256i v_src = _mm256_load_si256((const __m256i*) src);
+
+    // Multiply with a and add together all adjacent elements
+    // even vector: [a00+a01 a02+a03 a04+a05 a06+a07|a08+a09 a10+a11 a12+a13 a14+a15]
+    __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0);
+    // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15]
+    __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1);
+
+    __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift);
+    __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift);
+
+    v_dst_ptr[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+
+    src += 16;
+    v_dst_ptr++;
+  }
+}
+
+static void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+{
+  const int width = 2;
+  const int height = 8;
+
+  const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+  const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+  const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8;
+  const int32_t shift_2nd = log2_height_minus1 + 7;
+
+  const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
+  const int16_t* ver_coeff = ff_dct2_2x8_coeff_ver;
+  if (ver == DST7) {
+    ver_coeff = ff_dst7_2x8_coeff_ver;
+  }
+  // No DCT8 coeffs for the vertical pass and no DST7/DCT8 coeffs for the
+  // 2-wide horizontal pass, since those transforms are not defined for this block size
+
+  __m256i v_hor_pass_out;
+  fast_forward_tr_2xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0);
+
+  // Vertical pass
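+  const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: drop the (shift_2nd > 0) check if the shift is always greater than zero
+  // A hedged scalar view of truncate_avx2 below: per 32-bit element it is the
+  // plain rounding right shift
+  //
+  //   out = (in + add) >> shift_2nd;
+  //
+  // and the clamp to int16_t range comes later from _mm256_packs_epi32.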
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)ver_coeff;
+
+  // Data fits into a single vector; reorder it for the vertical pass
+  const __m256i v_src_raw = v_hor_pass_out;
+  __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd[8];
+  for (int i = 0; i < 8; ++i) {
+    v_madd[i] = _mm256_madd_epi16(v_src, v_coeff[i]);
+  }
+  __m256i v_hadd_0[4];
+  for (int i = 0; i < 4; ++i) {
+    const int offset = i * 2;
+    v_hadd_0[i] = _mm256_hadd_epi32(v_madd[offset], v_madd[offset + 1]);
+  }
+
+  __m256i v_trunc[2];
+  for (int i = 0; i < 2; ++i) {
+    const int offset = i * 2;
+    v_trunc[i] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[offset], v_hadd_0[offset + 1]), debias, shift_2nd);
+  }
+
+  __m256i v_result = _mm256_packs_epi32(v_trunc[0], v_trunc[1]);
+  const __m256i v_res_shfl = _mm256_load_si256((const __m256i*)ff_dct2_2x8_result_shuffle_ver);
+  // Shuffle values to correct order
+  v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  v_result = _mm256_shuffle_epi32(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  v_result = _mm256_shuffle_epi8(v_result, v_res_shfl);
+  _mm256_store_si256((__m256i*)dst, v_result);
+}
+
+
+static void fast_inverse_tr_2x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_hor);
+
+  const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src);
+
+  __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle);
+  v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]);
+  __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]);
+  __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]);
+  __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]);
+  __m256i v_madd_4 = _mm256_madd_epi16(v_src, v_coeff[4]);
+  __m256i v_madd_5 = _mm256_madd_epi16(v_src, v_coeff[5]);
+  __m256i v_madd_6 = _mm256_madd_epi16(v_src, v_coeff[6]);
+  __m256i v_madd_7 = _mm256_madd_epi16(v_src, v_coeff[7]);
+
+  __m256i v_hadd_00 = _mm256_hadd_epi32(v_madd_0, v_madd_1);
+  __m256i v_hadd_01 = _mm256_hadd_epi32(v_madd_2, v_madd_3);
+  __m256i v_hadd_02 = _mm256_hadd_epi32(v_madd_4, v_madd_5);
+  __m256i v_hadd_03 = _mm256_hadd_epi32(v_madd_6, v_madd_7);
+
+  __m256i v_hadd_10 = _mm256_hadd_epi32(v_hadd_00, v_hadd_01);
+  __m256i v_hadd_11 = _mm256_hadd_epi32(v_hadd_02, v_hadd_03);
+
+  __m256i v_trunc_0 = truncate_avx2(v_hadd_10, debias, shift);
+  __m256i v_trunc_1 = truncate_avx2(v_hadd_11, debias, shift);
+
+  dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+}
+
+static void fast_inverse_tr_2x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]);
+  const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]);
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_ver);
+  const 
__m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_res_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +static void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_8x2_coeff_hor; + } + const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out; + fast_inverse_tr_2x8_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x8_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = &uvg_g_dct_16[0][0]; + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_16[0][0]; + } + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_hor_pass_out[2]; + fast_forward_tr_2xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
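+  const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: drop the (shift_2nd > 0) check if the shift is always greater than zero
+  // Conceptually this pass multiplies each of the two 16-sample columns with
+  // the 16x16 coefficient matrix. A scalar sketch that ignores the vectorized
+  // register layout (hor_out stands for the horizontal-pass result in raster
+  // order; ver_coeff is row-major 16x16):
+  //
+  //   for (int k = 0; k < 16; ++k) {
+  //     for (int x = 0; x < 2; ++x) {
+  //       int32_t acc = 0;
+  //       for (int y = 0; y < 16; ++y) {
+  //         acc += ver_coeff[k * 16 + y] * hor_out[y * 2 + x];
+  //       }
+  //       dst[k * 2 + x] = (int16_t)((acc + add) >> shift_2nd);
+  //     }
+  //   }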
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  // Permute hor pass output to correct order
+  __m256i v_tmp_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0));
+  __m256i v_tmp_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0));
+  __m256i v_src_0 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x20);
+  __m256i v_src_1 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x31);
+
+  const __m256i* v_coeff_ptr = (const __m256i*)ver_coeff;
+
+  __m256i v_madd[2][16];
+  for (int i = 0; i < 16; ++i) {
+    v_madd[0][i] = _mm256_madd_epi16(v_src_0, v_coeff_ptr[i]);
+    v_madd[1][i] = _mm256_madd_epi16(v_src_1, v_coeff_ptr[i]);
+  }
+
+  __m256i v_hadd_0[2][8];
+  for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) {
+    v_hadd_0[0][dst] = _mm256_hadd_epi32(v_madd[0][src], v_madd[0][src + 1]);
+    v_hadd_0[1][dst] = _mm256_hadd_epi32(v_madd[1][src], v_madd[1][src + 1]);
+  }
+
+  __m256i v_hadd_1[2][4];
+  for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) {
+    v_hadd_1[0][dst] = _mm256_hadd_epi32(v_hadd_0[0][src], v_hadd_0[0][src + 1]);
+    v_hadd_1[1][dst] = _mm256_hadd_epi32(v_hadd_0[1][src], v_hadd_0[1][src + 1]);
+  }
+
+  __m256i v_tmp_00 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x20);
+  __m256i v_tmp_01 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x31);
+  __m256i v_tmp_02 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x20);
+  __m256i v_tmp_03 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x31);
+
+  __m256i v_tmp_10 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x20);
+  __m256i v_tmp_11 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x31);
+  __m256i v_tmp_12 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x20);
+  __m256i v_tmp_13 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x31);
+
+  __m256i v_trunc_00 = truncate_avx2((_mm256_add_epi32(v_tmp_00, v_tmp_01)), debias, shift_2nd);
+  __m256i v_trunc_01 = truncate_avx2((_mm256_add_epi32(v_tmp_02, v_tmp_03)), debias, shift_2nd);
+
+  __m256i v_trunc_10 = truncate_avx2((_mm256_add_epi32(v_tmp_10, v_tmp_11)), debias, shift_2nd);
+  __m256i v_trunc_11 = truncate_avx2((_mm256_add_epi32(v_tmp_12, v_tmp_13)), debias, shift_2nd);
+
+  __m256i v_result_0 = _mm256_packs_epi32(v_trunc_00, v_trunc_10);
+  __m256i v_result_1 = _mm256_packs_epi32(v_trunc_01, v_trunc_11);
+
+  v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle);
+  v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle);
+
+  _mm256_store_si256((__m256i*)&dst[0], v_result_0);
+  _mm256_store_si256((__m256i*)&dst[16], v_result_1);
+}
+
+
+static void fast_inverse_tr_2x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i* v_src_raw = (const __m256i*)src;
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246);
+
+  __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle);
+  __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle);
+
+  v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0));
+  v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd_0[16];
+  __m256i v_madd_1[16];
+  for (int c = 0; c < 16; ++c) {
+    v_madd_0[c] = _mm256_madd_epi16(v_src_0, 
v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); +} + +static void fast_inverse_tr_2x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_lo = _mm256_unpacklo_epi16(src[0], src[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(src[0], src[1]); + + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_lo_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_lo_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_madd_hi_0, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_madd_hi_1, debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +static void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_16x2_coeff_hor; + } + const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_2x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = &uvg_g_dct_32[0][0]; + // For result shuffling, can use existing shuffle vector + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + ALIGNED(32) int16_t v_hor_pass_out[2*32]; + fast_forward_tr_2xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[4]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 2; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ff_dct2_32x32_coeff_ver; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + + __m256i v_src = _mm256_set1_epi32(*temp_source); + temp_source += i & 1 ? 3 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 2); +} + + +static void fast_inverse_tr_2x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 4; ++i) { + v_src[i] = 
_mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + const __m256i v_coeff_0 = _mm256_setr_epi64x(c_ptr[0], c_ptr[1], c_ptr[0], c_ptr[1]); + const __m256i v_coeff_1 = _mm256_setr_epi64x(c_ptr[2], c_ptr[3], c_ptr[2], c_ptr[3]); + const __m256i v_coeff_2 = _mm256_setr_epi64x(c_ptr[4], c_ptr[5], c_ptr[4], c_ptr[5]); + const __m256i v_coeff_3 = _mm256_setr_epi64x(c_ptr[6], c_ptr[7], c_ptr[6], c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[c] = _mm256_add_epi32(v_add_00, v_add_01); + c_ptr += 8; + } + + __m256i v_hadd_0[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_2x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i v_src_lo0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_lo1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + const __m256i v_src_hi0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_hi1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[1]), debias, shift); + __m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[1]), debias, shift); + + __m256i v_result[4]; + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01), v_res_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_10, 
v_trunc_lo_11), v_res_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01), v_res_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11), v_res_shuffle); + + v_result[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; // rename + const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_2x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + +} + + +static void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + const int reduced_line = line - skip_line; + // Handle 4 lines at a time (16 samples, 4 samples per line) + for (int j = 0; j < reduced_line; j += 4) { + // line 0 line 1 line 2 line 3 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*) src); + + // Arrange data for column-wise calculation. Data and coeffs are ordered so no further shuffling + // or permutes are needed. 
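+    // After the two permutes below, each 128-bit lane holds the same reordered samples: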
+ // vec 1 : [s00 s01 s04 s05 s08 s09 s12 s13 | s00 s01 s04 s05 s08 s09 s12 s13] + // vec 2 : [s02 s03 s06 s07 s10 s11 s14 s15 | s02 s03 s06 s07 s10 s11 s14 s15] + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +static void fast_forward_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + // TODO: coeffs for DST7 and DCT8 + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = fast_forward_dct2_b4_coeff; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } + else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = fast_forward_dst7_b4_coeff; + } + else if (ver == DCT8) { + ver_coeff = fast_forward_dct8_b4_coeff; + } + + __m256i v_hor_pass_out; + fast_forward_tr_4xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & ver_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & ver_coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & ver_coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & ver_coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_4x4_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_shuffle_hor); + + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle); + v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_4x4_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_result_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +static void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_4xN_coeff_hor; + const int16_t* ver_coeff = fi_dct2_4xN_coeff_hor; // Can use same table for both passes + if (hor == DST7) { + hor_coeff = fi_dst7_4xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4xN_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); +} + + +static void fast_forward_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x8_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + __m256i v_madd[2][8]; + for (int i = 0; i < 8; ++i) { + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff[0]); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + } + + __m256i v_trunc[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + // Order results + v_result[0] = _mm256_permute4x64_epi64(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + v_result[0] = _mm256_shuffle_epi32(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_shuffle_epi32(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)&dst[0], v_result[0]); + _mm256_store_si256((__m256i*)&dst[16], v_result[1]); +} + + +static void fast_inverse_tr_4x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + const __m256i v_permute = _mm256_load_si256((const __m256i*)permute_32b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + v_src_0 = _mm256_permutevar8x32_epi32(v_src_0, v_permute); + v_src_1 = _mm256_permutevar8x32_epi32(v_src_1, v_permute); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_madd_04 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_14 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + + __m256i v_madd_05 = _mm256_madd_epi16(v_src_0, v_coeff[10]); + __m256i v_madd_15 = _mm256_madd_epi16(v_src_1, v_coeff[11]); + + __m256i v_madd_06 = _mm256_madd_epi16(v_src_0, v_coeff[12]); + __m256i v_madd_16 = _mm256_madd_epi16(v_src_1, v_coeff[13]); + + __m256i v_madd_07 = _mm256_madd_epi16(v_src_0, v_coeff[14]); + __m256i v_madd_17 = _mm256_madd_epi16(v_src_1, v_coeff[15]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + __m256i v_add_4 = _mm256_add_epi32(v_madd_04, v_madd_14); + __m256i v_add_5 = _mm256_add_epi32(v_madd_05, v_madd_15); + __m256i v_add_6 = _mm256_add_epi32(v_madd_06, v_madd_16); + __m256i v_add_7 = _mm256_add_epi32(v_madd_07, v_madd_17); + + __m256i v_hadd_0 = 
_mm256_hadd_epi32(v_add_0, v_add_1); + __m256i v_hadd_1 = _mm256_hadd_epi32(v_add_2, v_add_3); + __m256i v_hadd_2 = _mm256_hadd_epi32(v_add_4, v_add_5); + __m256i v_hadd_3 = _mm256_hadd_epi32(v_add_6, v_add_7); + + __m256i v_trunc_0 = truncate_avx2(v_hadd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_hadd_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_hadd_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_hadd_3, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_4x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + v_result_0 = _mm256_shuffle_epi32(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_shuffle_epi32(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +static void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x4_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_4x8_avx2_ver(src, 
v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = &uvg_g_dct_16[0][0]; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_16[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_16[0][0]; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const int64_t* coeff_ptr = (const int64_t*)ver_coeff; // Read four coeffs at once by casting into 64 bit integer + + __m256i v_madd[4][16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff_0 = _mm256_set1_epi64x(coeff_ptr[0]); + const __m256i v_coeff_1 = _mm256_set1_epi64x(coeff_ptr[1]); + const __m256i v_coeff_2 = _mm256_set1_epi64x(coeff_ptr[2]); + const __m256i v_coeff_3 = _mm256_set1_epi64x(coeff_ptr[3]); + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff_0); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff_1); + v_madd[2][i] = _mm256_madd_epi16(v_hor_pass_out[2], v_coeff_2); + v_madd[3][i] = _mm256_madd_epi16(v_hor_pass_out[3], v_coeff_3); + coeff_ptr += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_tmp0 = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + __m256i v_tmp1 = _mm256_add_epi32(v_madd[2][i], v_madd[3][i]); + + v_add[i] = _mm256_add_epi32(v_tmp0, v_tmp1); + } + + __m256i v_trunc[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_shuffle_epi32(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_4x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + __m256i 
v_src_2 = _mm256_shuffle_epi8(v_src_raw[2], v_shuffle); + __m256i v_src_3 = _mm256_shuffle_epi8(v_src_raw[3], v_shuffle); + + v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_permute4x64_epi64(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_permute4x64_epi64(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + v_src_0 = _mm256_shuffle_epi32(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_shuffle_epi32(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_shuffle_epi32(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_shuffle_epi32(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int c = 0; c < 16; c++) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[3]); + v_coeff += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_hadd[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_4x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[2], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[2], src[3], 0x31); + + __m256i v_madd_0[4]; + __m256i v_madd_1[4]; + __m256i v_madd_2[4]; + __m256i v_madd_3[4]; + for (int c = 0; c < 4; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_trunc_0[4]; + __m256i v_trunc_1[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_0[i] = truncate_avx2(_mm256_add_epi32(v_madd_0[i], v_madd_1[i]), debias, shift); + v_trunc_1[i] = truncate_avx2(_mm256_add_epi32(v_madd_2[i], v_madd_3[i]), debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = 
_mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i v_tmp32_0 = _mm256_unpacklo_epi32(v_tmp0, v_tmp1); + __m256i v_tmp32_1 = _mm256_unpackhi_epi32(v_tmp0, v_tmp1); + __m256i v_tmp32_2 = _mm256_unpacklo_epi32(v_tmp2, v_tmp3); + __m256i v_tmp32_3 = _mm256_unpackhi_epi32(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x4_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_4x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x32_coeff_ver; + } + + // Must be 32-byte aligned: the horizontal pass stores through an __m256i pointer (matches the 8x32 variant) + ALIGNED(32) int16_t v_hor_pass_out[4 * 32]; + fast_forward_tr_4xN_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[8]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 4; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + __m256i v_src = _mm256_set1_epi32(*temp_source); + // After even iterations step to the adjacent 32-bit sample pair in the same row, after odd ones jump to the next 16-sample row + temp_source += i & 1 ? 7 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 4); +} + + +static void fast_inverse_tr_4x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[i] = _mm256_shuffle_epi32(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + const __m256i v_coeff = _mm256_set1_epi64x(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_add[c] = _mm256_add_epi32(v_add_10, v_add_11); + } + + __m256i v_hadd[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_4x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + +
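// Regroup the 128-bit lanes of each source vector pair: permute2x128 control 0x20 concatenates the low lanes of its two inputs, 0x31 the high lanes, before the madd stage below. +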
__m256i v_src[8]; + __m256i v_tmp[8]; + v_src[0] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x31); + v_src[2] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x31); + v_src[4] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x20); + v_src[7] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x31); + + for (int d = 0, c = 0; c < 4; ++c, d += 2) { + __m256i v_madd_00 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src[2], v_coeff[0]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src[3], v_coeff[1]); + __m256i v_madd_20 = _mm256_madd_epi16(v_src[4], v_coeff[0]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src[5], v_coeff[1]); + __m256i v_madd_30 = _mm256_madd_epi16(v_src[6], v_coeff[0]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src[7], v_coeff[1]); + v_coeff += 2; + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_01), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_10, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_20, v_madd_21), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_30, v_madd_31), debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp[d + 0] = _mm256_permute4x64_epi64(v_tmp[d + 0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[d + 1] = _mm256_permute4x64_epi64(v_tmp[d + 1], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[8]; + transpose_avx2(v_tmp, v_result, 32, 4); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; + const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename + if (hor == DST7) { + hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_32[0][0]; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_4x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int j = 0; j < reduced_line; j += 2) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange source in a way samples can be added together column-wise using add + // after first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...] + + __m256i v_src_0 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +static void fast_forward_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x2_coeff_ver; + // Only DCT2 is defined for 8x2 block + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_forward_tr_8xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // TODO: coeffs for DST7 and DCT8 transforms + const __m256i* v_coeff = (const __m256i*)ver_coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x2_ver_pass_shuffle); + + // 8x2, only 16 samples, handle all at once + __m256i v_src_per = _mm256_permute4x64_epi64(v_hor_pass_out, _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle); + // v_src = _mm256_unpackhi_epi16(v_src_raw, v_src_swp); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); // TODO: this permute can probably be optimized away + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_2x8_shuffle_hor); + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + // Got data for one vector + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0); + // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15] + __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + // Duplicate sources to enable vertical addition + __m256i v_src_0 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i v_src_1 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_20 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_30 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_01); + __m256i v_add_1 = _mm256_add_epi32(v_madd_10, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_20, v_madd_21); + __m256i v_add_3 = _mm256_add_epi32(v_madd_30, v_madd_31); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +static void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // TODO: rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x8_coeff_ver; + } + // Vertical pass is always DCT2: DST7 and DCT8 are not defined for size 2 + + __m256i v_ver_pass_out; + fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + +static void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x4_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // 32 samples, process in two steps + __m256i v_src_per_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_src_per_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_per_0, v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_per_1, v_shuffle); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(v_add_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(v_add_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + // Swap each middle 64 bit chunk in both 128 bit lanes + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + // Swap each middle 16 bit value in each 64 bit chunk + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +static void fast_inverse_tr_8x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i v_src_raw_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_raw_1 = _mm256_load_si256((const __m256i*) & src[16]); + + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw_0, v_src_raw_1); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw_0, v_src_raw_1); + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[1]); + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[3]); + + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[4]); + __m256i v_madd_11 = 
_mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_8x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x31); + + v_result[0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result[0]); + _mm256_store_si256((__m256i*) & dst[16], v_result[1]); +} + +static void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x8_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_8x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int 
height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x8_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + const int32_t* coeff_ptr = (const int32_t*)ver_coeff; // Cast into 32 bit integer to read two coeffs at a time + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_trunc[8]; + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x31); + + for (int i = 0; i < 8; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi32(coeff_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi32(coeff_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi32(coeff_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi32(coeff_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_trunc[i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift_2nd); + coeff_ptr += 4; + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_8x8_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[4]; + v_src[0] = _mm256_permute4x64_epi64(v_src_raw[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = 
_mm256_permute4x64_epi64(v_src_raw[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_permute4x64_epi64(v_src_raw[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_permute4x64_epi64(v_src_raw[3], _MM_SHUFFLE(3, 1, 2, 0)); + + v_src[0] = _mm256_shuffle_epi8(v_src[0], v_shuffle); + v_src[1] = _mm256_shuffle_epi8(v_src[1], v_shuffle); + v_src[2] = _mm256_shuffle_epi8(v_src[2], v_shuffle); + v_src[3] = _mm256_shuffle_epi8(v_src[3], v_shuffle); + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_8x8_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[4]; + v_src[0] = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_shuffle_epi32(src[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_shuffle_epi32(src[3], _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + __m256i v_tmp2 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + __m256i v_tmp3 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + + v_src[0] = _mm256_unpacklo_epi64(v_tmp0, v_tmp2); + v_src[1] = _mm256_unpackhi_epi64(v_tmp0, v_tmp2); + v_src[2] = _mm256_unpacklo_epi64(v_tmp1, v_tmp3); + v_src[3] = _mm256_unpackhi_epi64(v_tmp1, v_tmp3); + + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = 
_mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + v_result[2] = _mm256_shuffle_epi8(v_result[2], v_res_shuffle); + v_result[3] = _mm256_shuffle_epi8(v_result[3], v_res_shuffle); + + __m256i v_rtmp0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp1 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp2 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp3 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + + __m256i v_tmp20 = _mm256_unpacklo_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp21 = _mm256_unpackhi_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp22 = _mm256_unpacklo_epi64(v_rtmp1, v_rtmp3); + __m256i v_tmp23 = _mm256_unpackhi_epi64(v_rtmp1, v_rtmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_8x8_coeff_hor; + const int16_t* ver_coeff = fi_dct2_8x8_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_8x8_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x8_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x8_coeff_hor; + } + + __m256i v_hor_pass_out[4]; + fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); +} + + +static void fast_forward_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x16_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Can use same shuffles as 8x4 + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle); + //const __m256i* v_coeff = (const __m256i*)ver_coeff; + const int32_t *line_coeff = (const int32_t*)ver_coeff; + + // Multiply+add all source vectors with coeff vectors + __m256i v_madd[8][16]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 8; ++i) { + __m256i v_src_per = _mm256_permute4x64_epi64(v_src_ptr[0], _MM_SHUFFLE(3, 1, 2, 0)); + // Weave lo and hi halfs of each 128 bit lane + __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle); + + for (int ii = 0; ii < 16; ++ii) { + //int coeff_row = ii * 8 + i; + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd[i][ii] = _mm256_madd_epi16(v_src, v_coeff); + } + line_coeff += 16; + v_src_ptr += 1; + } + + // Add vectors + __m256i v_add_0[4][16]; + for (int i = 0; i < 4; ++i) { + for (int ii = 0; ii < 16; ++ii) { + int offset = i * 2; + v_add_0[i][ii] = _mm256_add_epi32(v_madd[offset][ii], v_madd[offset + 1][ii]); + } + } + // Second round of additions + __m256i v_add_1[2][16]; + for (int i = 0; i < 2; ++i) { + for (int ii = 0; ii < 16; ++ii) { + int offset = i * 2; + v_add_1[i][ii] = _mm256_add_epi32(v_add_0[offset][ii], v_add_0[offset + 1][ii]); + } + } + // Third round of additions + __m256i v_trunc[16]; + for (int ii = 0; ii < 16; ++ii) { + v_trunc[ii] = _mm256_add_epi32(v_add_1[0][ii], v_add_1[1][ii]); + v_trunc[ii] = truncate_avx2(v_trunc[ii], debias, shift_2nd); + } + + + for (int i = 0; i < 16; i += 2) { + __m256i v_result = _mm256_packs_epi32(v_trunc[i], v_trunc[i + 1]); + + // Swap each middle 64 bit chunk in both 128 bit lanes + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + // Swap each middle 16 bit value in each 64 bit chunk + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); + dst += 16; + } +} + + +static void fast_inverse_tr_8x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_src_raw[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_tmp[i], v_shuffle); + } + + __m256i v_trunc[16]; + for (int c = 0; c < 16; c++) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff[7]); + + v_coeff += 8; + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = 
_mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } +} + +static void fast_inverse_tr_8x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi32(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp[8]; + v_tmp[0] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + v_tmp[2] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x20); + v_tmp[4] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + v_tmp[5] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + v_tmp[6] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x31); + v_tmp[7] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x31); + + v_src[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_src[1] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_src[2] = _mm256_unpacklo_epi32(v_tmp[4], v_tmp[5]); + v_src[3] = _mm256_unpackhi_epi32(v_tmp[4], v_tmp[5]); + v_src[4] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_src[5] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + v_src[6] = _mm256_unpacklo_epi32(v_tmp[6], v_tmp[7]); + v_src[7] = _mm256_unpackhi_epi32(v_tmp[6], v_tmp[7]); + + __m256i v_trunc[2][8]; + for (int d = 0, s = 0; d < 2; ++d, s += 4) { + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int c = 0; c < 8; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src[s + 0], v_c_ptr[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src[s + 1], v_c_ptr[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src[s + 2], v_c_ptr[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src[s + 3], v_c_ptr[3]); + v_c_ptr += 4; + } + + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_trunc[d][i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift); + } + } + + __m256i v_rtmp[8]; + v_rtmp[0] = _mm256_packs_epi32(v_trunc[0][0], v_trunc[0][1]); + v_rtmp[1] = _mm256_packs_epi32(v_trunc[0][2], v_trunc[0][3]); + v_rtmp[2] = _mm256_packs_epi32(v_trunc[0][4], v_trunc[0][5]); + v_rtmp[3] = _mm256_packs_epi32(v_trunc[0][6], v_trunc[0][7]); + v_rtmp[4] = _mm256_packs_epi32(v_trunc[1][0], v_trunc[1][1]); + v_rtmp[5] = _mm256_packs_epi32(v_trunc[1][2], v_trunc[1][3]); + v_rtmp[6] = _mm256_packs_epi32(v_trunc[1][4], v_trunc[1][5]); + v_rtmp[7] = _mm256_packs_epi32(v_trunc[1][6], v_trunc[1][7]); + + for (int i = 0; i < 8; ++i) { + v_rtmp[i] = _mm256_shuffle_epi8(v_rtmp[i], v_res_shuffle); + } + + __m256i v_tmp32_lo0 = _mm256_unpacklo_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_lo1 = _mm256_unpacklo_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_lo2 = 
_mm256_unpacklo_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_lo3 = _mm256_unpacklo_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp32_hi0 = _mm256_unpackhi_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_hi1 = _mm256_unpackhi_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_hi2 = _mm256_unpackhi_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_hi3 = _mm256_unpackhi_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp64_lo0 = _mm256_unpacklo_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_lo1 = _mm256_unpacklo_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_lo2 = _mm256_unpacklo_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_lo3 = _mm256_unpacklo_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_tmp64_hi0 = _mm256_unpackhi_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_hi1 = _mm256_unpackhi_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_hi2 = _mm256_unpackhi_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_hi3 = _mm256_unpackhi_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_result[8]; + v_result[0] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x31); + v_result[4] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x20); + v_result[7] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x8_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_8x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[8 * 32]; + fast_forward_tr_8xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[16]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 8; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + // Pair one sample from each of the two lines held in row i; memcpy makes the 16-to-32 bit type pun safe + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 16]; + source[1] = v_hor_pass_out[j + i * 16 + 8]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 8); +} + + +static void fast_inverse_tr_8x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src;
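+ + // The three loops below regroup the source rows: permute2x128 (control 0x20 = low 128-bit lanes of both inputs, 0x31 = high lanes) recombines lane halves, and unpacklo/hi_epi16 interleaves the 16-bit samples of adjacent rows, so each madd can accumulate two rows per broadcast coefficient pair.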
+
+  __m256i v_tmp[16];
+  for (int i = 0; i < 16; i += 2) {
+    v_tmp[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20);
+    v_tmp[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31);
+  }
+
+  __m256i v_tmp16_lo[8];
+  __m256i v_tmp16_hi[8];
+  for (int d = 0, s = 0; d < 8; ++d, s += 2) {
+    v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_tmp[s + 0], v_tmp[s + 1]);
+    v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_tmp[s + 0], v_tmp[s + 1]);
+  }
+
+  __m256i v_src[16];
+  for (int d = 0, s = 0; d < 16; d += 2, ++s) {
+    v_src[d + 0] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x20);
+    v_src[d + 1] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x31);
+  }
+
+  __m256i v_trunc[32];
+
+  for (int row = 0; row < 32; ++row) {
+    __m256i v_res = _mm256_setzero_si256();
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_coeff = _mm256_set1_epi32(*c_ptr);
+      __m256i v_madd = _mm256_madd_epi16(v_src[i], v_coeff);
+      v_res = _mm256_add_epi32(v_res, v_madd);
+      c_ptr++;
+    }
+
+    v_trunc[row] = truncate_avx2(v_res, debias, shift);
+  }
+
+  for (int d = 0, s = 0; d < 16; ++d, s += 2) {
+    dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]);
+  }
+}
+
+static void fast_inverse_tr_8x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff; // Coefficients are loaded as full 256-bit vectors here, no 32-bit pairing needed
+  const __m256i* v_src_raw = src;
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246);
+
+  __m256i v_src[16];
+  for (int i = 0; i < 16; i += 2) {
+    v_src[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20);
+    v_src[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31);
+  }
+
+  __m256i v_tmp[16];
+  for (int s = 0; s < 16; s += 2) {
+    __m256i v_add[8];
+    for (int d = 0, c = 0; d < 8; ++d, c += 2) {
+      __m256i v_madd_0 = _mm256_madd_epi16(v_src[s + 0], v_coeff[c + 0]);
+      __m256i v_madd_1 = _mm256_madd_epi16(v_src[s + 1], v_coeff[c + 1]);
+
+      v_add[d] = _mm256_add_epi32(v_madd_0, v_madd_1);
+    }
+
+    __m256i v_hadd[4];
+    v_hadd[0] = _mm256_hadd_epi32(v_add[0], v_add[1]);
+    v_hadd[1] = _mm256_hadd_epi32(v_add[2], v_add[3]);
+    v_hadd[2] = _mm256_hadd_epi32(v_add[4], v_add[5]);
+    v_hadd[3] = _mm256_hadd_epi32(v_add[6], v_add[7]);
+
+    __m256i v_trunc[4];
+    v_trunc[0] = truncate_avx2(v_hadd[0], debias, shift);
+    v_trunc[1] = truncate_avx2(v_hadd[1], debias, shift);
+    v_trunc[2] = truncate_avx2(v_hadd[2], debias, shift);
+    v_trunc[3] = truncate_avx2(v_hadd[3], debias, shift);
+
+    v_tmp[s + 0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]);
+    v_tmp[s + 1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]);
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle);
+  }
+
+  __m256i v_tmp64_lo[8];
+  __m256i v_tmp64_hi[8];
+  for (int d = 0, s = 0; d < 8; ++d, s += 2) {
+    v_tmp64_lo[d] = _mm256_unpacklo_epi64(v_tmp[s + 0], v_tmp[s + 1]);
+    v_tmp64_hi[d] = _mm256_unpackhi_epi64(v_tmp[s + 0], v_tmp[s + 1]);
+  }
+
+  __m256i v_result[16];
+  for (int d = 0, s = 0; d < 16; d += 2, ++s) {
+    v_result[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x20);
+    v_result[d + 1] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x31);
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    _mm256_store_si256((__m256i*)dst, v_result[i]);
+    dst += 16;
+  }
+
+  // 
TODO: mts cutoff +} + +static void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; + const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table + if (hor == DST7) { + hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_32[0][0]; + } + + __m256i v_ver_pass_out[16]; + fast_inverse_tr_8x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // ISP_TODO: might be faster to load these from arrays + const __m256i v_permute_0 = _mm256_set1_epi32(0); + const __m256i v_permute_1 = _mm256_set1_epi32(1); + const __m256i v_permute_2 = _mm256_set1_epi32(2); + const __m256i v_permute_3 = _mm256_set1_epi32(3); + const __m256i v_permute_4 = _mm256_set1_epi32(4); + const __m256i v_permute_5 = _mm256_set1_epi32(5); + const __m256i v_permute_6 = _mm256_set1_epi32(6); + const __m256i v_permute_7 = _mm256_set1_epi32(7); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 1 line at a time, 16 samples per line + for (int j = 0; j < reduced_line; ++j) { + // line 1 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + // Arrange data so calculations can be done column-wise (to avoid using hadds). + // Need 8 source vectors. First will be filled with s00 and s01 pairs. 
Second with s02 and s03 pairs and so on + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + __m256i v_src_2 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_2); + __m256i v_src_3 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_3); + __m256i v_src_4 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_4); + __m256i v_src_5 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_5); + __m256i v_src_6 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_6); + __m256i v_src_7 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_7); + + __m256i v_madd_0_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_0_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_0_02 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_0_03 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + __m256i v_madd_0_04 = _mm256_madd_epi16(v_src_4, v_coeff[4]); + __m256i v_madd_0_05 = _mm256_madd_epi16(v_src_5, v_coeff[5]); + __m256i v_madd_0_06 = _mm256_madd_epi16(v_src_6, v_coeff[6]); + __m256i v_madd_0_07 = _mm256_madd_epi16(v_src_7, v_coeff[7]); + + __m256i v_madd_0_08 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_0_09 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + __m256i v_madd_0_10 = _mm256_madd_epi16(v_src_2, v_coeff[10]); + __m256i v_madd_0_11 = _mm256_madd_epi16(v_src_3, v_coeff[11]); + __m256i v_madd_0_12 = _mm256_madd_epi16(v_src_4, v_coeff[12]); + __m256i v_madd_0_13 = _mm256_madd_epi16(v_src_5, v_coeff[13]); + __m256i v_madd_0_14 = _mm256_madd_epi16(v_src_6, v_coeff[14]); + __m256i v_madd_0_15 = _mm256_madd_epi16(v_src_7, v_coeff[15]); + + __m256i v_madd_1_0 = _mm256_add_epi32(v_madd_0_00, v_madd_0_01); + __m256i v_madd_1_1 = _mm256_add_epi32(v_madd_0_02, v_madd_0_03); + __m256i v_madd_1_2 = _mm256_add_epi32(v_madd_0_04, v_madd_0_05); + __m256i v_madd_1_3 = _mm256_add_epi32(v_madd_0_06, v_madd_0_07); + __m256i v_madd_1_4 = _mm256_add_epi32(v_madd_0_08, v_madd_0_09); + __m256i v_madd_1_5 = _mm256_add_epi32(v_madd_0_10, v_madd_0_11); + __m256i v_madd_1_6 = _mm256_add_epi32(v_madd_0_12, v_madd_0_13); + __m256i v_madd_1_7 = _mm256_add_epi32(v_madd_0_14, v_madd_0_15); + + __m256i v_madd_2_0 = _mm256_add_epi32(v_madd_1_0, v_madd_1_1); + __m256i v_madd_2_1 = _mm256_add_epi32(v_madd_1_2, v_madd_1_3); + __m256i v_madd_2_2 = _mm256_add_epi32(v_madd_1_4, v_madd_1_5); + __m256i v_madd_2_3 = _mm256_add_epi32(v_madd_1_6, v_madd_1_7); + + __m256i v_madd_3_0 = _mm256_add_epi32(v_madd_2_0, v_madd_2_1); + __m256i v_madd_3_1 = _mm256_add_epi32(v_madd_2_2, v_madd_2_3); + + __m256i v_trunc_0 = truncate_avx2(v_madd_3_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_3_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + dst[0] = v_result; + + src += 16; + dst++; + } +} + +static void fast_forward_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x2_coeff_ver; + if (hor == DST7) { + hor_coeff = 
ff_dst7_16xN_coeff_hor; + } + + __m256i v_hor_pass_out[2]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 2 source vectors + // Unpack -> samples to be added are adjacent + __m256i v_src_hi = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_trunc_hi_0 = truncate_avx2(v_madd_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_madd_hi_1, debias, shift_2nd); + __m256i v_trunc_lo_0 = truncate_avx2(v_madd_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_madd_lo_1, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +static void fast_inverse_tr_16x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i v_src_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_1 = _mm256_load_si256((const __m256i*) & src[16]); + + const __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_0, v_src_1); + const __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + __m256i v_trunc_0 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_0), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_1), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_0), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_1), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_16x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_madd_e[16]; + __m256i v_madd_o[16]; + for (int i = 0, c = 0; i < 16; ++i, c += 2) { + v_madd_e[i] = _mm256_madd_epi16(src[0], v_coeff[c + 0]); + v_madd_o[i] = _mm256_madd_epi16(src[1], v_coeff[c + 1]); + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_e[i], v_madd_o[i]); + } + + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_permute4x64_epi64(v_add[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_hadd_0[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int src = 0, dst = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]), debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +static void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x16_coeff_ver; + } + // DST7 and DCT8 are not defined for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_16x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x4_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 vectors + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[2]); + __m256i v_madd_hi_02 = _mm256_madd_epi16(v_src_hi_0, v_coeff[4]); + __m256i v_madd_hi_03 = _mm256_madd_epi16(v_src_hi_0, v_coeff[6]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[3]); + __m256i v_madd_hi_12 = _mm256_madd_epi16(v_src_hi_1, v_coeff[5]); + __m256i v_madd_hi_13 = _mm256_madd_epi16(v_src_hi_1, v_coeff[7]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[2]); + __m256i v_madd_lo_02 = _mm256_madd_epi16(v_src_lo_0, v_coeff[4]); + __m256i v_madd_lo_03 = _mm256_madd_epi16(v_src_lo_0, v_coeff[6]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[3]); + __m256i v_madd_lo_12 = _mm256_madd_epi16(v_src_lo_1, v_coeff[5]); + __m256i v_madd_lo_13 = _mm256_madd_epi16(v_src_lo_1, v_coeff[7]); + + __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi_00, v_madd_hi_10); + __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi_01, v_madd_hi_11); + __m256i v_add_hi_2 = _mm256_add_epi32(v_madd_hi_02, v_madd_hi_12); + __m256i v_add_hi_3 = _mm256_add_epi32(v_madd_hi_03, v_madd_hi_13); + + __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo_00, v_madd_lo_10); + __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo_01, v_madd_lo_11); + __m256i v_add_lo_2 = _mm256_add_epi32(v_madd_lo_02, v_madd_lo_12); + __m256i v_add_lo_3 = _mm256_add_epi32(v_madd_lo_03, v_madd_lo_13); + + __m256i v_trunc_hi_0 = truncate_avx2(v_add_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_add_hi_1, debias, shift_2nd); + __m256i v_trunc_hi_2 = truncate_avx2(v_add_hi_2, debias, shift_2nd); + __m256i v_trunc_hi_3 = truncate_avx2(v_add_hi_3, debias, shift_2nd); + + __m256i v_trunc_lo_0 = truncate_avx2(v_add_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_add_lo_1, debias, shift_2nd); + __m256i v_trunc_lo_2 = truncate_avx2(v_add_lo_2, debias, shift_2nd); + __m256i v_trunc_lo_3 = truncate_avx2(v_add_lo_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_2, v_trunc_hi_2); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_3, v_trunc_hi_3); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_16x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + 
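+  // A note on rounding (a sketch, assuming truncate_avx2 behaves as its uses
+  // elsewhere in this file suggest): with add = 1 << (shift - 1) folded into
+  // debias, each 32-bit lane becomes (x + add) >> shift, i.e. a round-half-up
+  // arithmetic shift rather than plain truncation.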
const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[2], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[2], v_src_raw[3]); + + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int i = 0; i < 4; i++) { + v_madd_lo_0[i] = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + v_madd_lo_1[i] = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + v_madd_hi_0[i] = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + v_madd_hi_1[i] = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[i], v_madd_lo_1[i]), debias, shift); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[i], v_madd_hi_1[i]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_hi[0]); + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_hi[1]); + dst[2] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_hi[2]); + dst[3] = _mm256_packs_epi32(v_trunc_lo[3], v_trunc_hi[3]); +} + +static void fast_inverse_tr_16x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[2], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[2], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[1], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[1], src[3], 0x31); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int i = 0; i < 16; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[i] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add_0[16]; + __m256i v_add_1[16]; + for (int i = 0; i < 16; ++i) { + v_add_0[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + v_add_1[i] = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + } + + __m256i v_hadd_0[16]; + for (int i = 0; i < 16; ++i) { + v_hadd_0[i] = _mm256_hadd_epi32(v_add_0[i], v_add_1[i]); + } + + __m256i v_hadd_1[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i 
v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x16_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x16_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_16x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x8_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* line_coeff = (const int32_t*)ver_coeff; + + // Got 8 lines of samples. 
Handle two lines at a time (because of unpack)
+  __m256i v_madd_hi[4][8];
+  __m256i v_madd_lo[4][8];
+  __m256i* v_src_ptr = v_hor_pass_out;
+  for (int i = 0; i < 4; ++i) {
+    __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]);
+    __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]);
+
+    // Apply coefficients
+    for (int ii = 0; ii < 8; ++ii) {
+      const int32_t coeff = line_coeff[ii];
+      const __m256i v_coeff = _mm256_set1_epi32(coeff);
+      v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff);
+      v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff);
+    }
+
+    line_coeff += 8;
+    v_src_ptr += 2;
+  }
+
+  // First round of additions
+  __m256i v_add_hi[2][8];
+  __m256i v_add_lo[2][8];
+  for (int i = 0; i < 2; ++i) {
+    for (int ii = 0; ii < 8; ++ii) {
+      const int offset = i * 2;
+      v_add_hi[i][ii] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]);
+      v_add_lo[i][ii] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]);
+    }
+  }
+
+  // Final round of additions, truncation and store
+  for (int ii = 0; ii < 8; ++ii) {
+    __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi[0][ii], v_add_hi[1][ii]), debias, shift_2nd);
+    __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo[0][ii], v_add_lo[1][ii]), debias, shift_2nd);
+    __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi);
+
+    _mm256_store_si256((__m256i*)dst, v_result);
+    dst += 16;
+  }
+}
+
+
+static void fast_inverse_tr_16x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i* v_src_raw = (const __m256i*)src;
+
+  __m256i v_src_lo[4];
+  __m256i v_src_hi[4];
+  for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) {
+    v_src_lo[dst] = _mm256_unpacklo_epi16(v_src_raw[src + 0], v_src_raw[src + 1]);
+    v_src_hi[dst] = _mm256_unpackhi_epi16(v_src_raw[src + 0], v_src_raw[src + 1]);
+  }
+
+  __m256i v_trunc_lo[8];
+  __m256i v_trunc_hi[8];
+
+  for (int c = 0; c < 8; c++) {
+    __m256i v_madd_lo[4];
+    __m256i v_madd_hi[4];
+    for (int i = 0; i < 4; ++i) {
+      v_madd_lo[i] = _mm256_madd_epi16(v_src_lo[i], v_coeff[i]);
+      v_madd_hi[i] = _mm256_madd_epi16(v_src_hi[i], v_coeff[i]);
+    }
+    v_coeff += 4;
+
+    __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo[0], v_madd_lo[1]);
+    __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo[2], v_madd_lo[3]);
+
+    __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi[0], v_madd_hi[1]);
+    __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi[2], v_madd_hi[3]);
+
+    v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_0, v_add_lo_1), debias, shift);
+    v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_0, v_add_hi_1), debias, shift);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]);
+  }
+}
+
+static void fast_inverse_tr_16x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415);
+
+  __m256i v_tmp32_lo_0 = _mm256_unpacklo_epi32(src[0], src[1]);
+  __m256i v_tmp32_lo_1 = _mm256_unpacklo_epi32(src[2], src[3]);
+  __m256i v_tmp32_lo_2 = _mm256_unpacklo_epi32(src[4], src[5]);
+  __m256i v_tmp32_lo_3 = 
_mm256_unpacklo_epi32(src[6], src[7]); + + __m256i v_tmp32_hi_0 = _mm256_unpackhi_epi32(src[0], src[1]); + __m256i v_tmp32_hi_1 = _mm256_unpackhi_epi32(src[2], src[3]); + __m256i v_tmp32_hi_2 = _mm256_unpackhi_epi32(src[4], src[5]); + __m256i v_tmp32_hi_3 = _mm256_unpackhi_epi32(src[6], src[7]); + + __m256i v_tmp64_lo_0 = _mm256_unpacklo_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_lo_1 = _mm256_unpacklo_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_lo_2 = _mm256_unpacklo_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_lo_3 = _mm256_unpacklo_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_tmp64_hi_0 = _mm256_unpackhi_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_hi_1 = _mm256_unpackhi_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_hi_2 = _mm256_unpackhi_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_hi_3 = _mm256_unpackhi_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x20); + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x31); + + + __m256i v_trunc[16]; + for (int c = 0; c < 16; ++c) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff[i]); + } + v_coeff += 8; + + __m256i v_add_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_0[dst] = _mm256_add_epi32(v_madd[src + 0], v_madd[src + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + __m256i v_result[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_result[dst] = _mm256_packs_epi32(v_trunc[src + 0], v_trunc[src + 1]); + } + + for (int i = 0; i < 8; ++i) { + v_result[i] = _mm256_shuffle_epi8(v_result[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo_0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_lo_1 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_lo_2 = _mm256_unpacklo_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_lo_3 = _mm256_unpacklo_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp32_hi_0 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_hi_1 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_hi_2 = _mm256_unpackhi_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_hi_3 = _mm256_unpackhi_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp64_lo_0 = _mm256_unpacklo_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_lo_1 = _mm256_unpacklo_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_lo_2 = _mm256_unpacklo_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_lo_3 = _mm256_unpacklo_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + __m256i v_rtmp64_hi_0 = _mm256_unpackhi_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_hi_1 = _mm256_unpackhi_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_hi_2 = _mm256_unpackhi_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_hi_3 = 
_mm256_unpackhi_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x16_coeff_hor; + const int16_t* hor_coeff = fi_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x16_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_16x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast into 32 bit integer to read two coeffs at a time + const __m256i* v_src_ptr = v_hor_pass_out; + + __m256i v_madd_lo[8][PART_DIMENSION]; + __m256i v_madd_hi[8][PART_DIMENSION]; + for (int i = 0; i < 8; ++i) { + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + + for (int c = 0; c < PART_DIMENSION; ++c) { + const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]); + v_madd_lo[i][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + v_madd_hi[i][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + } + v_src_ptr += 2; + coeff_ptr += 16; + } + + __m256i v_trunc_lo[PART_DIMENSION]; + __m256i v_trunc_hi[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_add_lo_0[4]; + __m256i v_add_hi_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_0[dst] = _mm256_add_epi32(v_madd_lo[src + 0][i], v_madd_lo[src + 1][i]); + v_add_hi_0[dst] = _mm256_add_epi32(v_madd_hi[src + 0][i], v_madd_hi[src + 1][i]); + } + + __m256i v_add_lo_1[2]; + __m256i v_add_hi_1[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_1[dst] = _mm256_add_epi32(v_add_lo_0[src + 0], v_add_lo_0[src + 1]); + v_add_hi_1[dst] = _mm256_add_epi32(v_add_hi_0[src + 0], v_add_hi_0[src + 1]); + } + + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0], v_add_lo_1[1]), debias, shift_2nd); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0], v_add_hi_1[1]), debias, shift_2nd); + } + __m256i v_result[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + v_result[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + } + + for (int i = 0; i < PART_DIMENSION; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + } + +#undef NUM_PARTS +#undef PART_DIMENSION + +} + + +static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = (const __m256i*)src; + + //__m256i v_madd_lo[8][16]; + //__m256i v_madd_hi[8][16]; + //for (int s = 0; s < 8; ++s) { + // __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + // __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + // v_src_raw += 2; + + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_lo[s][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + // v_madd_hi[s][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_lo_0[4][16]; + //__m256i v_add_hi_0[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_lo_0[d][c] = _mm256_add_epi32(v_madd_lo[s + 0][c], v_madd_lo[s + 1][c]); + // v_add_hi_0[d][c] = _mm256_add_epi32(v_madd_hi[s + 0][c], v_madd_hi[s + 1][c]); + // } + //} + + //__m256i v_add_lo_1[2][16]; + //__m256i v_add_hi_1[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // 
for (int c = 0; c < 16; ++c) { + // v_add_lo_1[d][c] = _mm256_add_epi32(v_add_lo_0[s + 0][c], v_add_lo_0[s + 1][c]); + // v_add_hi_1[d][c] = _mm256_add_epi32(v_add_hi_0[s + 0][c], v_add_hi_0[s + 1][c]); + // } + //} + + //__m256i v_trunc_lo[16]; + //__m256i v_trunc_hi[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0][c], v_add_lo_1[1][c]), debias, shift); + // v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0][c], v_add_hi_1[1][c]), debias, shift); + //} + + //for (int i = 0; i < 16; ++i) { + // dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + //} + + for (int j = 0; j < line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 32]; + source[1] = src[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + dst[j] = packed; + } +} + +static void fast_inverse_tr_16x16_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + __m256i v_result[16]; + int16_t *src_p = (int16_t*)src; + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src_p[j + i * 32]; + source[1] = src_p[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i coeff_0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i coeff_1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, coeff_0); + __m256i madd1 = _mm256_madd_epi16(v_src, coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i *)dst, packed); + dst += 16; + } + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = src; + + //// Do a 32-bit transpose to arrange result from previous pass + //__m256i v_tmp32_lo[8]; + //__m256i v_tmp32_hi[8]; + //for (int d = 0, s = 0; d < 8; ++d, s += 2) { + // v_tmp32_lo[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + // v_tmp32_hi[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + //} + + 
//__m256i v_tmp64_lo[8]; + //__m256i v_tmp64_hi[8]; + //for (int d = 0, s = 0; d < 4; ++d, s += 2) { + // v_tmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_lo[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + + // v_tmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_hi[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + //} + // + //__m256i v_src[16]; + //v_src[ 0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + //v_src[ 1] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + //v_src[ 2] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x20); + //v_src[ 3] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x20); + //v_src[ 4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + //v_src[ 5] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + //v_src[ 6] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x31); + //v_src[ 7] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x31); + + //v_src[ 8] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + //v_src[ 9] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + //v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x20); + //v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x20); + //v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + //v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); + //v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x31); + //v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x31); + + //__m256i v_madd_0[8][16]; + //__m256i v_madd_1[8][16]; + //for (int s = 0; s < 8; ++s) { + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_0[s][c] = _mm256_madd_epi16(v_src[0 + s], v_coeff); + // v_madd_1[s][c] = _mm256_madd_epi16(v_src[8 + s], v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_00[4][16]; + //__m256i v_add_01[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_00[d][c] = _mm256_add_epi32(v_madd_0[s + 0][c], v_madd_0[s + 1][c]); + // v_add_01[d][c] = _mm256_add_epi32(v_madd_1[s + 0][c], v_madd_1[s + 1][c]); + // } + //} + + //__m256i v_add_10[2][16]; + //__m256i v_add_11[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_10[d][c] = _mm256_add_epi32(v_add_00[s + 0][c], v_add_00[s + 1][c]); + // v_add_11[d][c] = _mm256_add_epi32(v_add_01[s + 0][c], v_add_01[s + 1][c]); + // } + //} + + //__m256i v_trunc_0[16]; + //__m256i v_trunc_1[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_0[c] = truncate_avx2(_mm256_add_epi32(v_add_10[0][c], v_add_10[1][c]), debias, shift); + // v_trunc_1[c] = truncate_avx2(_mm256_add_epi32(v_add_11[0][c], v_add_11[1][c]), debias, shift); + //} + + //__m256i v_result[16]; + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_packs_epi32(v_trunc_0[d], v_trunc_1[d]); + //} + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_permute4x64_epi64(v_result[d], _MM_SHUFFLE(3, 1, 2, 0)); + //} + + //transpose_avx2(v_result, (__m256i*)dst, 16, 16); +} + +static void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const 
int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + const int16_t* ver_coeff = fi_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x16_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x16_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height); +} + + +static void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x32_coeff_ver; + } + + int16_t v_hor_pass_out[32*16]; + fast_forward_DCT2_B16_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + + __m256i temp_out[32]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + if(ver == DCT2) { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } + else { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } +#if 0 + // To how many parts the vertical pass should be split. + // At least on my testing it seems that there is no further gain by splitting to more than 4 parts. 
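+  // With NUM_PARTS = 4, PART_DIMENSION = 32 / 4 = 8, so each part produces
+  // 8 of the 32 output rows and the v_madd working set stays at 16x8 vectors
+  // instead of 16x32 (presumably the win is lower register pressure and less
+  // spilling; the exact reason is not documented here).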
+#define NUM_PARTS 4
+#define PART_DIMENSION (32/NUM_PARTS)
+  for (int part = 0; part < NUM_PARTS; ++part) {
+    // Got 32 / NUM_PARTS lines of samples. Handle two lines at a time (because of unpack)
+    __m256i v_madd_hi[16][PART_DIMENSION];
+    __m256i v_madd_lo[16][PART_DIMENSION];
+    // Samples are the same between the parts
+    __m256i* v_src_ptr = (__m256i*)v_hor_pass_out; // Cast needed: v_hor_pass_out is an int16_t array
+    // However for coefficients, the starting point needs to be adjusted
+    const int32_t* line_coeff = (const int32_t*)ver_coeff + PART_DIMENSION * part;
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]);
+      __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]);
+
+      // Apply coefficients
+      // TODO: Here try loading the coefficient directly instead of set1
+      for (int ii = 0; ii < PART_DIMENSION; ++ii) {
+        const int32_t coeff = line_coeff[ii];
+        const __m256i v_coeff = _mm256_set1_epi32(coeff);
+        v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff);
+        v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff);
+      }
+
+      line_coeff += 32;
+      v_src_ptr += 2;
+    }
+
+    for (int ii = 0; ii < PART_DIMENSION; ++ii) {
+      // First round of additions
+      __m256i v_add_hi_0[8];
+      __m256i v_add_lo_0[8];
+      for (int i = 0; i < 8; ++i) {
+        const int offset = i * 2;
+        v_add_hi_0[i] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]);
+        v_add_lo_0[i] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]);
+      }
+
+      // Second round of additions
+      __m256i v_add_hi_1[4];
+      __m256i v_add_lo_1[4];
+      for (int i = 0; i < 4; ++i) {
+        const int offset = i * 2;
+        v_add_hi_1[i] = _mm256_add_epi32(v_add_hi_0[offset], v_add_hi_0[offset + 1]);
+        v_add_lo_1[i] = _mm256_add_epi32(v_add_lo_0[offset], v_add_lo_0[offset + 1]);
+      }
+
+      // Third round of additions
+      __m256i v_add_hi_2[2];
+      __m256i v_add_lo_2[2];
+      for (int i = 0; i < 2; ++i) {
+        const int offset = i * 2;
+        v_add_hi_2[i] = _mm256_add_epi32(v_add_hi_1[offset], v_add_hi_1[offset + 1]);
+        v_add_lo_2[i] = _mm256_add_epi32(v_add_lo_1[offset], v_add_lo_1[offset + 1]);
+      }
+
+      // Final round of additions, truncate and store
+      __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi_2[0], v_add_hi_2[1]), debias, shift_2nd);
+      __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo_2[0], v_add_lo_2[1]), debias, shift_2nd);
+      __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi);
+      _mm256_store_si256((__m256i*)dst, v_result);
+
+      dst += 16;
+    }
+  }
+#undef NUM_PARTS
+#undef PART_DIMENSION
+#endif
+
+}
+
+
+static void fast_inverse_tr_16x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vectors at a time
+  const __m256i* v_src_raw = (const __m256i*)src;
+
+  __m256i v_tmp16_lo[16];
+  __m256i v_tmp16_hi[16];
+  for (int d = 0, s = 0; d < 16; ++d, s += 2) {
+    v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 1]);
+    v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 1]);
+  }
+  int row = 0;
+  for (; row < 32 - skip_line2; ++row) {
+    __m256i v_res_lo = _mm256_setzero_si256();
+    __m256i v_res_hi = _mm256_setzero_si256();
+    for (int i = 0; i < 16; ++i) {
+      const __m256i v_coeff = _mm256_set1_epi32(*c_ptr);
+      __m256i v_madd_lo = _mm256_madd_epi16(v_tmp16_lo[i], v_coeff);
+      __m256i v_madd_hi = _mm256_madd_epi16(v_tmp16_hi[i], 
v_coeff); + c_ptr++; + + v_res_lo = _mm256_add_epi32(v_res_lo, v_madd_lo); + v_res_hi = _mm256_add_epi32(v_res_hi, v_madd_hi); + } + + __m256i v_trunc_lo = truncate_avx2(v_res_lo, debias, shift); + __m256i v_trunc_hi = truncate_avx2(v_res_hi, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + dst[row] = packed; + } + + for (; row < 32; ++row) { + dst[row] = _mm256_setzero_si256(); + } +} + +static void fast_inverse_tr_16x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + int32_t * src_32 = (int32_t *)src; + for (int row = 0, d = 0; row < 32; ++row) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i *coeff_start = (__m256i*) coeff; + for (int i = 0; i < 8; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + __m256i v_madd_1 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*) dst + row, packed); + } +} + +static void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x32_coeff_hor; + } + if (ver == DST7) { + ver_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (ver == DCT8) { + ver_coeff = &uvg_g_dct8_32[0][0]; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_16x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, 0); + int16_t* ver_pass_out = (int16_t*)v_ver_pass_out; + fast_inverse_tr_16x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int reduced_line = line - skip_line; + + for(int j = 0; j < reduced_line; ++j) { + int32_t source[16]; + memcpy(source, src, sizeof(int16_t) * 32); + src += 32; + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t *coeff_start = coeff; + for(int i = 0; i < 16; i++) { + __m256i v_src = _mm256_set1_epi32(source[i]); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if(line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm256_store_si256(dst, v_trunc_0); + dst++; + _mm256_store_si256(dst, v_trunc_1); + dst++; + } +} + +static void fast_forward_DCT8_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int cutoff = 32 - skip_line2; + const int reduced_line = line - skip_line; + + ALIGNED(32) int16_t temp_source[32 * 32]; + __m256i* v_src_p = (__m256i*) src; + for (int i = 0; i < reduced_line / 2; ++i) { + __m256i first_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i first_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i second_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + __m256i second_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + + _mm256_store_si256((__m256i*)temp_source + i * 4, first_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 1, first_half_hi); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 2, second_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 3, second_half_hi); + } + + for (int j = 0; j < reduced_line / 2; ++j) { + + int32_t source[32]; + memcpy(source, temp_source + 64 * j, sizeof(int16_t) * 64); + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = coeff; + + for (int i = 0; i < 32; i += 2) { + __m256i v_src0 = _mm256_set1_epi32(source[i]); + __m256i v_src1 = _mm256_set1_epi32(source[i + 1]); + + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src0, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src0, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src1, v_coeff_0); + __m256i madd_3 = _mm256_madd_epi16(v_src1, v_coeff_1); + + res_0 = _mm256_add_epi32(madd_0, res_0); + res_1 = _mm256_add_epi32(madd_1, res_1); + res_2 = _mm256_add_epi32(madd_2, res_2); + res_3 = _mm256_add_epi32(madd_3, res_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if (line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0)); + } + _mm256_store_si256(dst, v_trunc_0); + dst+=2; + _mm256_store_si256(dst, v_trunc_2); + dst+=2; + } +} + + +static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_ptr = src; + + // Prepare coeffs + // TODO: either rename these old coeff tables to be consistent with other new avx2 functions + // or construct them here in place. 
Should be easy to accomplish with set1_epi32, just use an int32_t combined from two int16_t + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[16]); + + // Got data for 4 vectors, 32 lines with 2 samples each + __m256i v_result_e[4]; + __m256i v_result_o[4]; + for (int j = 0; j < 4; ++j) { + const __m256i v_src = v_src_ptr[0]; + + v_result_e[j] = truncate_avx2(_mm256_madd_epi16(v_src, v_coeff_0), debias, shift); + v_result_o[j] = truncate_avx2(_mm256_madd_epi16(v_src, v_coeff_1), debias, shift); + + v_src_ptr++; + } + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_packs_epi32(v_result_e[0], v_result_e[1]); + v_tmp[1] = _mm256_packs_epi32(v_result_e[2], v_result_e[3]); + v_tmp[2] = _mm256_packs_epi32(v_result_o[0], v_result_o[1]); + v_tmp[3] = _mm256_packs_epi32(v_result_o[2], v_result_o[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_tmp[i]); + } +} + +static void fast_forward_DCT2_32x4_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Got data for 8 vectors, 32 lines with 4 samples each + + // Prepare coeffs + const int16_t* coeff = &uvg_g_dct_4[0][0]; + const int a = coeff[0]; + const int b = coeff[1 * 4 + 0]; + const int c = coeff[1 * 4 + 1]; + + __m256i v_coeff_0 = _mm256_set1_epi16(a); + __m256i v_coeff_1 = _mm256_setr_epi16(b, c, -c, -b, b, c, -c, -b, b, c, -c, -b, b, c, -c, -b); + __m256i v_coeff_2 = _mm256_setr_epi16(a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a); + __m256i v_coeff_3 = _mm256_setr_epi16(c, -b, b, -c, c, -b, b, -c, c, -b, b, -c, c, -b, b, -c); + + const __m256i* v_src_ptr = src; + __m256i v_trunc_0[8]; + __m256i v_trunc_1[8]; + for (int j = 0; j < 8; ++j) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_3); + + v_trunc_0[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + v_trunc_1[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + v_src_ptr++; + } + + __m256i v_result[8]; + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_trunc_0[i] = _mm256_permute4x64_epi64(v_trunc_0[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1[i] = _mm256_permute4x64_epi64(v_trunc_1[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + v_tmp[0] = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + v_tmp[1] = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + v_tmp[2] = _mm256_packs_epi32(v_trunc_0[4], v_trunc_0[5]); + v_tmp[3] = _mm256_packs_epi32(v_trunc_0[6], v_trunc_0[7]); + v_tmp[4] = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + v_tmp[5] = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + v_tmp[6] = _mm256_packs_epi32(v_trunc_1[4], v_trunc_1[5]); + v_tmp[7] = _mm256_packs_epi32(v_trunc_1[6], v_trunc_1[7]); + + v_result[0] =
_mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); + + v_result[4] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_result[i]); + } +} + + +static void fast_forward_DCT2_32x8_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int16_t* const p_dst = dst; + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Re-use coeff table + const __m256i* v_coeff = (const __m256i*)ff_dct2_16x8_coeff_ver; + + const int reduced_line = line - skip_line; + const __m256i* v_src_ptr = src; + __m256i v_tmp_result[16]; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int i = 0; i < 16; ++i) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + // __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange the source so that samples can be added together column-wise + // after the first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...]
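/* [Editor's note: illustrative sketch, not part of the patch.] The rearrangement
 * described above works because _mm256_madd_epi16 (vpmaddwd) multiplies adjacent
 * 16-bit pairs and sums each pair into one 32-bit lane: out[k] = a[2k]*b[2k] +
 * a[2k+1]*b[2k+1], with the products widened to 32 bits before the add. With the
 * samples duplicated as [s0 s1 s0 s1 ...], a single madd therefore yields a
 * two-tap partial sum in every 32-bit lane, and the add_epi32 steps that follow
 * complete the 8-tap column sums. A scalar model of one lane (hypothetical helper
 * name, kept inside this comment so the patch still compiles):
 *
 *   static int32_t madd_lane(const int16_t a[2], const int16_t b[2]) {
 *       return (int32_t)a[0] * b[0] + (int32_t)a[1] * b[1];
 *   }
 *
 * One output column of this vertical pass is then madd_lane(s0..s1, c0..c1) +
 * madd_lane(s2..s3, c2..c3) + madd_lane(s4..s5, c4..c5) + madd_lane(s6..s7, c6..c7),
 * which is exactly v_madd_0..v_madd_3 followed by the two rounds of add_epi32 below.
 */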
+ + __m256i v_src_0 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + v_tmp_result[i] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + v_src_ptr++; + } + + __m256i v_result[16]; + transpose_avx2(v_tmp_result, v_result, 8, 32); + + for (int i = 0; i < 16; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + +} + + +static void fast_forward_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x2_coeff_ver; + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 source vectors, 2 lines 32 samples each + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[1]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[0]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[1]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[0]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + __m256i v_trunc_hi_00 = truncate_avx2(v_madd_hi_00, debias, shift_2nd); + __m256i v_trunc_hi_01 = truncate_avx2(v_madd_hi_01, debias, shift_2nd); + __m256i v_trunc_hi_10 = truncate_avx2(v_madd_hi_10, debias, shift_2nd); + __m256i v_trunc_hi_11 = truncate_avx2(v_madd_hi_11, debias, shift_2nd); + + __m256i v_trunc_lo_00 = truncate_avx2(v_madd_lo_00, debias, shift_2nd); + __m256i v_trunc_lo_01 = truncate_avx2(v_madd_lo_01, debias, shift_2nd); + __m256i v_trunc_lo_10 = truncate_avx2(v_madd_lo_10, debias, shift_2nd); + __m256i v_trunc_lo_11 = truncate_avx2(v_madd_lo_11, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_hi_00); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_hi_10); // Swap middle hi-lo lanes + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_01, v_trunc_hi_01); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_11, v_trunc_hi_11); + + // Swap middle 64-bit chunks + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_2 = _mm256_permute4x64_epi64(v_result_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_3 = _mm256_permute4x64_epi64(v_result_3, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_32x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i* v_src = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src[0], v_src[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src[1], v_src[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src[0], v_src[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src[1], v_src[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_0), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_1), debias, shift); + 
__m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_0), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_1), debias, shift); + + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_0), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_1), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_0), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_1), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_lo_11); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_32x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_permute4x64_epi64(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[i] = _mm256_add_epi32(v_add_00, v_add_01); + v_coeff += 4; + } + + __m256i v_hadd_0[16]; + for (int src = 0, dst = 0; dst < 16; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_hadd_1[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: cutoff for DCT8 and DST7 +} + +static void fast_inverse_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = 0; // DST7 and DCT8 are not defined for this size. Therefore no skip width needed. 
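/* [Editor's note: illustrative sketch, not part of the patch.] All of these
 * kernels round with the same debias-and-shift rule that truncate_avx2 applies
 * per 32-bit lane: add 1 << (shift - 1), then shift right arithmetically. A
 * scalar model (hypothetical helper name, kept inside this comment):
 *
 *   static int32_t round_shift(int32_t sum, int32_t shift) {
 *       return (sum + (1 << (shift - 1))) >> shift;
 *   }
 *
 * e.g. round_shift(1000, 7) == (1000 + 64) >> 7 == 8. The inverse passes build
 * the debias as 1 << (shift - 1) unconditionally, while the forward passes still
 * guard with (shift > 0); hence the recurring ISP_TODO about removing that check.
 */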
+ int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x32_coeff_ver; // rename + // No DST7 and DCT8 tables needed. + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_32x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (ver != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x4_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 8 vectors. 4 lines with 32 samples each. 
Need 2 vectors for each line + // Handle two lines at a time + __m256i v_madd_lo_even[2][4]; + __m256i v_madd_lo_odd[2][4]; + __m256i v_madd_hi_even[2][4]; + __m256i v_madd_hi_odd[2][4]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 2; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < 4; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 4; + v_src_ptr += 4; + } + + // Final add and truncate + __m256i v_trunc_lo_even[4]; + __m256i v_trunc_hi_even[4]; + __m256i v_trunc_lo_odd[4]; + __m256i v_trunc_hi_odd[4]; + for (int ii = 0; ii < 4; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_lo_even[0][ii], v_madd_lo_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_lo_odd[0][ii], v_madd_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_hi_even[0][ii], v_madd_hi_even[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_hi_odd[0][ii], v_madd_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < 4; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + +} + + +static void fast_inverse_tr_32x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_lo_1[2][4]; + __m256i v_madd_hi_0[2][4]; + __m256i v_madd_hi_1[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_lo_1[src][i] = _mm256_madd_epi16(v_src_lo_1, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + v_madd_hi_1[src][i] = _mm256_madd_epi16(v_src_hi_1, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[8]; + __m256i v_trunc_hi[8]; + for (int dst = 0, src = 0; src < 4; ++src, dst += 2) { + v_trunc_lo[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), 
debias, shift); + v_trunc_lo[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_lo_1[0][src], v_madd_lo_1[1][src]), debias, shift); + v_trunc_hi[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + v_trunc_hi[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_hi_1[0][src], v_madd_hi_1[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[2]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[2]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[4], v_trunc_lo[6]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[4], v_trunc_hi[6]); + + if(skip_line == 0) { + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_lo[3]); + dst[3] = _mm256_packs_epi32(v_trunc_hi[1], v_trunc_hi[3]); + dst[5] = _mm256_packs_epi32(v_trunc_lo[5], v_trunc_lo[7]); + dst[7] = _mm256_packs_epi32(v_trunc_hi[5], v_trunc_hi[7]); + } + else { + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + } + + // TODO: mts cutoff +} +static void fast_inverse_tr_32x4_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_hi_0[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int src = 0; src < 4; ++src) { + v_trunc_lo[src] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), debias, shift); + v_trunc_hi[src] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[1]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[1]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_lo[3]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[2], v_trunc_hi[3]); + + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + v_src[4] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_src[6] = 
_mm256_permute2x128_si256(src[1], src[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + __m256i v_coeff_4 = _mm256_set1_epi64x(c_ptr[4]); + __m256i v_coeff_5 = _mm256_set1_epi64x(c_ptr[5]); + __m256i v_coeff_6 = _mm256_set1_epi64x(c_ptr[6]); + __m256i v_coeff_7 = _mm256_set1_epi64x(c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff_4); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff_5); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff_6); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff_7); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_add[i] = _mm256_add_epi32(v_add_10, v_add_11); + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + 
v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} +static void fast_inverse_tr_32x4_avx2_mts_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + v_add[i] = v_add_10; + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + 
__m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename + const int16_t* hor_coeff = &uvg_g_dct_32_t[0][0]; + if (hor == DST7) { + hor_coeff = &uvg_g_dst7_32_t[0][0]; + } else if (hor == DCT8) { + hor_coeff = &uvg_g_dct8_32[0][0]; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[8]; + if(ver == DCT2) { + fast_inverse_tr_32x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + else { + fast_inverse_tr_32x4_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + + if(hor == DCT2) { + fast_inverse_tr_32x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } + else { + fast_inverse_tr_32x4_avx2_mts_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } +} + + +static void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x8_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same partitioning as for the other 32-wide transforms where the other dimension is 8 or 16. + // However, 1, 2 and 4 parts all seem to produce similar results; increasing the value + // just shifts the pressure from one point to another. +#define NUM_PARTS 4 +#define PART_DIMENSION (8 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got data for 16 vectors, 8 lines 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[4][PART_DIMENSION]; + __m256i v_madd_lo_odd[4][PART_DIMENSION]; + __m256i v_madd_hi_even[4][PART_DIMENSION]; + __m256i v_madd_hi_odd[4][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const __m256i* v_coeff = (const __m256i*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 4; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 8; + v_src_ptr += 4; + } + + // First round of additions + __m256i v_add_lo_even[2][PART_DIMENSION]; + __m256i v_add_hi_even[2][PART_DIMENSION]; + __m256i v_add_lo_odd[2][PART_DIMENSION]; + __m256i v_add_hi_odd[2][PART_DIMENSION]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_add_lo_even[i][ii] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even[i][ii] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd[i][ii] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd[i][ii] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + } + + // Final add and truncate + __m256i v_trunc_lo_even[PART_DIMENSION]; + __m256i v_trunc_hi_even[PART_DIMENSION]; + __m256i v_trunc_lo_odd[PART_DIMENSION]; + __m256i v_trunc_hi_odd[PART_DIMENSION]; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_even[0][ii], v_add_lo_even[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_even[0][ii], v_add_hi_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_odd[0][ii], v_add_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_odd[0][ii], v_add_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16),
v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION + +} + + +static void fast_inverse_tr_32x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[8]; + __m256i v_src_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_lo[d + 1] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 1] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_lo_1[d] = _mm256_madd_epi16(v_src_lo[s + 1], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + v_madd_hi_1[d] = _mm256_madd_epi16(v_src_hi[s + 1], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + __m256i v_add_lo_10 = _mm256_add_epi32(v_madd_lo_1[0], v_madd_lo_1[1]); + __m256i v_add_lo_11 = _mm256_add_epi32(v_madd_lo_1[2], v_madd_lo_1[3]); + + __m256i v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + __m256i v_add_hi_10 = _mm256_add_epi32(v_madd_hi_1[0], v_madd_hi_1[1]); + __m256i v_add_hi_11 = _mm256_add_epi32(v_madd_hi_1[2], v_madd_hi_1[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + __m256i v_trunc_lo_1 = truncate_avx2(_mm256_add_epi32(v_add_lo_10, v_add_lo_11), debias, shift); + + __m256i v_trunc_hi_0 = truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + __m256i v_trunc_hi_1 = truncate_avx2(_mm256_add_epi32(v_add_hi_10, v_add_hi_11), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[4]; + __m256i v_src_hi[4]; + for (int d = 0, s = 0; d < 4; d += 1, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_hi_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 1) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + + __m256i 
v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + + __m256i v_trunc_hi_0 = truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_setzero_si256(); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = skip_line2 == 16 ? 8 : 16; + + int32_t *src_32 = (int32_t*)src; + for (int j = 0; j < line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < limit; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_coeff0 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff2 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff3 = _mm256_loadu_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i madd1 = _mm256_madd_epi16(v_src, v_coeff1); + __m256i madd2 = _mm256_madd_epi16(v_src, v_coeff2); + __m256i madd3 = _mm256_madd_epi16(v_src, v_coeff3); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + res_2 = _mm256_add_epi32(res_2, madd2); + res_3 = _mm256_add_epi32(res_3, madd3); + } + src_32 += limit == 8 ? 8 : 0; + + __m256i v_trunk0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunk1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunk2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunk3 = truncate_avx2(res_3, debias, shift); + + __m256i packed0 = _mm256_packs_epi32(v_trunk0, v_trunk1); + __m256i packed1 = _mm256_packs_epi32(v_trunk2, v_trunk3); + + packed0 = _mm256_permute4x64_epi64(packed0, _MM_SHUFFLE(3, 1, 2, 0)); + packed1 = _mm256_permute4x64_epi64(packed1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, packed0); + _mm256_store_si256((__m256i*)dst + 1, packed1); + dst += 32; + } + + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? 
width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x32_coeff_hor; // TODO: rename this table + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[16]; + if(ver == DCT2 || hor == DCT2) { + fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_inverse_tr_32x8_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (ver != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x16_coeff_ver; + } + + __m256i v_hor_pass_out[32]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same as for 8x32 and 16x32, 4 parts is optimal +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got samples for 32 vectors, 16 lines with 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[8][PART_DIMENSION]; + __m256i v_madd_lo_odd[8][PART_DIMENSION]; + __m256i v_madd_hi_even[8][PART_DIMENSION]; + __m256i v_madd_hi_odd[8][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const int32_t* line_coeff = (const int32_t*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 8; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff); + } + + line_coeff += 16; + v_src_ptr += 4; + } + + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + // First round of additions + __m256i v_add_lo_even_0[4]; + __m256i v_add_hi_even_0[4]; + __m256i v_add_lo_odd_0[4]; + __m256i v_add_hi_odd_0[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_add_lo_even_0[i] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even_0[i] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd_0[i] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd_0[i] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + + // Second round of additions + __m256i v_add_lo_even_1[2]; + __m256i v_add_hi_even_1[2]; + __m256i v_add_lo_odd_1[2]; + __m256i v_add_hi_odd_1[2]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + v_add_lo_even_1[i] = _mm256_add_epi32(v_add_lo_even_0[offset], v_add_lo_even_0[offset + 1]); + v_add_hi_even_1[i] = _mm256_add_epi32(v_add_hi_even_0[offset], v_add_hi_even_0[offset + 1]); + v_add_lo_odd_1[i] = _mm256_add_epi32(v_add_lo_odd_0[offset], v_add_lo_odd_0[offset + 1]); + v_add_hi_odd_1[i] = _mm256_add_epi32(v_add_hi_odd_0[offset], v_add_hi_odd_0[offset + 1]); + } + + // Final add and truncate + __m256i v_trunc_lo_even; + __m256i v_trunc_hi_even; + __m256i v_trunc_lo_odd; + __m256i v_trunc_hi_odd; + v_trunc_lo_even = truncate_avx2(_mm256_add_epi32(v_add_lo_even_1[0], v_add_lo_even_1[1]), debias, shift_2nd); + v_trunc_hi_even = truncate_avx2(_mm256_add_epi32(v_add_hi_even_1[0], v_add_hi_even_1[1]), debias, shift_2nd); + v_trunc_lo_odd = truncate_avx2(_mm256_add_epi32(v_add_lo_odd_1[0], v_add_lo_odd_1[1]), debias, shift_2nd); + v_trunc_hi_odd = truncate_avx2(_mm256_add_epi32(v_add_hi_odd_1[0], v_add_hi_odd_1[1]), debias, shift_2nd); + + + // Permute and store + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even, v_trunc_hi_even); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd, v_trunc_hi_odd); + // Flip the middle 64 bit chunks + 
v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +} + + +static void fast_inverse_tr_32x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = 32 - skip_line; + __m256i temp[32]; + for (int j = 0; j < limit; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 64]; + source[1] = src[j + i * 64 + 32]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + temp[j] = packed; + } + for (int j = limit; j < 32; ++j) { + temp[j] = _mm256_setzero_si256(); + } + transpose_avx2(temp, dst, 16, 32); +} + +static void fast_inverse_tr_32x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + // Do a 32-bit transpose to arrange result from previous pass + __m256i v_tmp32_lo_e[8]; + __m256i v_tmp32_hi_e[8]; + __m256i v_tmp32_lo_o[8]; + __m256i v_tmp32_hi_o[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 4) { + v_tmp32_lo_e[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_hi_e[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_lo_o[d] = _mm256_unpacklo_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + v_tmp32_hi_o[d] = _mm256_unpackhi_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + __m256i v_tmp64_lo_e[8]; + __m256i v_tmp64_hi_e[8]; + __m256i v_tmp64_lo_o[8]; + __m256i v_tmp64_hi_o[8]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_tmp64_lo_e[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_lo_e[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_hi_e[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_hi_e[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_lo_o[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_lo_o[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + + v_tmp64_hi_o[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_hi_o[4 + d] = 
_mm256_unpackhi_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + } + + __m256i v_src[32]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x20); + + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x31); + + v_src[8] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x20); + v_src[9] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x20); + v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x20); + v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x20); + + v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x31); + v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x31); + v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x31); + v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x31); + + v_src[16] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x20); + v_src[17] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x20); + v_src[18] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x20); + v_src[19] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x20); + + v_src[20] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x31); + v_src[21] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x31); + v_src[22] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x31); + v_src[23] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x31); + + v_src[24] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x20); + v_src[25] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x20); + v_src[26] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x20); + v_src[27] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x20); + + v_src[28] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x31); + v_src[29] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x31); + v_src[30] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x31); + v_src[31] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x31); + + __m256i v_trunc[64]; + __m256i* v_src_ptr = v_src; + __m256i* v_tr_ptr = v_trunc; + + + for (int chunk = 0; chunk < 2; ++chunk) { + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + for (int c = 0; c < 32; ++c) { + __m256i v_madd[16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src_ptr[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_1[d] = _mm256_add_epi32(v_add_0[s + 0], v_add_0[s + 1]); + } + + __m256i v_add_2[2]; + for (int d = 0, s = 0; d < 2; ++d, s += 2) { + v_add_2[d] = _mm256_add_epi32(v_add_1[s + 0], 
v_add_1[s + 1]); + } + + v_tr_ptr[c] = truncate_avx2(_mm256_add_epi32(v_add_2[0], v_add_2[1]), debias, shift); + } + v_tr_ptr += 32; + v_src_ptr += 16; + } + + __m256i v_tmp[32]; + __m256i v_result[32]; + for (int i = 0, s = 0; i < 32; ++i, s += 2) { + v_tmp[i] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo[16]; + __m256i v_rtmp32_hi[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_rtmp32_lo[d] = _mm256_unpacklo_epi32(v_tmp[s + 0], v_tmp[s + 1]); + v_rtmp32_hi[d] = _mm256_unpackhi_epi32(v_tmp[s + 0], v_tmp[s + 1]); + } + + __m256i v_rtmp64_lo[16]; + __m256i v_rtmp64_hi[16]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_rtmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_lo[8 + d] = _mm256_unpacklo_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + + v_rtmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_hi[8 + d] = _mm256_unpackhi_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + } + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x20); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x20); + + v_result[8] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x31); + v_result[9] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x31); + v_result[10] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x31); + v_result[11] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x31); + + v_result[12] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x31); + v_result[13] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x31); + v_result[14] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x31); + v_result[15] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x31); + + v_result[16] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x20); + v_result[17] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x20); + v_result[18] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x20); + v_result[19] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x20); + + v_result[20] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x20); + v_result[21] = _mm256_permute2x128_si256(v_rtmp64_lo[14], v_rtmp64_lo[15], 0x20); + v_result[22] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x20); + v_result[23] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x20); + + v_result[24] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x31); + v_result[25] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x31); + v_result[26] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x31); + v_result[27] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x31); + + v_result[28] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x31); + v_result[29] = _mm256_permute2x128_si256(v_rtmp64_lo[14], 
v_rtmp64_lo[15], 0x31); + v_result[30] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x31); + v_result[31] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x31); + + for (int i = 0; i < 32; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: MTS cutoff +} + +static void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_32x16_coeff_ver; + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_32x16_coeff_ver; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_32x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[32 * 32]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + __m256i temp_out[32 * 2]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize out the (shift > 0) check if shift is always > 0
+  const __m256i debias = _mm256_set1_epi32(add);
+  for (int j = 0; j < reduced_line; ++j) {
+    __m256i res_0 = _mm256_setzero_si256();
+    __m256i res_1 = _mm256_setzero_si256();
+    __m256i res_2 = _mm256_setzero_si256();
+    __m256i res_3 = _mm256_setzero_si256();
+    const int16_t* coeff_start = ver_coeff;
+    for (int i = 0; i < 16; ++i) {
+      int16_t source[2];
+      source[0] = v_hor_pass_out[j + i * 64];
+      source[1] = v_hor_pass_out[j + i * 64 + 32];
+      int32_t paired_source;
+      memcpy(&paired_source, source, sizeof(int32_t));
+
+      __m256i v_src = _mm256_set1_epi32(paired_source);
+      __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start);
+      coeff_start += 16;
+      __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start);
+      __m256i v_coeff_2;
+      __m256i v_coeff_3;
+      if (skip_height == 0) {
+        coeff_start += 16;
+        v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start);
+        coeff_start += 16;
+        v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start);
+        coeff_start += 16;
+      }
+      else {
+        coeff_start += 48;
+      }
+
+      __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0);
+      __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1);
+      __m256i madd_2;
+      __m256i madd_3;
+      if (skip_height == 0) {
+        madd_2 = _mm256_madd_epi16(v_src, v_coeff_2);
+        madd_3 = _mm256_madd_epi16(v_src, v_coeff_3);
+      }
+
+      res_0 = _mm256_add_epi32(res_0, madd_0);
+      res_1 = _mm256_add_epi32(res_1, madd_1);
+      if (skip_height == 0) {
+        res_2 = _mm256_add_epi32(res_2, madd_2);
+        res_3 = _mm256_add_epi32(res_3, madd_3);
+      }
+    }
+    __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd);
+    __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd);
+    __m256i v_trunc_2;
+    __m256i v_trunc_3;
+    if (skip_height == 0) {
+      v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd);
+      v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd);
+    }
+
+    v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+    v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0));
+    _mm256_store_si256(temp_out + 2 * j, v_trunc_0);
+    if (skip_height == 0) {
+      v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3);
+      v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0));
+      _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_2);
+    }
+  }
+  transpose_avx2(temp_out, (__m256i*) dst, 32, 32);
+#if 0
+  // 8 is probably best, though difference to 16 is not that large
+#define NUM_PARTS 8
+#define PART_DIMENSION (32 / NUM_PARTS)
+  for (int part = 0; part < NUM_PARTS; ++part) {
+    const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast to 32 bit integer to read 2 coeffs at a time
+    const __m256i* v_src_ptr = v_hor_pass_out;
+
+    __m256i v_madd_lo_e[16][PART_DIMENSION];
+    __m256i v_madd_lo_o[16][PART_DIMENSION];
+    __m256i v_madd_hi_e[16][PART_DIMENSION];
+    __m256i v_madd_hi_o[16][PART_DIMENSION];
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_src_lo_e = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]);
+      __m256i v_src_lo_o = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]);
+      __m256i v_src_hi_e = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]);
+      __m256i v_src_hi_o = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]);
+
+      for (int c = 0; c < PART_DIMENSION; ++c) {
+        const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]);
+        v_madd_lo_e[i][c] = _mm256_madd_epi16(v_src_lo_e, v_coeff);
+        v_madd_lo_o[i][c] = _mm256_madd_epi16(v_src_lo_o, v_coeff);
+        v_madd_hi_e[i][c] = _mm256_madd_epi16(v_src_hi_e, v_coeff);
+        v_madd_hi_o[i][c] =
_mm256_madd_epi16(v_src_hi_o, v_coeff); + } + coeff_ptr += 32; + v_src_ptr += 4; + } + + for (int c = 0; c < PART_DIMENSION; ++c) { + __m256i v_add_lo_e0[8]; + __m256i v_add_lo_o0[8]; + __m256i v_add_hi_e0[8]; + __m256i v_add_hi_o0[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_add_lo_e0[dst] = _mm256_add_epi32(v_madd_lo_e[src + 0][c], v_madd_lo_e[src + 1][c]); + v_add_lo_o0[dst] = _mm256_add_epi32(v_madd_lo_o[src + 0][c], v_madd_lo_o[src + 1][c]); + v_add_hi_e0[dst] = _mm256_add_epi32(v_madd_hi_e[src + 0][c], v_madd_hi_e[src + 1][c]); + v_add_hi_o0[dst] = _mm256_add_epi32(v_madd_hi_o[src + 0][c], v_madd_hi_o[src + 1][c]); + } + + __m256i v_add_lo_e1[4]; + __m256i v_add_lo_o1[4]; + __m256i v_add_hi_e1[4]; + __m256i v_add_hi_o1[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_e1[dst] = _mm256_add_epi32(v_add_lo_e0[src + 0], v_add_lo_e0[src + 1]); + v_add_lo_o1[dst] = _mm256_add_epi32(v_add_lo_o0[src + 0], v_add_lo_o0[src + 1]); + v_add_hi_e1[dst] = _mm256_add_epi32(v_add_hi_e0[src + 0], v_add_hi_e0[src + 1]); + v_add_hi_o1[dst] = _mm256_add_epi32(v_add_hi_o0[src + 0], v_add_hi_o0[src + 1]); + } + + __m256i v_add_lo_e2[2]; + __m256i v_add_lo_o2[2]; + __m256i v_add_hi_e2[2]; + __m256i v_add_hi_o2[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_e2[dst] = _mm256_add_epi32(v_add_lo_e1[src + 0], v_add_lo_e1[src + 1]); + v_add_lo_o2[dst] = _mm256_add_epi32(v_add_lo_o1[src + 0], v_add_lo_o1[src + 1]); + v_add_hi_e2[dst] = _mm256_add_epi32(v_add_hi_e1[src + 0], v_add_hi_e1[src + 1]); + v_add_hi_o2[dst] = _mm256_add_epi32(v_add_hi_o1[src + 0], v_add_hi_o1[src + 1]); + } + + __m256i v_trunc_lo_e = truncate_avx2(_mm256_add_epi32(v_add_lo_e2[0], v_add_lo_e2[1]), debias, shift_2nd); + __m256i v_trunc_lo_o = truncate_avx2(_mm256_add_epi32(v_add_lo_o2[0], v_add_lo_o2[1]), debias, shift_2nd); + __m256i v_trunc_hi_e = truncate_avx2(_mm256_add_epi32(v_add_hi_e2[0], v_add_hi_e2[1]), debias, shift_2nd); + __m256i v_trunc_hi_o = truncate_avx2(_mm256_add_epi32(v_add_hi_o2[0], v_add_hi_o2[1]), debias, shift_2nd); + + __m256i v_result_e = _mm256_packs_epi32(v_trunc_lo_e, v_trunc_hi_e); + __m256i v_result_o = _mm256_packs_epi32(v_trunc_lo_o, v_trunc_hi_o); + + v_result_e = _mm256_permute4x64_epi64(v_result_e, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_o = _mm256_permute4x64_epi64(v_result_o, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_e); + dst += 16; + _mm256_store_si256((__m256i*)dst, v_result_o); + dst += 16; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +#endif + +} + + +static void fast_inverse_tr_32x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[16][4]; + for (int d = 0, s = 0; d < 16; ++d, s += 4) { + v_src[d][0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][1] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][2] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + v_src[d][3] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = 
_mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + if(skip_line == 0) { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i][2], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i][3], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + v_res_2 = _mm256_add_epi32(v_res_2, v_madd_2); + v_res_3 = _mm256_add_epi32(v_res_3, v_madd_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + else { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_setzero_si256(); + } + } +} + +static void fast_inverse_tr_32x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + + // Do a 32 bit transpose on input + __m256i v_tmp32_lo[32]; + __m256i v_tmp32_hi[32]; + for (int d = 0, s = 0; d < 32; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(src[s + 0], src[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(src[s + 1], src[s + 3]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(src[s + 0], src[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(src[s + 1], src[s + 3]); + } + + __m256i v_tmp64_lo[32]; + __m256i v_tmp64_hi[32]; + for (int i = 0; i < 32; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + } + + __m256i v_src[64]; + for (int d = 0, s = 0; d < 64; d += 16, s += 8) { + v_src[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x20); + v_src[d + 1] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x20); + v_src[d + 2] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x20); + v_src[d + 3] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x20); + + 
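+    // _mm256_permute2x128_si256 selects whole 128-bit lanes: immediate 0x20
+    // concatenates the low lanes of its two sources ([a.lo | b.lo]) and 0x31
+    // the high lanes ([a.hi | b.hi]). Together with the epi32/epi64 unpacks
+    // above, this completes the 32-bit transpose of the vertical-pass output.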
v_src[d + 4] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x31); + v_src[d + 5] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x31); + v_src[d + 6] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x31); + v_src[d + 7] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x31); + + v_src[d + 8] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x20); + v_src[d + 9] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x20); + v_src[d + 10] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x20); + v_src[d + 11] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x20); + + v_src[d + 12] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x31); + v_src[d + 13] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x31); + v_src[d + 14] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x31); + v_src[d + 15] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x31); + } + + __m256i v_tmp[64]; + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = _mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i + 0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i + 16], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i + 32], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i + 48], v_coeff); + + v_res_0 = _mm256_add_epi32(v_madd_0, v_res_0); + v_res_1 = _mm256_add_epi32(v_madd_1, v_res_1); + v_res_2 = _mm256_add_epi32(v_madd_2, v_res_2); + v_res_3 = _mm256_add_epi32(v_madd_3, v_res_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + + for (int i = 0; i < 64; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[64]; + transpose_avx2(v_tmp, v_result, 32, 32); + + for (int i = 0; i < 64; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +static void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0);
+
+  const int32_t shift_1st = INVERSE_SHIFT_1ST;
+  const int32_t shift_2nd = INVERSE_SHIFT_2ND;
+
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
+  const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor;
+  if (hor == DST7) {
+    hor_coeff = fi_dst7_32xN_coeff_hor;
+  } else if (hor == DCT8) {
+    hor_coeff = fi_dct8_32xN_coeff_hor;
+  }
+  if (ver == DST7) {
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
+  } else if (ver == DCT8) {
+    ver_coeff = &uvg_g_dct8_32[0][0];
+  }
+
+  __m256i v_ver_pass_out[64];
+  fast_inverse_tr_32x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height);
+
+  fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width);
+}
+
+
+static dct_full_pass* dct_function_table[6][6] = {
+  { NULL,                      NULL,                      fast_forward_tr_2x8_avx2,  fast_forward_tr_2x16_avx2,  fast_forward_tr_2x32_avx2,  NULL },
+  { NULL,                      fast_forward_tr_4x4_avx2,  fast_forward_tr_4x8_avx2,  fast_forward_tr_4x16_avx2,  fast_forward_tr_4x32_avx2,  NULL },
+  { fast_forward_tr_8x2_avx2,  fast_forward_tr_8x4_avx2,  fast_forward_tr_8x8_avx2,  fast_forward_tr_8x16_avx2,  fast_forward_tr_8x32_avx2,  NULL },
+  { fast_forward_tr_16x2_avx2, fast_forward_tr_16x4_avx2, fast_forward_tr_16x8_avx2, fast_forward_tr_16x16_avx2, fast_forward_tr_16x32_avx2, NULL },
+  { fast_forward_tr_32x2_avx2, fast_forward_tr_32x4_avx2, fast_forward_tr_32x8_avx2, fast_forward_tr_32x16_avx2, fast_forward_tr_32x32_avx2, NULL },
+  { NULL,                      NULL,                      NULL,                      NULL,                       NULL,                       NULL }
+};
+
+
+static dct_full_pass* idct_function_table[6][6] = {
+  { NULL,                      NULL,                      fast_inverse_tr_2x8_avx2,  fast_inverse_tr_2x16_avx2,  fast_inverse_tr_2x32_avx2,  NULL },
+  { NULL,                      fast_inverse_tr_4x4_avx2,  fast_inverse_tr_4x8_avx2,  fast_inverse_tr_4x16_avx2,  fast_inverse_tr_4x32_avx2,  NULL },
+  { fast_inverse_tr_8x2_avx2,  fast_inverse_tr_8x4_avx2,  fast_inverse_tr_8x8_avx2,  fast_inverse_tr_8x16_avx2,  fast_inverse_tr_8x32_avx2,  NULL },
+  { fast_inverse_tr_16x2_avx2, fast_inverse_tr_16x4_avx2, fast_inverse_tr_16x8_avx2, fast_inverse_tr_16x16_avx2, fast_inverse_tr_16x32_avx2, NULL },
+  { fast_inverse_tr_32x2_avx2, fast_inverse_tr_32x4_avx2, fast_inverse_tr_32x8_avx2, fast_inverse_tr_32x16_avx2, fast_inverse_tr_32x32_avx2, NULL },
+  { NULL,                      NULL,                      NULL,                      NULL,                       NULL,                       NULL },
+};
+
 extern void uvg_get_tr_type(
   int8_t width,
+  int8_t height,
   color_t color,
   const cu_info_t* tu,
   tr_type_t* hor_out,
   tr_type_t* ver_out,
-  const int8_t mts_idx);
+  const int8_t mts_type);
 
 static void mts_dct_avx2(
   const int8_t bitdepth,
   const color_t color,
   const cu_info_t* tu,
   const int8_t width,
+  const int8_t height,
   const int16_t* input,
   int16_t* output,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
   tr_type_t type_hor;
   tr_type_t type_ver;
 
-  uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
+  uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);
 
-  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx)
+  if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && width == height)
   {
-    dct_func* dct_func = uvg_get_dct_func(width, color, tu->type);
+    dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type);
     dct_func(bitdepth, input, output);
   }
-  else
-  {
-    const int log2_width_minus2 = uvg_g_convert_to_bit[width];
-
-    tr_func* dct = dct_table[log2_width_minus2];
-
-    dct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx);
+  else {
+    const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+    const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+    // Transforms with a length-1 dimension are handled separately, since their interface differs from the other full-pass functions
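+    // The remaining shapes dispatch below through dct_function_table indexed
+    // by log2 dimensions, e.g. a 16x8 block gives log2_width_minus1 = 3 and
+    // log2_height_minus1 = 2, selecting fast_forward_tr_16x8_avx2.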
+    if (height == 1) {
+      if (width == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0);
+      } else if (width == 32) {
+        fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0);
+      }
+    }
+    else if (width == 1) {
+      if (height == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0);
+      } else if (height == 32) {
+        fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0);
+      }
+    }
+    else {
+      dct_full_pass* dct_func = dct_function_table[log2_width_minus1][log2_height_minus1];
+      dct_func(input, output, type_hor, type_ver);
+    }
   }
 }
@@ -1620,27 +7999,45 @@ static void mts_idct_avx2(
   const int8_t bitdepth,
   const color_t color,
   const cu_info_t* tu,
   const int8_t width,
+  const int8_t height,
   const int16_t* input,
   int16_t* output,
-  const int8_t mts_idx)
+  const int8_t mts_type)
 {
   tr_type_t type_hor;
   tr_type_t type_ver;
 
-  uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx);
+  uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type);
 
-  if (type_hor == DCT2 && type_ver == DCT2)
+  if (type_hor == DCT2 && type_ver == DCT2 && width == height)
   {
-    dct_func* idct_func = uvg_get_idct_func(width, color, tu->type);
+    dct_func* idct_func = uvg_get_idct_func(width, height, color, tu->type);
     idct_func(bitdepth, input, output);
   }
-  else
-  {
-    const int log2_width_minus2 = uvg_g_convert_to_bit[width];
-
-    tr_func* idct = idct_table[log2_width_minus2];
-
-    idct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx);
+  else {
+    const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+    const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+    // Transforms with a length-1 dimension can be handled with the existing forward functions
+    if (height == 1) {
+      if (width == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0);
+        _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0)));
+      } else if (width == 32) {
+        fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0);
+      }
+    }
+    else if (width == 1) {
+      if (height == 16) {
+        fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ?
fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); + _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); + } else if (height == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0); + } + } + else { + dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; + idct_func(input, output, type_hor, type_ver); + } } } @@ -1653,14 +8050,14 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8){ - success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); - success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); + // success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h new file mode 100644 index 00000000..5d02b617 --- /dev/null +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -0,0 +1,4827 @@ +#ifndef DCT_AVX2_TABLES_H +#define DCT_AVX2_TABLES_H + +#include "global.h" + +// Shuffle tables for simple avx2 functions + +ALIGNED(32) static const int32_t ff_dct2_b4_permute_0[8] = { 0, 2, 4, 6, 0, 2, 4, 6 }; +ALIGNED(32) static const int32_t ff_dct2_b4_permute_1[8] = { 1, 3, 5, 7, 1, 3, 5, 7 }; + +ALIGNED(32) static const int32_t fi_dct2_b4_permute_0[8] = { 0, 0, 0, 0, 2, 2, 2, 2 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_1[8] = { 4, 4, 4, 4, 6, 6, 6, 6 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_2[8] = { 1, 1, 1, 1, 3, 3, 3, 3 }; +ALIGNED(32) static const int32_t fi_dct2_b4_permute_3[8] = { 5, 5, 5, 5, 7, 7, 7, 7 }; + +ALIGNED(32) static const int32_t ff_dct2_b32_permute[8][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 3, 3, 3, 3}, + {4, 4, 4, 4, 4, 4, 4, 4}, + {5, 5, 5, 5, 5, 5, 5, 5}, + {6, 6, 6, 6, 6, 6, 6, 6}, + {7, 7, 7, 7, 7, 7, 7, 7}, +}; + + +// Coeff tables for simple avx2 functions + +ALIGNED(32) static const int16_t fast_forward_dct2_b2_coeff[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +}; + +static const int16_t* fast_inverse_dct2_b2_coeff = fast_forward_dct2_b2_coeff; // Inverse coeffs for this transform are same as forward + +// Coeff arrays for B4 +ALIGNED(32) static const int16_t fast_forward_dct2_b4_coeff[64] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83, +-36, -83, -36, -83, 
-36, -83, -36, -83, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b4_coeff[64] = { + 29, 55, 29, 55, 29, 55, 29, 55, 84, -29, 84, -29, 84, -29, 84, -29, + 74, 84, 74, 84, 74, 84, 74, 84, -74, 55, -74, 55, -74, 55, -74, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 55, -84, 55, -84, 55, -84, 55, -84, + 0, -74, 0, -74, 0, -74, 0, -74, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b4_coeff[64] = { + 84, 74, 84, 74, 84, 74, 84, 74, 55, -74, 55, -74, 55, -74, 55, -74, + 55, 29, 55, 29, 55, 29, 55, 29, -29, 84, -29, 84, -29, 84, -29, 84, + 74, 0, 74, 0, 74, 0, 74, 0, 29, -74, 29, -74, 29, -74, 29, -74, +-74, -74, -74, -74, -74, -74, -74, -74, 84, -55, 84, -55, 84, -55, 84, -55, +}; + +// Coeff arrays for inverse B4 +ALIGNED(32) static const int16_t fast_inverse_dct2_b4_coeff[64] = { + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b4_coeff[64] = { + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, +}; + +ALIGNED(32) static const int16_t fast_inverse_dct8_b4_coeff[64] = { + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, +}; + +// Coeff arrays for forward B8 +ALIGNED(32) static const int16_t fast_forward_dct2_b8_coeff[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b8_coeff[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71, +-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b8_coeff[128] = { + 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, + 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, + 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, + 32, 17, -78, -46, 85, 71, -46, -85, 
32, 17, -78, -46, 85, 71, -46, -85, + 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46, +-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85, + 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78, +-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +// Coeff arrays for inverse B8 +ALIGNED(32) static const int16_t fast_inverse_dct2_b8_coeff[128] = { + 64, 89, 64, 75, 64, 50, 64, 18, 64, 89, 64, 75, 64, 50, 64, 18, + 83, 75, 36, -18, -36, -89, -83, -50, 83, 75, 36, -18, -36, -89, -83, -50, + 64, 50, -64, -89, -64, 18, 64, 75, 64, 50, -64, -89, -64, 18, 64, 75, + 36, 18, -83, -50, 83, 75, -36, -89, 36, 18, -83, -50, 83, 75, -36, -89, + 64, -18, 64, -50, 64, -75, 64, -89, 64, -18, 64, -50, 64, -75, 64, -89, +-83, 50, -36, 89, 36, 18, 83, -75, -83, 50, -36, 89, 36, 18, 83, -75, + 64, -75, -64, -18, -64, 89, 64, -50, 64, -75, -64, -18, -64, 89, 64, -50, +-36, 89, 83, -75, -83, 50, 36, -18, -36, 89, 83, -75, -83, 50, 36, -18, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b8_coeff[128] = { + 17, 46, 32, 78, 46, 86, 60, 71, 17, 46, 32, 78, 46, 86, 60, 71, + 71, 85, 85, 46, 32, -60, -46, -78, 71, 85, 85, 46, 32, -60, -46, -78, + 86, 78, -17, -71, -85, -17, 32, 85, 86, 78, -17, -71, -85, -17, 32, 85, + 60, 32, -86, -60, 71, 78, -17, -86, 60, 32, -86, -60, 71, 78, -17, -86, + 71, 32, 78, -17, 85, -60, 86, -85, 71, 32, 78, -17, 85, -60, 86, -85, +-86, 17, -60, 86, 17, 32, 78, -71, -86, 17, -60, 86, 17, 32, 78, -71, + 78, -60, -46, -32, -71, 86, 60, -46, 78, -60, -46, -32, -71, 86, 60, -46, +-46, 85, 85, -71, -78, 46, 32, -17, -46, 85, 85, -71, -78, 46, 32, -17, +}; + +static const int16_t* fast_inverse_dct8_b8_coeff = fast_forward_dct8_b8_coeff; // The table used in forward transform works with inverse also. 
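+// Reusing the forward table works because the DCT-8 basis matrix is
+// symmetric (coeff[k][n] == coeff[n][k]), so the transpose applied by the
+// inverse pass is the matrix itself; DST-7 lacks this symmetry, which is why
+// it gets separate fast_inverse_* tables. A sketch of the property, assuming
+// the plain 8-point DCT-8 matrix uvg_g_dct8_8 from dct.c (illustrative only,
+// not compiled):
+#if 0
+#include <assert.h>
+static void check_dct8_b8_symmetry(void)
+{
+  for (int k = 0; k < 8; ++k)
+    for (int n = 0; n < 8; ++n)
+      assert(uvg_g_dct8_8[k][n] == uvg_g_dct8_8[n][k]);
+}
+#endif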
+ +// Coeff arrays for forward B16 +ALIGNED(32) static const int16_t fast_forward_dct2_b16_coeff[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b16_coeff[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, -77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b16_coeff[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, -68, 88, 
-55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + +// Coeff arrays for inverse B16 +ALIGNED(32) static const int16_t fast_inverse_dct2_b16_coeff[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, -9, 64, -25, 64, -43, 64, -57, + 89, 87, 75, 57, 50, 9, 18, -43, -89, 25, -75, 70, -50, 90, -18, 80, + 83, 80, 36, 9, -36, -70, -83, -87, 83, -43, 36, -90, -36, -57, -83, 25, + 75, 70, -18, -43, -89, -87, -50, 9, -75, 57, 18, 80, 89, -25, 50, -90, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -70, -64, -43, -64, 87, 64, 9, + 50, 43, -89, -90, 18, 57, 75, 25, -50, 80, 89, -9, -18, -70, -75, 87, + 36, 25, -83, -70, 83, 90, -36, -80, 36, -87, -83, 57, 83, -9, -36, -43, + 18, 9, -50, -25, 75, 43, -89, -57, -18, 90, 50, -87, -75, 80, 89, -70, + 64, 57, 64, 43, 64, 25, 64, 9, 64, -70, 64, -80, 64, -87, 64, -90, + -18, -80, -50, -90, -75, -70, -89, -25, 18, 43, 50, -9, 75, -57, 89, -87, + -83, -25, -36, 57, 36, 90, 83, 43, -83, 87, -36, 70, 36, -9, 83, -80, + 50, 90, 89, 25, 18, -80, -75, -57, -50, -9, -89, 87, -18, 43, 75, -70, + 64, -9, -64, -87, -64, 43, 64, 70, 64, -90, -64, 25, -64, 80, 64, -57, + -75, -87, -18, 70, 89, 9, -50, -80, 75, -25, 18, -57, -89, 90, 50, -43, + -36, 43, 83, 9, -83, -57, 36, 87, -36, 80, 83, -90, -83, 70, 36, -25, + 89, 70, -75, -80, 50, 87, -18, -90, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b16_coeff[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 68, 48, 73, 25, 77, 0, 81, -25, // 0 + 40, 55, 73, 87, 88, 81, 85, 40, -81, -25, -88, 33, -77, 77, -48, 88, + 68, 77, 88, 77, 48, 0, -25, -77, 88, 0, 68, -77, 0, -77, -68, 0, + 85, 88, 55, 25, -48, -81, -87, -48, -88, 25, -17, 88, 77, 0, 68, -88, + 88, 87, -8, -40, -88, -68, 17, 73, 81, -48, -40, -62, -77, 77, 48, 25, + 81, 73, -68, -85, -25, 25, 88, 55, -68, 68, 81, 8, 0, -77, -81, 81, + 62, 48, -88, -81, 68, 88, -8, -68, 48, -81, -87, 48, 77, 0, -25, -48, + 33, 17, -62, -33, 81, 48, -88, -62, -25, 88, 55, -85, -77, 77, 88, -68, + 40, 88, 48, 88, 55, 81, 62, 68, 85, -48, 87, -68, 88, -81, 88, -88, // 8 + 62, -17, 25, -68, -17, -88, -55, -73, -8, 62, 33, 8, 68, -48, 87, -85, +-81, -77, -81, 0, -25, 77, 48, 77, -88, 77, -48, 77, 25, 0, 81, -77, + -8, 68, 81, 68, 62, -48, -40, -81, -33, -25, -88, 81, -25, 48, 73, -68, + 87, 33, -25, -88, -85, 8, 33, 85, 73, -88, -55, 17, -68, 81, 62, -55, +-48, -88, -48, 48, 88, 33, -25, -87, 68, -17, 25, -62, -88, 88, 48, -40, +-55, 25, 88, 25, -73, -68, 17, 88, -40, 81, 85, -88, -81, 68, 33, -25, + 85, 73, -68, -81, 40, 87, -8, -88, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +static const int16_t* fast_inverse_dct8_b16_coeff = fast_forward_dct8_b16_coeff; + +// Coeff arrays for forward B32 +ALIGNED(32) static const int16_t fast_forward_dct2_b32_coeff[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 
13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, 
-38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t fast_forward_dst7_b32_coeff[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 
90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t fast_forward_dct8_b32_coeff[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 
30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +// Coeff arrays for inverse B32 +ALIGNED(32) static const int16_t fast_inverse_dct2_b32_coeff[1024] = { + 64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, // 0 + 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, + 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, + -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, + 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, // 8 
+-89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, +-89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, + 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, +-25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, // 16 + 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, + 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, +-43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, + 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, // 24 +-75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, +-75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, + 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, +-57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, // 32 + 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, + 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, +-70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, + 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 54, -75, -90, -18, 31, 89, 61, -50, -88, // 40 +-50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, +-50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, + 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, +-80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, // 48 + 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, + 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, +-87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, + 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, // 56 +-18, 54, 50, -31, -75, 4, 89, 
22, -89, -46, 75, 67, -50, -82, 18, 90, + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, +-18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, + 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, +-90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + +ALIGNED(32) static const int16_t fast_inverse_dst7_b32_coeff[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, // 0 + 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, + 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, + 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, +-21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, // 8 +-63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, +-89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, +-53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, + 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, // 16 + 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, + 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, + 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, + 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, -21, 21, 74, 90, 74, 26, -21, -77, // 24 +-89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, +-84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, + 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, +-42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, // 32 + 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, + 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, +-34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, + 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, // 40 +-77, -13, 74, 84, 17, -68, -87, -17, 
53, 85, 46, -66, -89, -21, 26, 86, + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, +-60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, + 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, +-74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, // 48 + 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, + 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, +-78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, + 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, // 56 +-30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, +-21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, + 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, +-89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + +static const int16_t* fast_inverse_dct8_b32_coeff = fast_forward_dct8_b32_coeff; + + +// Shuffle tables for advanced and optimized avx2 functions + +// Shuffle 16 bit samples inside lanes. Put each sample four spaces from each other adjacent to each other. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | XX +// Output [0 4 1 5 2 6 3 7 | XX +ALIGNED(32) static const int8_t shuffle_16b_0415[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +// Shuffle 16 bit samples inside lanes. Put each even indexed sample next to each other, then each odd sample. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | +// Output [0 2 4 6 1 3 5 7 | +ALIGNED(32) static const int8_t shuffle_16b_0246[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +// Permute 32 bit samples across lanes. Put each sample four spaces from each other adjacent to each other. 
+// _mm256_permutevar8x32_epi32
+// Input [0 1 2 3 | 4 5 6 7]
+// Output [0 4 1 5 | 2 6 3 7]
+ALIGNED(32) static const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+
+ static const int8_t* fi_tr_2x8_shuffle_hor = shuffle_16b_0415;
+
+ALIGNED(32) static const int8_t fi_tr_2x8_result_shuffle1_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t ff_dct2_2x8_shuffle_ver[32] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+ 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31
+};
+
+ALIGNED(32) static const int8_t ff_dct2_2x8_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+ 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31
+};
+
+ALIGNED(32) static const int8_t fi_tr_2x8_result_shuffle2_ver[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ALIGNED(32) static const int8_t ff_dct2_2x16_ver_result_shuffle[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_4x4_shuffle_hor[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_4x4_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_4x8_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t ff_dct2_8x2_ver_pass_shuffle[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
+ALIGNED(32) static const int8_t fi_tr_8x2_shuffle_hor[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+ALIGNED(32) static const int8_t fi_tr_8x2_shuffle_ver[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ static const int8_t* fi_tr_8x2_res_shuffle_ver = shuffle_16b_0415;
+
+ALIGNED(32) static const int8_t ff_dct2_8x4_ver_pass_shuffle[32] = {
+ 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15,
+ 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15,
+};
+
+// TODO: remove duplicate tables. Rename with a more descriptive name.
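To make these index tables concrete, here is a minimal standalone sketch (not part of the patch) that feeds counting input through shuffle_16b_0415 and permute_32b_0415 with the corresponding AVX2 intrinsics. It assumes a C11 compiler with AVX2 enabled, and uses _Alignas(32) in place of the codebase's ALIGNED(32) macro; the table contents are copied verbatim from above.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Tables copied from the patch; _Alignas(32) stands in for ALIGNED(32).
static _Alignas(32) const int8_t shuffle_16b_0415[32] = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};
static _Alignas(32) const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

int main(void)
{
  _Alignas(32) int16_t s16[16];
  _Alignas(32) int32_t s32[8];
  for (int i = 0; i < 16; ++i) s16[i] = (int16_t)i; // lanes hold 0..7 | 8..15
  for (int i = 0; i < 8; ++i)  s32[i] = i;          // lanes hold 0..3 | 4..7

  // In-lane byte shuffle: 16-bit samples four positions apart become adjacent.
  __m256i a = _mm256_shuffle_epi8(_mm256_load_si256((const __m256i *)s16),
                                  _mm256_load_si256((const __m256i *)shuffle_16b_0415));
  _mm256_store_si256((__m256i *)s16, a);
  for (int i = 0; i < 16; ++i) printf("%d ", s16[i]); // 0 4 1 5 2 6 3 7 8 12 9 13 10 14 11 15
  printf("\n");

  // Cross-lane dword permute: the same pairing, but spanning the lane boundary.
  __m256i b = _mm256_permutevar8x32_epi32(_mm256_load_si256((const __m256i *)s32),
                                          _mm256_load_si256((const __m256i *)permute_32b_0415));
  _mm256_store_si256((__m256i *)s32, b);
  for (int i = 0; i < 8; ++i) printf("%d ", s32[i]);  // 0 4 1 5 2 6 3 7
  printf("\n");
  return 0;
}

The point of keeping these patterns in static constant tables is that every transpose or interleave step in the transform passes compiles down to a single vpshufb or vpermd on a preloaded constant rather than scalar reordering. The remaining shuffle and coefficient tables below follow the same conventions.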
+ALIGNED(32) static const int8_t ff_dct2_8x4_ver_pass_result_shuffle[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, +}; + +ALIGNED(32) static const int8_t ff_dct2_8x16_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) static const int8_t ff_dct2_8x16_butterfly_shuffle_order[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +// Arrange samples into butterfly formation +ALIGNED(32) static const int8_t ff_dct2_16x8_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +// Swap two middle 16-bit values in each 64-bit chunk +ALIGNED(32) static const int8_t ff_dct2_16x8_butterfly_res_shuffle_ver[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +ALIGNED(32) static const int8_t ff_dct2_16x32_reverse_64b_order[32] = { + 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, + 22, 23, 20, 21, 18, 19, 16, 17, 30, 31, 28, 29, 26, 27, 24, 25, +}; + +ALIGNED(32) static const int8_t ff_dct2_32x2_butterfly_order_shuffle[32] = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 +}; + +ALIGNED(32) static const int8_t ff_dct2_32x8_shuffle_order[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) static const int8_t ff_dct2_32x8_shuffle_result[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +}; + + +// Coeff tables for advanced and optimized avx2 functions + +// 2xN +ALIGNED(32) static const int16_t ff_dct2_2xN_coeff_hor[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 +}; + +ALIGNED(32) static const int16_t ff_dct2_2x8_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, -18, -50, -75, -89, 89, 75, 50, 18, -18, -50, -75, -89, + 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, + 75, -18, -89, -50, 50, 89, 18, -75, 75, -18, -89, -50, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, -75, -18, 89, -50, 50, -89, 18, 75, -75, -18, 89, -50, + 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, + 18, -50, 75, -89, 89, -75, 50, -18, 18, -50, 75, -89, 89, -75, 50, -18 +}; + +ALIGNED(32) static +const int16_t ff_dst7_2x8_coeff_ver[128] = { + 17, 32, 46, 60, 71, 78, 85, 86, 17, 32, 46, 60, 71, 78, 85, 86, + 46, 78, 86, 71, 32, -17, -60, -85, 46, 78, 86, 71, 32, -17, -60, -85, + 71, 85, 32, -46, -86, -60, 17, 78, 71, 85, 32, -46, -86, -60, 17, 78, + 85, 46, -60, -78, 17, 86, 32, -71, 85, 46, -60, -78, 17, 86, 32, -71, + 86, -17, -85, 32, 78, -46, -71, 60, 86, -17, -85, 32, 78, -46, -71, 60, + 78, -71, -17, 85, -60, -32, 86, -46, 78, -71, -17, 85, -60, -32, 86, -46, + 60, -86, 71, -17, -46, 85, -78, 32, 60, -86, 71, -17, -46, 85, -78, 32, + 32, -60, 78, -86, 85, -71, 46, -17, 32, -60, 78, -86, 85, -71, 46, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_2x8_coeff_ver[128] = { 
+ 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18, + 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 18, -83, -50, 64, 50, -36, -89, 64, 18, -83, -50, +-64, 18, 83, 75, 64, 75, -36, -89, -64, 18, 83, 75, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -50, -36, 89, 64, -18, -83, 50, 64, -50, -36, 89, + 64, -75, -36, 89, -64, -18, 83, -75, 64, -75, -36, 89, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -89, 83, -75, 64, -75, 36, 18, 64, -89, 83, -75, +-64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18, +}; + +ALIGNED(32) static const int16_t fi_dst7_2x8_coeff_ver[128] = { + 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46, + 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60, + 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78, +-85, -17, 71, 78, 32, 85, -17, -86, -85, -17, 71, 78, 32, 85, -17, -86, + 71, 32, -86, 17, 78, -17, -60, 86, 71, 32, -86, 17, 78, -17, -60, 86, + 78, -60, -46, 85, -46, -32, 85, -71, 78, -60, -46, 85, -46, -32, 85, -71, + 85, -60, 17, 32, 86, -85, 78, -71, 85, -60, 17, 32, 86, -85, 78, -71, +-71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17, +}; + +ALIGNED(32) static const int16_t fi_dct8_2x8_coeff_ver[128] = { + 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32, + 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46, + 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17, +-46, 32, 85, 71, 78, 60, -46, -85, -46, 32, 85, 71, 78, 60, -46, -85, + 60, -71, -46, 78, 46, -86, 32, 60, 60, -71, -46, 78, 46, -86, 32, 60, + 32, -85, -17, 86, -85, 17, 71, -78, 32, -85, -17, 86, -85, 17, 71, -78, + 32, -78, 85, -46, 17, -46, 71, -85, 32, -78, 85, -46, 17, -46, 71, -85, +-17, 71, -86, 60, 86, -78, 60, -32, -17, 71, -86, 60, 86, -78, 60, -32, +}; + + + +ALIGNED(32) static const int16_t fi_dct2_2x16_coeff_ver[512] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, -64, -80, -89, -90, -64, -80, -89, -90, + 36, 9, -18, -43, 36, 9, -18, -43, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, -64, -25, 18, 57, -64, -25, 18, 57, +-36, -70, -89, -87, -36, -70, -89, -87, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 90, 75, 25, 64, 90, 75, 25, +-83, -87, -50, 9, -83, -87, -50, 9, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, -9, -75, -87, 64, -9, -75, -87, // 8 +-83, -25, 50, 90, -83, -25, 50, 90, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, -64, -87, -18, 70, -64, -87, -18, 70, +-36, 57, 89, 25, -36, 57, 89, 25, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, -64, 43, 89, 9, -64, 43, 89, 9, + 36, 90, 18, -80, 36, 90, 18, -80, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 70, -50, -80, 64, 70, -50, -80, + 83, 43, -75, -57, 83, 43, -75, -57, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -70, -50, 80, 64, -70, -50, 80, // 16 + 83, -43, -75, 57, 83, -43, -75, 57, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, -64, -43, 89, -9, -64, -43, 89, -9, + 36, -90, 18, 80, 36, -90, 18, 80, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, -64, 87, -18, -70, -64, 87, -18, -70, +-36, -57, 89, -25, -36, -57, 89, -25, 83, -9, -75, 80, 83, -9, 
-75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, 9, -75, 87, 64, 9, -75, 87, +-83, 25, 50, -90, -83, 25, 50, -90, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -90, 75, -25, 64, -90, 75, -25, // 24 +-83, 87, -50, -9, -83, 87, -50, -9, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, -64, 25, 18, -57, -64, 25, 18, -57, +-36, 70, -89, 87, -36, 70, -89, 87, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, -64, 80, -89, 90, -64, 80, -89, 90, + 36, -9, -18, 43, 36, -9, -18, 43, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -57, 50, -43, 64, -57, 50, -43, + 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_2x16_coeff_ver[512] = { + 8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85, + 88, 77, 55, 25, 88, 77, 55, 25, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, -88, -68, -25, 25, -88, -68, -25, 25, + 48, 0, -48, -81, 48, 0, -48, -81, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 17, 73, 88, 55, 17, 73, 88, 55, +-25, -77, -87, -48, -25, -77, -87, -48, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 87, 33, -48, -88, 87, 33, -48, -88, // 8 +-81, -77, -8, 68, -81, -77, -8, 68, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, -25, -88, -48, 48, -25, -88, -48, 48, +-81, 0, 81, 68, -81, 0, 81, 68, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, -85, 8, 88, 33, -85, 8, 88, 33, +-25, 77, 62, -48, -25, 77, 62, -48, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 33, 85, -25, -87, 33, 85, -25, -87, + 48, 77, -40, -81, 48, 77, -40, -81, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 81, -48, -68, 68, 81, -48, -68, 68, // 16 + 88, 0, -88, 25, 88, 0, -88, 25, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, -40, -62, 81, 8, -40, -62, 81, 8, + 68, -77, -17, 88, 68, -77, -17, 88, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, -77, 77, 0, -77, -77, 77, 0, -77, + 0, -77, 77, 0, 0, -77, 77, 0, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 48, 25, -81, 81, 48, 25, -81, 81, +-68, 0, 68, -88, -68, 0, 68, -88, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 73, -88, 68, -17, 73, -88, 68, -17, // 24 +-88, 77, -33, -25, -88, 77, -33, -25, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, -55, 17, 25, -62, -55, 17, 25, -62, +-48, 77, -88, 81, -48, 77, -88, 81, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, -68, 81, -88, 88, -68, 81, -88, 88, + 25, 0, -25, 48, 25, 0, -25, 48, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 62, -55, 48, -40, 62, -55, 48, -40, + 81, -77, 73, -68, 81, -77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct2_2x32_coeff_ver[2048] = { + 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 + 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, + 64, 61, 57, 54, 50, 46, 43, 38, 64, 61, 57, 54, 50, 46, 43, 38, + 36, 31, 25, 22, 18, 13, 9, 4, 36, 31, 25, 22, 18, 13, 9, 4, + 64, 90, 87, 82, 75, 67, 57, 46, 64, 90, 87, 82, 75, 67, 57, 46, + 36, 22, 9, -4, -18, -31, -43, 
-54, 36, 22, 9, -4, -18, -31, -43, -54, +-64, -73, -80, -85, -89, -90, -90, -88, -64, -73, -80, -85, -89, -90, -90, -88, +-83, -78, -70, -61, -50, -38, -25, -13, -83, -78, -70, -61, -50, -38, -25, -13, + 64, 88, 80, 67, 50, 31, 9, -13, 64, 88, 80, 67, 50, 31, 9, -13, // 8 +-36, -54, -70, -82, -89, -90, -87, -78, -36, -54, -70, -82, -89, -90, -87, -78, +-64, -46, -25, -4, 18, 38, 57, 73, -64, -46, -25, -4, 18, 38, 57, 73, + 83, 90, 90, 85, 75, 61, 43, 22, 83, 90, 90, 85, 75, 61, 43, 22, + 64, 85, 70, 46, 18, -13, -43, -67, 64, 85, 70, 46, 18, -13, -43, -67, +-83, -90, -87, -73, -50, -22, 9, 38, -83, -90, -87, -73, -50, -22, 9, 38, + 64, 82, 90, 88, 75, 54, 25, -4, 64, 82, 90, 88, 75, 54, 25, -4, +-36, -61, -80, -90, -89, -78, -57, -31, -36, -61, -80, -90, -89, -78, -57, -31, + 64, 82, 57, 22, -18, -54, -80, -90, 64, 82, 57, 22, -18, -54, -80, -90, // 16 +-83, -61, -25, 13, 50, 78, 90, 85, -83, -61, -25, 13, 50, 78, 90, 85, + 64, 31, -9, -46, -75, -90, -87, -67, 64, 31, -9, -46, -75, -90, -87, -67, +-36, 4, 43, 73, 89, 88, 70, 38, -36, 4, 43, 73, 89, 88, 70, 38, + 64, 78, 43, -4, -50, -82, -90, -73, 64, 78, 43, -4, -50, -82, -90, -73, +-36, 13, 57, 85, 89, 67, 25, -22, -36, 13, 57, 85, 89, 67, 25, -22, +-64, -88, -87, -61, -18, 31, 70, 90, -64, -88, -87, -61, -18, 31, 70, 90, + 83, 54, 9, -38, -75, -90, -80, -46, 83, 54, 9, -38, -75, -90, -80, -46, + 64, 73, 25, -31, -75, -90, -70, -22, 64, 73, 25, -31, -75, -90, -70, -22, // 24 + 36, 78, 90, 67, 18, -38, -80, -90, 36, 78, 90, 67, 18, -38, -80, -90, +-64, -13, 43, 82, 89, 61, 9, -46, -64, -13, 43, 82, 89, 61, 9, -46, +-83, -88, -57, -4, 50, 85, 87, 54, -83, -88, -57, -4, 50, 85, 87, 54, + 64, 67, 9, -54, -89, -78, -25, 38, 64, 67, 9, -54, -89, -78, -25, 38, + 83, 85, 43, -22, -75, -90, -57, 4, 83, 85, 43, -22, -75, -90, -57, 4, + 64, 90, 70, 13, -50, -88, -80, -31, 64, 90, 70, 13, -50, -88, -80, -31, + 36, 82, 87, 46, -18, -73, -90, -61, 36, 82, 87, 46, -18, -73, -90, -61, + 64, 61, -9, -73, -89, -46, 25, 82, 64, 61, -9, -73, -89, -46, 25, 82, // 32 + 83, 31, -43, -88, -75, -13, 57, 90, 83, 31, -43, -88, -75, -13, 57, 90, + 64, -4, -70, -90, -50, 22, 80, 85, 64, -4, -70, -90, -50, 22, 80, 85, + 36, -38, -87, -78, -18, 54, 90, 67, 36, -38, -87, -78, -18, 54, 90, 67, + 64, 54, -25, -85, -75, -4, 70, 88, 64, 54, -25, -85, -75, -4, 70, 88, + 36, -46, -90, -61, 18, 82, 80, 13, 36, -46, -90, -61, 18, 82, 80, 13, +-64, -90, -43, 38, 89, 67, -9, -78, -64, -90, -43, 38, 89, 67, -9, -78, +-83, -22, 57, 90, 50, -31, -87, -73, -83, -22, 57, 90, 50, -31, -87, -73, + 64, 46, -43, -90, -50, 38, 90, 54, 64, 46, -43, -90, -50, 38, 90, 54, // 40 +-36, -90, -57, 31, 89, 61, -25, -88, -36, -90, -57, 31, 89, 61, -25, -88, +-64, 22, 87, 67, -18, -85, -70, 13, -64, 22, 87, 67, -18, -85, -70, 13, + 83, 73, -9, -82, -75, 4, 80, 78, 83, 73, -9, -82, -75, 4, 80, 78, + 64, 38, -57, -88, -18, 73, 80, -4, 64, 38, -57, -88, -18, 73, 80, -4, +-83, -67, 25, 90, 50, -46, -90, -31, -83, -67, 25, 90, 50, -46, -90, -31, + 64, 85, 9, -78, -75, 13, 87, 61, 64, 85, 9, -78, -75, 13, 87, 61, +-36, -90, -43, 54, 89, 22, -70, -82, -36, -90, -43, 54, 89, 22, -70, -82, + 64, 31, -70, -78, 18, 90, 43, -61, 64, 31, -70, -78, 18, 90, 43, -61, // 48 +-83, 4, 87, 54, -50, -88, -9, 82, -83, 4, 87, 54, -50, -88, -9, 82, + 64, -38, -90, -22, 75, 73, -25, -90, 64, -38, -90, -22, 75, 73, -25, -90, +-36, 67, 80, -13, -89, -46, 57, 85, -36, 67, 80, -13, -89, -46, 57, 85, + 64, 22, -80, -61, 50, 85, -9, -90, 64, 22, -80, -61, 50, 85, -9, -90, +-36, 73, 70, -38, -89, -4, 87, 46, -36, 73, 70, -38, 
-89, -4, 87, 46, +-64, -78, 25, 90, 18, -82, -57, 54, -64, -78, 25, 90, 18, -82, -57, 54, + 83, -13, -90, -31, 75, 67, -43, -88, 83, -13, -90, -31, 75, 67, -43, -88, + 64, 13, -87, -38, 75, 61, -57, -78, 64, 13, -87, -38, 75, 61, -57, -78, // 56 + 36, 88, -9, -90, -18, 85, 43, -73, 36, 88, -9, -90, -18, 85, 43, -73, +-64, 54, 80, -31, -89, 4, 90, 22, -64, 54, 80, -31, -89, 4, 90, 22, +-83, -46, 70, 67, -50, -82, 25, 90, -83, -46, 70, 67, -50, -82, 25, 90, + 64, 4, -90, -13, 89, 22, -87, -31, 64, 4, -90, -13, 89, 22, -87, -31, + 83, 38, -80, -46, 75, 54, -70, -61, 83, 38, -80, -46, 75, 54, -70, -61, + 64, 67, -57, -73, 50, 78, -43, -82, 64, 67, -57, -73, 50, 78, -43, -82, + 36, 85, -25, -88, 18, 90, -9, -90, 36, 85, -25, -88, 18, 90, -9, -90, + 64, -4, -90, 13, 89, -22, -87, 31, 64, -4, -90, 13, 89, -22, -87, 31, // 64 + 83, -38, -80, 46, 75, -54, -70, 61, 83, -38, -80, 46, 75, -54, -70, 61, + 64, -67, -57, 73, 50, -78, -43, 82, 64, -67, -57, 73, 50, -78, -43, 82, + 36, -85, -25, 88, 18, -90, -9, 90, 36, -85, -25, 88, 18, -90, -9, 90, + 64, -13, -87, 38, 75, -61, -57, 78, 64, -13, -87, 38, 75, -61, -57, 78, + 36, -88, -9, 90, -18, -85, 43, 73, 36, -88, -9, 90, -18, -85, 43, 73, +-64, -54, 80, 31, -89, -4, 90, -22, -64, -54, 80, 31, -89, -4, 90, -22, +-83, 46, 70, -67, -50, 82, 25, -90, -83, 46, 70, -67, -50, 82, 25, -90, + 64, -22, -80, 61, 50, -85, -9, 90, 64, -22, -80, 61, 50, -85, -9, 90, // 72 +-36, -73, 70, 38, -89, 4, 87, -46, -36, -73, 70, 38, -89, 4, 87, -46, +-64, 78, 25, -90, 18, 82, -57, -54, -64, 78, 25, -90, 18, 82, -57, -54, + 83, 13, -90, 31, 75, -67, -43, 88, 83, 13, -90, 31, 75, -67, -43, 88, + 64, -31, -70, 78, 18, -90, 43, 61, 64, -31, -70, 78, 18, -90, 43, 61, +-83, -4, 87, -54, -50, 88, -9, -82, -83, -4, 87, -54, -50, 88, -9, -82, + 64, 38, -90, 22, 75, -73, -25, 90, 64, 38, -90, 22, 75, -73, -25, 90, +-36, -67, 80, 13, -89, 46, 57, -85, -36, -67, 80, 13, -89, 46, 57, -85, + 64, -38, -57, 88, -18, -73, 80, 4, 64, -38, -57, 88, -18, -73, 80, 4, // 80 +-83, 67, 25, -90, 50, 46, -90, 31, -83, 67, 25, -90, 50, 46, -90, 31, + 64, -85, 9, 78, -75, -13, 87, -61, 64, -85, 9, 78, -75, -13, 87, -61, +-36, 90, -43, -54, 89, -22, -70, 82, -36, 90, -43, -54, 89, -22, -70, 82, + 64, -46, -43, 90, -50, -38, 90, -54, 64, -46, -43, 90, -50, -38, 90, -54, +-36, 90, -57, -31, 89, -61, -25, 88, -36, 90, -57, -31, 89, -61, -25, 88, +-64, -22, 87, -67, -18, 85, -70, -13, -64, -22, 87, -67, -18, 85, -70, -13, + 83, -73, -9, 82, -75, -4, 80, -78, 83, -73, -9, 82, -75, -4, 80, -78, + 64, -54, -25, 85, -75, 4, 70, -88, 64, -54, -25, 85, -75, 4, 70, -88, // 88 + 36, 46, -90, 61, 18, -82, 80, -13, 36, 46, -90, 61, 18, -82, 80, -13, +-64, 90, -43, -38, 89, -67, -9, 78, -64, 90, -43, -38, 89, -67, -9, 78, +-83, 22, 57, -90, 50, 31, -87, 73, -83, 22, 57, -90, 50, 31, -87, 73, + 64, -61, -9, 73, -89, 46, 25, -82, 64, -61, -9, 73, -89, 46, 25, -82, + 83, -31, -43, 88, -75, 13, 57, -90, 83, -31, -43, 88, -75, 13, 57, -90, + 64, 4, -70, 90, -50, -22, 80, -85, 64, 4, -70, 90, -50, -22, 80, -85, + 36, 38, -87, 78, -18, -54, 90, -67, 36, 38, -87, 78, -18, -54, 90, -67, + 64, -67, 9, 54, -89, 78, -25, -38, 64, -67, 9, 54, -89, 78, -25, -38, // 96 + 83, -85, 43, 22, -75, 90, -57, -4, 83, -85, 43, 22, -75, 90, -57, -4, + 64, -90, 70, -13, -50, 88, -80, 31, 64, -90, 70, -13, -50, 88, -80, 31, + 36, -82, 87, -46, -18, 73, -90, 61, 36, -82, 87, -46, -18, 73, -90, 61, + 64, -73, 25, 31, -75, 90, -70, 22, 64, -73, 25, 31, -75, 90, -70, 22, + 36, -78, 90, -67, 18, 38, -80, 90, 36, -78, 90, -67, 18, 38, -80, 90, 
+-64, 13, 43, -82, 89, -61, 9, 46, -64, 13, 43, -82, 89, -61, 9, 46, +-83, 88, -57, 4, 50, -85, 87, -54, -83, 88, -57, 4, 50, -85, 87, -54, + 64, -78, 43, 4, -50, 82, -90, 73, 64, -78, 43, 4, -50, 82, -90, 73, // 104 +-36, -13, 57, -85, 89, -67, 25, 22, -36, -13, 57, -85, 89, -67, 25, 22, +-64, 88, -87, 61, -18, -31, 70, -90, -64, 88, -87, 61, -18, -31, 70, -90, + 83, -54, 9, 38, -75, 90, -80, 46, 83, -54, 9, 38, -75, 90, -80, 46, + 64, -82, 57, -22, -18, 54, -80, 90, 64, -82, 57, -22, -18, 54, -80, 90, +-83, 61, -25, -13, 50, -78, 90, -85, -83, 61, -25, -13, 50, -78, 90, -85, + 64, -31, -9, 46, -75, 90, -87, 67, 64, -31, -9, 46, -75, 90, -87, 67, +-36, -4, 43, -73, 89, -88, 70, -38, -36, -4, 43, -73, 89, -88, 70, -38, + 64, -85, 70, -46, 18, 13, -43, 67, 64, -85, 70, -46, 18, 13, -43, 67, // 112 +-83, 90, -87, 73, -50, 22, 9, -38, -83, 90, -87, 73, -50, 22, 9, -38, + 64, -82, 90, -88, 75, -54, 25, 4, 64, -82, 90, -88, 75, -54, 25, 4, +-36, 61, -80, 90, -89, 78, -57, 31, -36, 61, -80, 90, -89, 78, -57, 31, + 64, -88, 80, -67, 50, -31, 9, 13, 64, -88, 80, -67, 50, -31, 9, 13, +-36, 54, -70, 82, -89, 90, -87, 78, -36, 54, -70, 82, -89, 90, -87, 78, +-64, 46, -25, 4, 18, -38, 57, -73, -64, 46, -25, 4, 18, -38, 57, -73, + 83, -90, 90, -85, 75, -61, 43, -22, 83, -90, 90, -85, 75, -61, 43, -22, + 64, -90, 87, -82, 75, -67, 57, -46, 64, -90, 87, -82, 75, -67, 57, -46, // 120 + 36, -22, 9, 4, -18, 31, -43, 54, 36, -22, 9, 4, -18, 31, -43, 54, +-64, 73, -80, 85, -89, 90, -90, 88, -64, 73, -80, 85, -89, 90, -90, 88, +-83, 78, -70, 61, -50, 38, -25, 13, -83, 78, -70, 61, -50, 38, -25, 13, + 64, -90, 90, -90, 89, -88, 87, -85, 64, -90, 90, -90, 89, -88, 87, -85, + 83, -82, 80, -78, 75, -73, 70, -67, 83, -82, 80, -78, 75, -73, 70, -67, + 64, -61, 57, -54, 50, -46, 43, -38, 64, -61, 57, -54, 50, -46, 43, -38, + 36, -31, 25, -22, 18, -13, 9, -4, 36, -31, 25, -22, 18, -13, 9, -4, +}; + + +// 4xN +ALIGNED(32) static const int16_t ff_dct2_4x8_coeff_ver[256] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, +-18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, + 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, +-83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, + 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, + 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, // 8 + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, +-75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, + 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, +-36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, + 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, + 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, +}; + +ALIGNED(32) static const int16_t ff_dst7_4x8_coeff_ver[256] = { + 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, // 0 + 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, + 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, + 32, -17, -60, -85, 32, -17, -60, -85, 32, -17, -60, -85, 32, -17, -60, -85, + 71, 85, 32, -46, 71, 85, 32, -46, 71, 85, 32, 
-46, 71, 85, 32, -46, +-86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, + 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, + 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, + 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, // 8 + 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, + 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, +-60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, + 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, +-46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, + 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, + 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, +}; + +ALIGNED(32) static const int16_t ff_dct8_4x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4xN_coeff_hor[64] = { + 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, + 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, + 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, + 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4xN_coeff_hor[64] = { + 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, + 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, + 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, + 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, + 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, + 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, + 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x8_coeff_hor[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x8_coeff_hor[128] = { + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, + 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, + 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, +-74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, +-74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, 
-17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x16_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x16_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 
-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x16_coeff_ver[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x16_coeff_ver[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 
25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x16_coeff_ver[512] = { + 88, 88, 87, 85, 81, 77, 73, 68, 88, 88, 87, 85, 81, 77, 73, 68, // 0 + 62, 55, 48, 40, 33, 25, 17, 8, 62, 55, 48, 40, 33, 25, 17, 8, + 88, 81, 68, 48, 25, 0, -25, -48, 88, 81, 68, 48, 25, 0, -25, -48, +-68, -81, -88, -88, -81, -68, -48, -25, -68, -81, -88, -88, -81, -68, -48, -25, + 87, 68, 33, -8, -48, -77, -88, -81, 87, 68, 33, -8, -48, -77, -88, -81, +-55, -17, 25, 62, 85, 88, 73, 40, -55, -17, 25, 62, 85, 88, 73, 40, + 85, 48, -8, -62, -88, -77, -33, 25, 85, 48, -8, -62, -88, -77, -33, 25, + 73, 88, 68, 17, -40, -81, -87, -55, 73, 88, 68, 17, -40, -81, -87, -55, + 81, 25, -48, -88, -68, 0, 68, 88, 81, 25, -48, -88, -68, 0, 68, 88, // 8 + 48, -25, -81, -81, -25, 48, 88, 68, 48, -25, -81, -81, -25, 48, 88, 68, + 77, 0, -77, -77, 0, 77, 77, 0, 77, 0, -77, -77, 0, 77, 77, 0, +-77, -77, 0, 77, 77, 0, -77, -77, -77, -77, 0, 77, 77, 0, -77, -77, + 73, -25, -88, -33, 68, 77, -17, -88, 73, -25, -88, -33, 68, 77, -17, -88, +-40, 62, 81, -8, -87, -48, 55, 85, -40, 62, 81, -8, -87, -48, 55, 85, + 68, -48, -81, 25, 88, 0, -88, -25, 68, -48, -81, 25, 88, 0, -88, -25, + 81, 48, -68, -68, 48, 81, -25, -88, 81, 48, -68, -68, 48, 81, -25, -88, + 62, -68, -55, 73, 48, -77, -40, 81, 62, -68, -55, 73, 48, -77, -40, 81, // 16 + 33, -85, -25, 87, 17, -88, -8, 88, 33, -85, -25, 87, 17, -88, -8, 88, + 55, -81, -17, 88, -25, -77, 62, 48, 55, -81, -17, 88, -25, -77, 62, 48, +-85, -8, 88, -33, -73, 68, 40, -87, -85, -8, 88, -33, -73, 68, 40, -87, + 48, -88, 25, 68, -81, 0, 81, -68, 48, -88, 25, 68, -81, 0, 81, -68, +-25, 88, -48, -48, 88, -25, -68, 81, -25, 88, -48, -48, 88, -25, -68, 81, + 40, -88, 62, 17, -81, 77, -8, -68, 40, -88, 62, 17, -81, 77, -8, -68, + 87, -33, -48, 88, -55, -25, 85, -73, 87, -33, -48, 88, -55, -25, 85, -73, + 33, -81, 85, -40, -25, 77, -87, 48, 33, -81, 85, -40, -25, 77, -87, 48, // 24 + 17, -73, 88, -55, -8, 68, -88, 62, 17, -73, 88, -55, -8, 
68, -88, 62, + 25, -68, 88, -81, 48, 0, -48, 81, 25, -68, 88, -81, 48, 0, -48, 81, +-88, 68, -25, -25, 68, -88, 81, -48, -88, 68, -25, -25, 68, -88, 81, -48, + 17, -48, 73, -87, 88, -77, 55, -25, 17, -48, 73, -87, 88, -77, 55, -25, + -8, 40, -68, 85, -88, 81, -62, 33, -8, 40, -68, 85, -88, 81, -62, 33, + 8, -25, 40, -55, 68, -77, 85, -88, 8, -25, 40, -55, 68, -77, 85, -88, + 88, -87, 81, -73, 62, -48, 33, -17, 88, -87, 81, -73, 62, -48, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_4x32_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_4x32_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_4x32_coeff_hor[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +// 8xN +ALIGNED(32) static const int16_t ff_dct2_8xN_coeff_hor[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) static const int16_t ff_dst7_8xN_coeff_hor[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 
85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71,
+-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17,
+};
+
+ALIGNED(32) static const int16_t ff_dct8_8xN_coeff_hor[128] = {
+ 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32,
+ 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17,
+ 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60,
+ 32, 17, -78, -46, 85, 71, -46, -85, 32, 17, -78, -46, 85, 71, -46, -85,
+ 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46,
+-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85,
+ 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78,
+-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32,
+};
+
+
+ static const int16_t* ff_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x2_coeff_hor[128] = {
+ 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18,
+ 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50,
+ 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75,
+ 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89,
+ 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75,
+ 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50,
+ 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x2_coeff_hor[128] = {
+ 17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32,
+ 32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60,
+ 46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78,
+ 60, 71, -46, -78, 32, 85, -17, -86, 60, 71, -46, -78, 32, 85, -17, -86,
+ 71, 32, -86, 17, 78, -60, -46, 85, 71, 32, -86, 17, 78, -60, -46, 85,
+ 78, -17, -60, 86, -46, -32, 85, -71, 78, -17, -60, 86, -46, -32, 85, -71,
+ 85, -60, 17, 32, -71, 86, -78, 46, 85, -60, 17, 32, -71, 86, -78, 46,
+ 86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17,
+};
+
+ static const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
+
+
+ALIGNED(32) static const int16_t ff_dct2_8x4_coeff_ver[128] = {
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36,
+-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83,
+ 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64,
+-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64,
+ 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83,
+ 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36,
+};
+
+ALIGNED(32) static const int16_t ff_dst7_8x4_coeff_ver[128] = {
+ 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55,
+ 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84,
+ 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
+ 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74,
+ 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29,
+-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55,
+ 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84,
+ 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29,
+};
+
+ALIGNED(32) static const int16_t ff_dct8_8x4_coeff_ver[128] = {
+ 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74,
+ 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29,
+ 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0,
+-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74,
+ 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74,
+-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84,
+ 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74,
+ 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55,
+};
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x4_coeff_hor[256] = {
+ 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0
+ 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18,
+ 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18,
+-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50,
+ 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89,
+-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75,
+ 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50,
+ 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8
+ 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89,
+ 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89,
+-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75,
+ 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18,
+-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50,
+ 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75,
+ 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x4_coeff_hor[256] = {
+ 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0
+ 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32,
+ 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46,
+-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60,
+ 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60,
+-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78,
+ 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78,
+ 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86,
+ 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8
+ 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85,
+ 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86,
+-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71,
+ 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32,
+-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46,
+ 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71,
+ 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17,
+};
+
+ALIGNED(32) static const int16_t fi_dct8_8x4_coeff_hor[256] = {
+ 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0
+ 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17,
+ 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32,
+-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46,
+ 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86,
+-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71,
+ 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17,
+ 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85,
+ 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8
+ 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86,
+ 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60,
+-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78,
+ 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46,
+-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60,
+ 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85,
+ 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32,
+};
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x4_coeff_ver[128] = {
+ 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83,
+ 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36,
+ 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36,
+-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83,
+ 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36,
+-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83,
+ 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83,
+ 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x4_coeff_ver[128] = {
+ 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74,
+ 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55,
+ 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74,
+-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84,
+ 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0,
+-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74,
+ 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74,
+ 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29,
+};
+
+ static const int16_t* fi_dct8_8x4_coeff_ver = ff_dct8_8x4_coeff_ver; // Duplicate table
+
+
+ALIGNED(32) static const int16_t ff_dct2_8x8_coeff_ver[64] = {
+ 64, 64, 64, 64, 64, 64, 64, 64, 89, 50, 75, 18, -18, -75, -50, -89,
+ 83, -36, 36, -83, -83, 36, -36, 83, 75, -89, -18, -50, 50, 18, 89, -75,
+ 64, -64, -64, 64, 64, -64, -64, 64, 50, 18, -89, 75, -75, 89, -18, -50,
+ 36, 83, -83, -36, -36, -83, 83, 36, 18, 75, -50, -89, 89, 50, -75, -18,
+};
+
+ALIGNED(32) static const int16_t ff_dst7_8x8_coeff_ver[64] = {
+ 17, 46, 32, 60, 71, 85, 78, 86, 46, 86, 78, 71, 32, -60, -17, -85,
+ 71, 32, 85, -46, -86, 17, -60, 78, 85, -60, 46, -78, 17, 32, 86, -71,
+ 86, -85, -17, 32, 78, -71, -46, 60, 78, -17, -71, 85, -60, 86, -32, -46,
+ 60, 71, -86, -17, -46, -78, 85, 32, 32, 78, -60, -86, 85, 46, -71, -17,
+};
+
+ALIGNED(32) static const int16_t ff_dct8_8x8_coeff_ver[64] = {
+ 86, 78, 85, 71, 60, 32, 46, 17, 85, 17, 60, -32, -71, -78, -86, -46,
+ 78, -60, 17, -86, -46, 85, 32, 71, 71, -86, -32, -17, 78, -46, 60, -85,
+ 60, -46, -71, 78, 32, -17, -85, 86, 46, 32, -86, 60, -85, 71, 17, -78,
+ 32, 85, -78, -46, -17, -86, 71, 60, 17, 71, -46, -85, 86, 60, -78, -32,
+};
+
+
+ALIGNED(32) static const int16_t fi_dct2_8x8_coeff_hor[512] = {
+ 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, // 0
+ 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75,
+ 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50,
+ 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18,
+ 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75,
+ 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18,
+-64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89,
+-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50,
+ 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, // 8
+-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89,
+-64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18,
+ 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75,
+ 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18,
+-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50,
+ 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75,
+-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89,
+ 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, // 16
+-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50,
+ 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75,
+-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89,
+ 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50,
+-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89,
+-64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18,
+ 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75,
+ 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, // 24
+ 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18,
+-64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89,
+-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50,
+ 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89,
+ 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75,
+ 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50,
+ 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_8x8_coeff_hor[512] = {
+ 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, // 0
+ 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85,
+ 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78,
+ 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32,
+ 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78,
+ 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46,
+-17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71,
+-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60,
+ 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, // 8
+ 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60,
+-85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17,
+ 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78,
+ 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71,
+-46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78,
+ 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85,
+-17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86,
+ 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, // 16
+-86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17,
+ 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60,
+-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85,
+ 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17,
+-60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86,
+-46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32,
+ 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71,
+ 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, // 24
+ 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32,
+-71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86,
+-78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46,
+ 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85,
+ 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71,
+ 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46,
+ 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17,
+};
+
+ALIGNED(32) static const int16_t fi_dct8_8x8_coeff_hor[512] = {
+ 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0
+ 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71,
+ 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46,
+ 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17,
+ 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60,
+ 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32,
+-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86,
+-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46,
+ 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, // 8
+-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86,
+-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32,
+ 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71,
+ 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32,
+-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17,
+ 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60,
+-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85,
+ 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, // 16
+-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78,
+ 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85,
+-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86,
+ 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86,
+ 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60,
+-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17,
+ 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78,
+ 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, // 24
+ 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46,
+-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71,
+-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60,
+ 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46,
+ 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85,
+ 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78,
+ 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32,
+};
+
+
+ALIGNED(32) static const int16_t ff_dct2_8x16_coeff_ver[256] = {
+ 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9,
75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_8x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_8x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, 
-48, -62, 33, 33, -17, +}; + +ALIGNED(32) static const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + static const int16_t* fi_dct2_8x16_coeff_hor = fi_dct2_8x8_coeff_hor; + + static const int16_t* fi_dst7_8x16_coeff_hor = fi_dst7_8x8_coeff_hor; + + static const int16_t* fi_dct8_8x16_coeff_hor = fi_dct8_8x8_coeff_hor; + + +ALIGNED(32) static const int16_t fi_dct2_8x16_coeff_ver[2048] = { + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, // 0 + 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, + 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, + 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, + 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, + 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, + 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, + 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, // 8 + 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, + 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, +-18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, +-64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, +-89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, +-83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, +-50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, + 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, // 16 + 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, +-36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, +-89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, +-64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, + 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, + 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, + 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, // 24 + 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, +-83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, +-50, 9, -50, 9, -50, 9, -50, 9, 
-50, 9, -50, 9, -50, 9, -50, 9, + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, + 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, +-36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, +-89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, // 32 +-18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, +-83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, + 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, +-75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, +-36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, + 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, + 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, // 40 +-50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, +-36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, + 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, +-64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, -64, -87, +-18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, + 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, +-75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, + 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, // 48 +-75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, + 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, + 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, +-64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, + 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, +-83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, + 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, // 56 +-89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, + 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, +-75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, +-50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, + 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, +-18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, // 64 +-89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, + 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, +-75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, +-50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, + 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, +-18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, + 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, // 72 +-75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, + 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, + 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, +-64, -43, -64, -43, 
-64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, + 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, +-83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, + 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, + 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, // 80 +-50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, +-36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, + 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, +-64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, +-18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, + 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, +-75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, + 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, // 88 +-18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, +-83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, + 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, +-75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, +-36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, + 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, // 96 + 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, +-83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, +-50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, + 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, +-36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, +-89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, + 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, // 104 + 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, +-36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, +-89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, +-64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, + 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, + 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, + 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, + 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, // 112 + 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, + 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, +-18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, +-64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, +-89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, +-83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, +-50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, // 120 + 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, + 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, + 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, + 64, -57, 64, -57, 64, -57, 
64, -57, 64, -57, 64, -57, 64, -57, 64, -57, + 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, + 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, + 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_8x16_coeff_ver[2048] = { + 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, // 0 + 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, + 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, + 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, + 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, // 8 + 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, + 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, + 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, + -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, +-68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, -68, -85, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, // 16 + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, +-88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, + 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, // 24 + 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, + 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, +-88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, + 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, // 32 + 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, +-81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, + 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, + 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, + 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, // 40 + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, +-48, 48, -48, 48, -48, 
48, -48, 48, -48, 48, -48, 48, -48, 48, -48, 48, + 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, + 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, // 48 +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, +-85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, + 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, +-73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, + 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, + 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, // 56 +-55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, + 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, + 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, +-25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, + 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, 17, 88, + -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, // 64 +-81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, +-68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, + 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, + 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, // 72 +-88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, + 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 80 +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, // 88 +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, +-81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, 
-48, + 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, + 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, // 96 + -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, +-88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, +-33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, + 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, + 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, +-87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, + 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, // 104 + 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, +-48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, + 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, // 112 + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, +-48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 120 + 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, + 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, + 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, + 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, + 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, + 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, + 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_8x16_coeff_ver[2048] = { + 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, // 0 + 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, + 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, + 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, + 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, + 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, + 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, + 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, // 8 + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, +-88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, +-81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, +-48, -25, -48, 
-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, + 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, // 16 + 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, +-48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, + 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, + 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, // 24 + -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, +-88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, +-33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, + 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, + 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, +-87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, + 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, // 32 +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, +-81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, + 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 40 +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, // 48 +-88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, + 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, // 56 +-81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, +-68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, + 62, -68, 62, -68, 
62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, // 64 +-55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, + 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, + 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, +-25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, + 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, + -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, + 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, // 72 +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, +-85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, + 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, +-73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, + 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, + 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, // 80 + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, +-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, + 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, + 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, // 88 + 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, +-81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, + 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, + 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, + 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, // 96 + 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, + 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, +-88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, // 104 + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, +-88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, +-25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, + 17, -48, 17, -48, 17, -48, 17, -48, 
17, -48, 17, -48, 17, -48, 17, -48, // 112 + 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, + 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, + 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, + -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, +-68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, + 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, // 120 + 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, + 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, + 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, +}; + + +ALIGNED(32) static const int16_t ff_dct2_8x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 
64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_8x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, 
-63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t 
ff_dct8_8x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, 
-63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + static const int16_t* fi_dct2_8x32_coeff_hor = fi_dct2_8x8_coeff_hor; + + static const int16_t* fi_dst7_8x32_coeff_hor = fi_dst7_8x8_coeff_hor; + + static const int16_t* fi_dct8_8x32_coeff_hor = fi_dct8_8x8_coeff_hor; + + +// 16xN +ALIGNED(32) static const int16_t ff_dct2_16xN_coeff_hor[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_16xN_coeff_hor[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, 
-77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_16xN_coeff_hor[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, -68, 88, -55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +static const int16_t* ff_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) static const int16_t fi_dct2_16x2_coeff_hor[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 
57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + + static const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + + +ALIGNED(32) static const int16_t fi_dst7_16x2_coeff_hor[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + + +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 
25, -25, 25, -25, 25, -25, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, // 8 + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, // 16 + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, // 24 +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, // 32 +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, // 40 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, // 48 +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 
+ 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, // 56 +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + + +ALIGNED(32) static const int16_t ff_dct2_16x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_16x4_coeff_hor[1024] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, + 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, + 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, + 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, +-64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, +-83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, // 8 +-36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, +-64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, + 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, +-83, -87, -50, 9, -83, -87, -50, 9, -83, -87, -50, 9, -83, -87, 
-50, 9, + 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, +-36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, 57, -18, -80, 64, 57, -18, -80, // 16 +-83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, + 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, +-36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, +-36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, +-64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, + 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, // 24 + 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, +-64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, +-83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, + 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, + 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, + 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, // 32 + 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, + 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, + 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, + 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, +-64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, +-83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, // 40 +-36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, +-64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, + 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, -57, -18, 80, 64, -57, -18, 80, +-83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, + 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, +-36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, // 48 +-83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, + 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, +-36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, +-36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, +-64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, + 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, // 56 + 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, +-64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, +-83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, + 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, + 64, -57, 50, -43, 
64, -57, 50, -43, 64, -57, 50, -43, 64, -57, 50, -43, + 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_16x4_coeff_hor[1024] = { + 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, + 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, + 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, + 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, + -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, +-88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, // 8 + 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, +-88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, + 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, +-25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, + 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, + -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, // 16 +-81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, + 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, +-55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, +-81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, +-25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, + 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, // 24 +-25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, +-85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, +-73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, + 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, + 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, + 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, // 32 + 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, + 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, + 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, + 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, +-40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, +-87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, // 40 + 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, +-77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, +-68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, + 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, 
+-25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, // 48 +-88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, -25, + 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, +-40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, +-48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, +-55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, + 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, // 56 + 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, +-68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, +-81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, + 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, + 62, -55, 48, -40, 62, -55, 48, -40, 62, -55, 48, -40, 62, -55, 48, -40, + 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_16x4_coeff_hor[1024] = { + 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, // 0 + 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, + 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, + 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, + 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, + 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, +-68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, +-81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, + 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, // 8 +-48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, +-55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, + 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, + 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, +-88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, + 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, +-40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, + 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, // 16 +-68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, + 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, +-25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, +-77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, // 24 + 68, 77, -17, -88, 68, 77, -17, -88, 68, 77, -17, -88, 68, 77, -17, -88, +-40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, +-87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, + 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, + 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, + 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, + 48, 81, 
-25, -88, 48, 81, -25, -88, 48, 81, -25, -88, 48, 81, -25, -88, + 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, // 32 + 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, + 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, + 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, + 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, +-25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, +-85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, +-73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, + 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, // 40 +-81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, +-25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, + 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, + 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, +-81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, + 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, +-55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, + 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, // 48 +-25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, + 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, + -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, + 25, -68, 88, -81, 25, -68, 88, -81, 25, -68, 88, -81, 25, -68, 88, -81, + 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, +-88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, + 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, + 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, // 56 + 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, + -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, +-88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, + 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, + 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, + 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, + 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_16x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_16x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 
84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t ff_dct2_16x8_coeff_ver[64] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, + 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, + 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, + 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x8_coeff_ver[64] = { + 17, 32, 46, 78, 71, 85, 85, 46, 86, -17, 78, -71, 60, -86, 32, -60, + 46, 60, 86, 71, 32, -46, -60, -78, -85, 32, -17, 85, 71, -17, 78, -86, + 71, 78, 32, -17, -86, -60, 17, 86, 78, -46, -60, -32, -46, 85, 85, -71, + 85, 86, -60, -85, 17, 78, 32, -71, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x8_coeff_ver[64] = { + 86, 85, 85, 60, 78, 17, 71, -32, 60, -71, 46, -86, 32, -78, 17, -46, + 78, 71, 17, -32, -60, -86, -86, -17, -46, 78, 32, 60, 85, -46, 71, -85, + 60, 46, -71, -86, -46, 32, 78, 60, 32, -85, -85, 17, -17, 71, 86, -78, + 32, 17, -78, -46, 85, 71, -46, -85, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, + -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, + -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) static const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, 
-18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + static const int16_t* fi_dct2_16x8_coeff_hor = fi_dct2_8x16_coeff_ver; // Duplicate table. + + static const int16_t* fi_dst7_16x8_coeff_hor = fi_dst7_8x16_coeff_ver; // Duplicate table. + + static const int16_t* fi_dct8_16x8_coeff_hor = fi_dct8_8x16_coeff_ver; // Duplicate table. + + + static const int16_t* fi_dct2_16x8_coeff_ver = fi_dct2_8x8_coeff_hor; // Duplicate table + + static const int16_t* fi_dst7_16x8_coeff_ver = fi_dst7_8x8_coeff_hor; // Duplicate table + + static const int16_t* fi_dct8_16x8_coeff_ver = fi_dct8_8x8_coeff_hor; // Duplicate table + + +ALIGNED(32) static const int16_t ff_dct2_16x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, 
-88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_16x16_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_16x16_coeff_hor[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct2_16x1_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 
57, 64, 43, 64, 25, 64, 9, // 0
+ 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25,
+ 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43,
+ 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57,
+ 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8
+ 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80,
+ 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87,
+ 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90,
+ 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90,
+-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87,
+ 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80,
+-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70,
+ 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57,
+-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43,
+ 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25,
+-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9,
+};
+
+ALIGNED(32) static const int16_t fi_dst7_16x1_coeff_hor[256] = {
+ 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0
+ 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73,
+ 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77,
+ 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81,
+ 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8
+ 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87,
+ 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88,
+ 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88,
+ 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88,
+-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85,
+ 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77,
+-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68,
+ 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55,
+-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40,
+ 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25,
+-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8,
+};
+
+static const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver;
+
+
+ static const int16_t* fi_dct2_16x16_coeff_ver = fi_dct2_16x16_coeff_hor;
+
+ static const int16_t* fi_dst7_16x16_coeff_ver = fi_dst7_16x16_coeff_hor;
+
+ static const int16_t* fi_dct8_16x16_coeff_ver = ff_dct8_16x16_coeff_ver;
+
+
+ALIGNED(32) static const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // TODO: change this to a 32-bit combined coeff table at some point; these huge tables are getting out of hand
+ 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0
+ 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90,
+ 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88,
+ 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85,
+ 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82,
+ 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78,
+ 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73,
+ 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67,
+ 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 8
+ 54, 
-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 16 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, // 24 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 32 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, // 40 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 48 + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 56 + 88, -88, 88, -88, 88, -88, 88, -88, 88, 
-88, 88, -88, 88, -88, 88, -88, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 64 + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 72 +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, // 80 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, // 88 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, // 96 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, // 104 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 61, 
-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 112 +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 120 + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 128 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, // 136 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 144 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, // 152 + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, 
-67, 67, -67, 67, -67, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, // 160 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 168 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, // 176 +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 184 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 192 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, // 200 +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-90, 
90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 208 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, // 216 + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, // 224 +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 232 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, // 240 +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 248 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, +-82, 82, -82, 82, -82, 82, -82, 82, 
-82, 82, -82, 82, -82, 82, -82, 82, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + +ALIGNED(32) static const int16_t ff_dct2_16x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, 
+-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_16x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, 
+-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dct8_16x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 
74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 
82, -68, -63, 50, 34, -26,
+ 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60,
+ 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90,
+ -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68,
+-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9,
+};
+
+
+ static const int16_t* fi_dct2_16x32_coeff_hor = fi_dct2_16x16_coeff_hor;
+
+ static const int16_t* fi_dst7_16x32_coeff_hor = fi_dst7_16x16_coeff_hor;
+
+ static const int16_t* fi_dct8_16x32_coeff_hor = ff_dct8_16x16_coeff_ver;
+
+// 32xN
+ALIGNED(32) static const int16_t ff_dct2_32xN_coeff_hor[1024] = {
+ 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0
+ 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54,
+ 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88,
+ 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13,
+ 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67,
+-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38,
+-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4,
+ 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31,
+ 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8
+-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22,
+ 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90,
+-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46,
+ 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38,
+ 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4,
+-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31,
+-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61,
+ 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16
+ 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13,
+ 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78,
+ 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73,
+ 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4,
+-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31,
+-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61,
+ 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82,
+ 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24
+-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46,
+ 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54,
+-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88,
+ 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31,
+ 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61,
+-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82,
+-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90,
+ 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32
+ 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73,
+ 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22,
+ 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90,
+ 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61,
+-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82,
+-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90,
+ 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85,
+
64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_32xN_coeff_hor[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, 
+ 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12 + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14 +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16 + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18 +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24 + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26 +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20 + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22 +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28 + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30 +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dct8_32xN_coeff_hor[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, 
-30, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, 
-90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +ALIGNED(32) static const int16_t fi_dct2_32xN_coeff_hor[1024] = { +64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, // 0 + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, // 2 +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, -89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, // 4 + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, -89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, // 6 +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, -25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, // 8 + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, -43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, // 10 +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, -75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, // 12 + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, -75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, // 14 +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, -57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, // 16 + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, -70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, // 18 +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 
54, -75, -90, -18, 31, 89, 61, -50, -88, -50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, // 20 + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, -50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, // 22 +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, -80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, // 24 + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, -87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, // 26 +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, -18, 54, 50, -31, -75, 4, 89, 22, -89, -46, 75, 67, -50, -82, 18, 90, // 28 + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, -18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, // 30 + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, -90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + + +ALIGNED(32) static const int16_t fi_dst7_32xN_coeff_hor[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, // 0 + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, // 2 +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, -21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, -63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, // 4 + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, -89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, -53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, // 6 +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, // 8 + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, // 10 +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, 
-21, 21, 74, 90, 74, 26, -21, -77, -89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, // 12 + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, -84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, // 14 +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, -42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, // 16 + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, -34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, // 18 +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, -77, -13, 74, 84, 17, -68, -87, -17, 53, 85, 46, -66, -89, -21, 26, 86, // 20 + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, -60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, // 22 +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, -74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, // 24 + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, -78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, // 26 +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, -30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, // 28 + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, -21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, // 30 +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, -89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + + +ALIGNED(32) static const int16_t fi_dct8_32xN_coeff_hor[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 
42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, 
-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +static const int16_t ff_dct8_4x32_coeff_ver[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 
90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; +static const int16_t ff_dst7_4x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 
85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12
+ 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87,
+ 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14
+-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90,
+ 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16
+ 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88,
+ 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18
+-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84,
+ 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20
+ 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77,
+ 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22
+-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66,
+ 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24
+ 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53,
+ 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26
+-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38,
+ 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28
+ 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21,
+ 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30
+-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4,
+};
+
+ static const int16_t* ff_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; // Identical to the existing 2xN table
+
+
+ static const int16_t* fi_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor;
+
+
+ALIGNED(32) static const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = {
+ 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, // 0
+ 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70,
+ 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43,
+ 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9,
+ 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57,
+ 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43,
+-80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90,
+-70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25,
+ 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, // 8
+-70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87,
+-25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57,
+ 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43,
+ 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43,
+-87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9,
+ 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25,
+-80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57,
+ 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, // 16
+-25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90,
+ -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87,
+ 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70,
+ 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90,
+ 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25,
+-87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70,
+ 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80,
+ 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, // 24
+ 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80,
+ 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9,
+-57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87,
+ 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25,
+ 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57,
+ 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80,
+ 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90,
+};
+
+ALIGNED(32) static const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // TODO: change this to a 32-bit combined coeff table at some point; these huge tables are getting out of hand
+ 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0
+ 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85,
+ 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78,
+ 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67,
+ 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54,
+ 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38,
+ 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22,
+ 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4,
+ 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, // 8
+ 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46,
+ 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4,
+-31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54,
+-73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85,
+-90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88,
+-78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61,
+-38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13,
+ 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, // 16
+ 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, -13, 13,
+-54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82,
+-90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78,
+-46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4,
+
38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, + 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, + 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, + 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, // 24 +-13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, +-90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, +-22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, + 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, + 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, +-61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, +-78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, + 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, // 32 +-54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, +-61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, + 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, + 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, +-90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, + 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, + 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, + 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, // 40 +-82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, + 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, + 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, +-88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, + 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, + 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, +-90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, + 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, // 48 +-90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, + 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, +-38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, +-13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, + 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, +-88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, + 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, + 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, // 56 +-78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, + 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, +-90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, + 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, +-88, 88, -31, 31, -88, 88, -31, 31, -88, 88, -31, 31, -88, 88, -31, 31, + 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, +-73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, + 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, // 64 +-46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, + 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, +-13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, + -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, + 22, -22, 85, -85, 22, -22, 85, -85, 
22, -22, 85, -85, 22, -22, 85, -85, +-38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, + 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, + 54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, // 72 + -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, +-46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, + 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, +-90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, + 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, +-22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, +-31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, + 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, // 80 + 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, +-90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, + 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, + 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, +-85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, + 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, + 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, + 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, // 88 + 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, +-67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, +-46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, + 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, + 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, +-90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, + 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, + 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, // 96 + 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, + 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, +-88, 88, 82, -82, -88, 88, 82, -82, -88, 88, 82, -82, -88, 88, 82, -82, +-38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, + 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, + 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, +-46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, + 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, // 104 + 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, + 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, + -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, +-78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, +-82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, +-13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, + 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, + 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, // 112 + 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, + 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, + 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, + 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, + 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 
22, -22, +-46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, +-82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, + 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, // 120 + 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, + 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, + 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, + 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, + 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, + 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, -88, 88, + 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t fi_dct2_32x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) static const int16_t fi_dst7_32x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 
74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) static const int16_t fi_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x8_coeff_ver[512] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 8 + 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, + 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 16 +-18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, + 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, + 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 24 +-75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, + 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, + 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, +-83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, + 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x8_coeff_ver[512] = { + 17, 32, 17, 32, 17, 32, 17, 32, 
17, 32, 17, 32, 17, 32, 17, 32, // 0 + 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, + 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, + 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, + 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, + 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, + 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, + 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, + 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, // 8 + 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, + 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, +-60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, +-85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, +-17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, + 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, + 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, + 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, // 16 + 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, +-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, + 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, + 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, +-60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, +-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, + 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, + 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, // 24 +-60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, + 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, + 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, +-71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, + 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, +-78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, + 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x8_coeff_ver[512] = { + 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 + 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, + 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, + 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, + 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, + 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, + 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, + 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, + 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, // 8 + 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, +-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, +-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, +-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, + 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, + 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, + 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, + 60, 46, 60, 
46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, // 16 +-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, +-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, + 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, + 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, +-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, +-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, + 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, + 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, // 24 +-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, + 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, +-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, +-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, + 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, +-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, + 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, +}; + + +ALIGNED(32) static const int16_t fi_dct2_32x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) static const int16_t fi_dst7_32x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, 
-71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) static const int16_t fi_dct8_32x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, 
-88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) static const int16_t fi_dct2_32x16_coeff_ver[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) static const int16_t fi_dst7_32x16_coeff_ver[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 
88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) static const int16_t fi_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) static const int16_t ff_dct2_32x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 
4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dst7_32x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 
8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 
17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) static const int16_t ff_dct8_32x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, 
-4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + {a, a}, \ + {a, -a} \ +} + +#define DEFINE_DCT2_P4_MATRIX(a,b,c) \ +{ \ + { a, a, a, a}, \ + { b, c, -c, -b}, \ + { a, -a, -a, a}, \ + { c, -b, b, -c} \ +} + +#define DEFINE_DCT2_P8_MATRIX(a,b,c,d,e,f,g) \ +{ \ + { a, a, a, a, a, a, a, a}, \ + { d, e, f, g, -g, -f, -e, -d}, \ + { b, c, -c, -b, -b, -c, c, b}, \ + { e, -g, -d, -f, f, d, g, -e}, \ + { a, -a, -a, a, a, -a, -a, a}, \ + { f, -d, g, e, -e, -g, d, -f}, \ + { c, -b, b, -c, -c, b, -b, c}, \ + { g, -f, e, -d, d, -e, f, -g} \ +} + +#define DEFINE_DCT2_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o} \ +} + +#define DEFINE_DCT2_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, -E, -D, -C, -B, -A, -z, -y, -x, -w, -v, 
-u, -t, -s, -r, -q, -p}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h, -h, -i, -j, -k, -l, -m, -n, -o, o, n, m, l, k, j, i, h}, \ + { q, t, w, z, C, -E, -B, -y, -v, -s, -p, -r, -u, -x, -A, -D, D, A, x, u, r, p, s, v, y, B, E, -C, -z, -w, -t, -q}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d, d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { r, w, B, -D, -y, -t, -p, -u, -z, -E, A, v, q, s, x, C, -C, -x, -s, -q, -v, -A, E, z, u, p, t, y, D, -B, -w, -r}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i, -i, -l, -o, m, j, h, k, n, -n, -k, -h, -j, -m, o, l, i}, \ + { s, z, -D, -w, -p, -v, -C, A, t, r, y, -E, -x, -q, -u, -B, B, u, q, x, E, -y, -r, -t, -A, C, v, p, w, D, -z, -s}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { t, C, -y, -p, -x, D, u, s, B, -z, -q, -w, E, v, r, A, -A, -r, -v, -E, w, q, z, -B, -s, -u, -D, x, p, y, -C, -t}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j, -j, -o, k, i, n, -l, -h, -m, m, h, l, -n, -i, -k, o, j}, \ + { u, -E, -t, -v, D, s, w, -C, -r, -x, B, q, y, -A, -p, -z, z, p, A, -y, -q, -B, x, r, C, -w, -s, -D, v, t, E, -u}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e, e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { v, -B, -p, -C, u, w, -A, -q, -D, t, x, -z, -r, -E, s, y, -y, -s, E, r, z, -x, -t, D, q, A, -w, -u, C, p, B, -v}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k, -k, m, i, -o, -h, -n, j, l, -l, -j, n, h, o, -i, -m, k}, \ + { w, -y, -u, A, s, -C, -q, E, p, D, -r, -B, t, z, -v, -x, x, v, -z, -t, B, r, -D, -p, -E, q, C, -s, -A, u, y, -w}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { x, -v, -z, t, B, -r, -D, p, -E, -q, C, s, -A, -u, y, w, -w, -y, u, A, -s, -C, q, E, -p, D, r, -B, -t, z, v, -x}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l, -l, j, n, -h, o, i, -m, -k, k, m, -i, -o, h, -n, -j, l}, \ + { y, -s, -E, r, -z, -x, t, D, -q, A, w, -u, -C, p, -B, -v, v, B, -p, C, u, -w, -A, q, -D, -t, x, z, -r, E, s, -y}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f, f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { z, -p, A, y, -q, B, x, -r, C, w, -s, D, v, -t, E, u, -u, -E, t, -v, -D, s, -w, -C, r, -x, -B, q, -y, -A, p, -z}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m, -m, h, -l, -n, i, -k, -o, j, -j, o, k, -i, n, l, -h, m}, \ + { A, -r, v, -E, -w, q, -z, -B, s, -u, D, x, -p, y, C, -t, t, -C, -y, p, -x, -D, u, -s, B, z, -q, w, E, -v, r, -A}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { B, -u, q, -x, E, y, -r, t, -A, -C, v, -p, w, -D, -z, s, -s, z, D, -w, p, -v, C, A, -t, r, -y, -E, x, -q, u, -B}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n, -n, k, -h, j, -m, -o, l, -i, i, -l, o, m, -j, h, -k, n}, \ + { C, -x, s, -q, v, -A, -E, z, -u, p, -t, y, -D, -B, w, -r, r, -w, B, D, -y, t, -p, u, -z, E, A, -v, q, -s, x, -C}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g, g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { D, -A, x, -u, r, -p, s, -v, y, -B, E, C, -z, w, -t, q, -q, t, -w, z, -C, -E, B, -y, v, -s, p, -r, u, -x, A, -D}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o, -o, n, -m, l, -k, j, -i, h, -h, i, -j, k, -l, m, -n, o}, \ + { E, -D, C, -B, A, -z, y, -x, w, -v, u, -t, s, -r, q, -p, p, -q, r, -s, t, 
-u, v, -w, x, -y, z, -A, B, -C, D, -E} \ +} + + +#define DEFINE_DCT2_P64_MATRIX(aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck) \ +{ \ + { aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa }, \ + { bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck, -ck, -cj, -ci, -ch, -cg, -cf, -ce, -cd, -cc, -cb, -ca, -bz, -by, -bx, -bw, -bv, -bu, -bt, -bs, -br, -bq, -bp, -bo, -bn, -bm, -bl, -bk, -bj, -bi, -bh, -bg, -bf }, \ + { ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, -be, -bd, -bc, -bb, -ba, -az, -ay, -ax, -aw, -av, -au, -at, -as, -ar, -aq, -ap, -ap, -aq, -ar, -as, -at, -au, -av, -aw, -ax, -ay, -az, -ba, -bb, -bc, -bd, -be, be, bd, bc, bb, ba, az, ay, ax, aw, av, au, at, as, ar, aq, ap }, \ + { bg, bj, bm, bp, bs, bv, by, cb, ce, ch, ck, -ci, -cf, -cc, -bz, -bw, -bt, -bq, -bn, -bk, -bh, -bf, -bi, -bl, -bo, -br, -bu, -bx, -ca, -cd, -cg, -cj, cj, cg, cd, ca, bx, bu, br, bo, bl, bi, bf, bh, bk, bn, bq, bt, bw, bz, cc, cf, ci, -ck, -ch, -ce, -cb, -by, -bv, -bs, -bp, -bm, -bj, -bg }, \ + { ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah, ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah }, \ + { bh, bm, br, bw, cb, cg, -ck, -cf, -ca, -bv, -bq, -bl, -bg, -bi, -bn, -bs, -bx, -cc, -ch, cj, ce, bz, bu, bp, bk, bf, bj, bo, bt, by, cd, ci, -ci, -cd, -by, -bt, -bo, -bj, -bf, -bk, -bp, -bu, -bz, -ce, -cj, ch, cc, bx, bs, bn, bi, bg, bl, bq, bv, ca, cf, ck, -cg, -cb, -bw, -br, -bm, -bh }, \ + { aq, at, aw, az, bc, -be, -bb, -ay, -av, -as, -ap, -ar, -au, -ax, -ba, -bd, bd, ba, ax, au, ar, ap, as, av, ay, bb, be, -bc, -az, -aw, -at, -aq, -aq, -at, -aw, -az, -bc, be, bb, ay, av, as, ap, ar, au, ax, ba, bd, -bd, -ba, -ax, -au, -ar, -ap, -as, -av, -ay, -bb, -be, bc, az, aw, at, aq }, \ + { bi, bp, bw, cd, ck, -ce, -bx, -bq, -bj, -bh, -bo, -bv, -cc, -cj, cf, by, br, bk, bg, bn, bu, cb, ci, -cg, -bz, -bs, -bl, -bf, -bm, -bt, -ca, -ch, ch, ca, bt, bm, bf, bl, bs, bz, cg, -ci, -cb, -bu, -bn, -bg, -bk, -br, -by, -cf, cj, cc, bv, bo, bh, bj, bq, bx, ce, -ck, -cd, -bw, -bp, -bi }, \ + { ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad }, \ + { bj, bs, cb, ck, -cc, -bt, -bk, -bi, -br, -ca, -cj, cd, bu, bl, bh, bq, bz, ci, -ce, -bv, -bm, -bg, -bp, -by, -ch, cf, bw, bn, bf, bo, bx, cg, -cg, -bx, -bo, -bf, -bn, -bw, -cf, ch, by, bp, bg, bm, bv, ce, -ci, -bz, -bq, -bh, -bl, -bu, -cd, cj, ca, br, bi, bk, bt, cc, -ck, -cb, -bs, -bj }, \ + { ar, aw, bb, -bd, -ay, -at, -ap, -au, -az, -be, ba, av, aq, as, ax, bc, -bc, -ax, -as, -aq, -av, -ba, be, az, au, ap, at, ay, bd, -bb, -aw, -ar, -ar, -aw, -bb, bd, ay, at, ap, au, az, be, -ba, -av, -aq, -as, -ax, -bc, bc, ax, as, aq, av, ba, -be, -az, -au, -ap, -at, -ay, -bd, bb, aw, 
ar }, \ + { bk, bv, cg, -ce, -bt, -bi, -bm, -bx, -ci, cc, br, bg, bo, bz, ck, -ca, -bp, -bf, -bq, -cb, cj, by, bn, bh, bs, cd, -ch, -bw, -bl, -bj, -bu, -cf, cf, bu, bj, bl, bw, ch, -cd, -bs, -bh, -bn, -by, -cj, cb, bq, bf, bp, ca, -ck, -bz, -bo, -bg, -br, -cc, ci, bx, bm, bi, bt, ce, -cg, -bv, -bk }, \ + { ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai, ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai }, \ + { bl, by, -ck, -bx, -bk, -bm, -bz, cj, bw, bj, bn, ca, -ci, -bv, -bi, -bo, -cb, ch, bu, bh, bp, cc, -cg, -bt, -bg, -bq, -cd, cf, bs, bf, br, ce, -ce, -br, -bf, -bs, -cf, cd, bq, bg, bt, cg, -cc, -bp, -bh, -bu, -ch, cb, bo, bi, bv, ci, -ca, -bn, -bj, -bw, -cj, bz, bm, bk, bx, ck, -by, -bl }, \ + { as, az, -bd, -aw, -ap, -av, -bc, ba, at, ar, ay, -be, -ax, -aq, -au, -bb, bb, au, aq, ax, be, -ay, -ar, -at, -ba, bc, av, ap, aw, bd, -az, -as, -as, -az, bd, aw, ap, av, bc, -ba, -at, -ar, -ay, be, ax, aq, au, bb, -bb, -au, -aq, -ax, -be, ay, ar, at, ba, -bc, -av, -ap, -aw, -bd, az, as }, \ + { bm, cb, -cf, -bq, -bi, -bx, cj, bu, bf, bt, ci, -by, -bj, -bp, -ce, cc, bn, bl, ca, -cg, -br, -bh, -bw, ck, bv, bg, bs, ch, -bz, -bk, -bo, -cd, cd, bo, bk, bz, -ch, -bs, -bg, -bv, -ck, bw, bh, br, cg, -ca, -bl, -bn, -cc, ce, bp, bj, by, -ci, -bt, -bf, -bu, -cj, bx, bi, bq, cf, -cb, -bm }, \ + { ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab }, \ + { bn, ce, -ca, -bj, -br, -ci, bw, bf, bv, -cj, -bs, -bi, -bz, cf, bo, bm, cd, -cb, -bk, -bq, -ch, bx, bg, bu, -ck, -bt, -bh, -by, cg, bp, bl, cc, -cc, -bl, -bp, -cg, by, bh, bt, ck, -bu, -bg, -bx, ch, bq, bk, cb, -cd, -bm, -bo, -cf, bz, bi, bs, cj, -bv, -bf, -bw, ci, br, bj, ca, -ce, -bn }, \ + { at, bc, -ay, -ap, -ax, bd, au, as, bb, -az, -aq, -aw, be, av, ar, ba, -ba, -ar, -av, -be, aw, aq, az, -bb, -as, -au, -bd, ax, ap, ay, -bc, -at, -at, -bc, ay, ap, ax, -bd, -au, -as, -bb, az, aq, aw, -be, -av, -ar, -ba, ba, ar, av, be, -aw, -aq, -az, bb, as, au, bd, -ax, -ap, -ay, bc, at }, \ + { bo, ch, -bv, -bh, -ca, cc, bj, bt, -cj, -bq, -bm, -cf, bx, bf, by, -ce, -bl, -br, -ck, bs, bk, cd, -bz, -bg, -bw, cg, bn, bp, ci, -bu, -bi, -cb, cb, bi, bu, -ci, -bp, -bn, -cg, bw, bg, bz, -cd, -bk, -bs, ck, br, bl, ce, -by, -bf, -bx, cf, bm, bq, cj, -bt, -bj, -cc, ca, bh, bv, -ch, -bo }, \ + { aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj, aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj }, \ + { bp, ck, -bq, -bo, -cj, br, bn, ci, -bs, -bm, -ch, bt, bl, cg, -bu, -bk, -cf, bv, bj, ce, -bw, -bi, -cd, bx, bh, cc, -by, -bg, -cb, bz, bf, ca, -ca, -bf, -bz, cb, bg, by, -cc, -bh, -bx, cd, bi, bw, -ce, -bj, -bv, cf, bk, bu, -cg, -bl, -bt, ch, bm, bs, -ci, -bn, -br, cj, bo, bq, -ck, -bp }, \ + { au, -be, -at, -av, bd, as, aw, -bc, -ar, -ax, bb, aq, ay, -ba, -ap, -az, az, ap, ba, -ay, -aq, -bb, ax, ar, bc, -aw, -as, -bd, av, at, be, -au, -au, be, at, av, -bd, -as, -aw, bc, ar, ax, -bb, -aq, -ay, ba, ap, az, -az, -ap, -ba, ay, aq, bb, -ax, -ar, -bc, aw, as, bd, -av, -at, -be, au 
}, \ + { bq, -ci, -bl, -bv, cd, bg, ca, -by, -bi, -cf, bt, bn, ck, -bo, -bs, cg, bj, bx, -cb, -bf, -cc, bw, bk, ch, -br, -bp, cj, bm, bu, -ce, -bh, -bz, bz, bh, ce, -bu, -bm, -cj, bp, br, -ch, -bk, -bw, cc, bf, cb, -bx, -bj, -cg, bs, bo, -ck, -bn, -bt, cf, bi, by, -ca, -bg, -cd, bv, bl, ci, -bq }, \ + { ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae }, \ + { br, -cf, -bg, -cc, bu, bo, -ci, -bj, -bz, bx, bl, ck, -bm, -bw, ca, bi, ch, -bp, -bt, cd, bf, ce, -bs, -bq, cg, bh, cb, -bv, -bn, cj, bk, by, -by, -bk, -cj, bn, bv, -cb, -bh, -cg, bq, bs, -ce, -bf, -cd, bt, bp, -ch, -bi, -ca, bw, bm, -ck, -bl, -bx, bz, bj, ci, -bo, -bu, cc, bg, cf, -br }, \ + { av, -bb, -ap, -bc, au, aw, -ba, -aq, -bd, at, ax, -az, -ar, -be, as, ay, -ay, -as, be, ar, az, -ax, -at, bd, aq, ba, -aw, -au, bc, ap, bb, -av, -av, bb, ap, bc, -au, -aw, ba, aq, bd, -at, -ax, az, ar, be, -as, -ay, ay, as, -be, -ar, -az, ax, at, -bd, -aq, -ba, aw, au, -bc, -ap, -bb, av }, \ + { bs, -cc, -bi, -cj, bl, bz, -bv, -bp, cf, bf, cg, -bo, -bw, by, bm, -ci, -bh, -cd, br, bt, -cb, -bj, -ck, bk, ca, -bu, -bq, ce, bg, ch, -bn, -bx, bx, bn, -ch, -bg, -ce, bq, bu, -ca, -bk, ck, bj, cb, -bt, -br, cd, bh, ci, -bm, -by, bw, bo, -cg, -bf, -cf, bp, bv, -bz, -bl, cj, bi, cc, -bs }, \ + { ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak, ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak }, \ + { bt, -bz, -bn, cf, bh, ck, -bi, -ce, bo, by, -bu, -bs, ca, bm, -cg, -bg, -cj, bj, cd, -bp, -bx, bv, br, -cb, -bl, ch, bf, ci, -bk, -cc, bq, bw, -bw, -bq, cc, bk, -ci, -bf, -ch, bl, cb, -br, -bv, bx, bp, -cd, -bj, cj, bg, cg, -bm, -ca, bs, bu, -by, -bo, ce, bi, -ck, -bh, -cf, bn, bz, -bt }, \ + { aw, -ay, -au, ba, as, -bc, -aq, be, ap, bd, -ar, -bb, at, az, -av, -ax, ax, av, -az, -at, bb, ar, -bd, -ap, -be, aq, bc, -as, -ba, au, ay, -aw, -aw, ay, au, -ba, -as, bc, aq, -be, -ap, -bd, ar, bb, -at, -az, av, ax, -ax, -av, az, at, -bb, -ar, bd, ap, be, -aq, -bc, as, ba, -au, -ay, aw }, \ + { bu, -bw, -bs, by, bq, -ca, -bo, cc, bm, -ce, -bk, cg, bi, -ci, -bg, ck, bf, cj, -bh, -ch, bj, cf, -bl, -cd, bn, cb, -bp, -bz, br, bx, -bt, -bv, bv, bt, -bx, -br, bz, bp, -cb, -bn, cd, bl, -cf, -bj, ch, bh, -cj, -bf, -ck, bg, ci, -bi, -cg, bk, ce, -bm, -cc, bo, ca, -bq, -by, bs, bw, -bu }, \ + { aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa }, \ + { bv, -bt, -bx, br, bz, -bp, -cb, bn, cd, -bl, -cf, bj, ch, -bh, -cj, bf, -ck, -bg, ci, bi, -cg, -bk, ce, bm, -cc, -bo, ca, bq, -by, -bs, bw, bu, -bu, -bw, bs, by, -bq, -ca, bo, cc, -bm, -ce, bk, cg, -bi, -ci, bg, ck, -bf, cj, bh, -ch, -bj, cf, bl, -cd, -bn, cb, bp, -bz, -br, bx, bt, -bv }, \ + { ax, -av, -az, at, bb, -ar, -bd, ap, -be, -aq, bc, as, -ba, -au, ay, aw, -aw, -ay, au, ba, -as, -bc, aq, be, -ap, bd, ar, -bb, -at, az, av, -ax, -ax, av, az, -at, -bb, ar, bd, -ap, be, aq, -bc, -as, ba, au, -ay, -aw, aw, ay, -au, -ba, as, bc, -aq, -be, ap, -bd, -ar, bb, at, -az, -av, ax }, 
\ + { bw, -bq, -cc, bk, ci, -bf, ch, bl, -cb, -br, bv, bx, -bp, -cd, bj, cj, -bg, cg, bm, -ca, -bs, bu, by, -bo, -ce, bi, ck, -bh, cf, bn, -bz, -bt, bt, bz, -bn, -cf, bh, -ck, -bi, ce, bo, -by, -bu, bs, ca, -bm, -cg, bg, -cj, -bj, cd, bp, -bx, -bv, br, cb, -bl, -ch, bf, -ci, -bk, cc, bq, -bw }, \ + { al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al, al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al }, \ + { bx, -bn, -ch, bg, -ce, -bq, bu, ca, -bk, -ck, bj, -cb, -bt, br, cd, -bh, ci, bm, -by, -bw, bo, cg, -bf, cf, bp, -bv, -bz, bl, cj, -bi, cc, bs, -bs, -cc, bi, -cj, -bl, bz, bv, -bp, -cf, bf, -cg, -bo, bw, by, -bm, -ci, bh, -cd, -br, bt, cb, -bj, ck, bk, -ca, -bu, bq, ce, -bg, ch, bn, -bx }, \ + { ay, -as, -be, ar, -az, -ax, at, bd, -aq, ba, aw, -au, -bc, ap, -bb, -av, av, bb, -ap, bc, au, -aw, -ba, aq, -bd, -at, ax, az, -ar, be, as, -ay, -ay, as, be, -ar, az, ax, -at, -bd, aq, -ba, -aw, au, bc, -ap, bb, av, -av, -bb, ap, -bc, -au, aw, ba, -aq, bd, at, -ax, -az, ar, -be, -as, ay }, \ + { by, -bk, cj, bn, -bv, -cb, bh, -cg, -bq, bs, ce, -bf, cd, bt, -bp, -ch, bi, -ca, -bw, bm, ck, -bl, bx, bz, -bj, ci, bo, -bu, -cc, bg, -cf, -br, br, cf, -bg, cc, bu, -bo, -ci, bj, -bz, -bx, bl, -ck, -bm, bw, ca, -bi, ch, bp, -bt, -cd, bf, -ce, -bs, bq, cg, -bh, cb, bv, -bn, -cj, bk, -by }, \ + { af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af }, \ + { bz, -bh, ce, bu, -bm, cj, bp, -br, -ch, bk, -bw, -cc, bf, -cb, -bx, bj, -cg, -bs, bo, ck, -bn, bt, cf, -bi, by, ca, -bg, cd, bv, -bl, ci, bq, -bq, -ci, bl, -bv, -cd, bg, -ca, -by, bi, -cf, -bt, bn, -ck, -bo, bs, cg, -bj, bx, cb, -bf, cc, bw, -bk, ch, br, -bp, -cj, bm, -bu, -ce, bh, -bz }, \ + { az, -ap, ba, ay, -aq, bb, ax, -ar, bc, aw, -as, bd, av, -at, be, au, -au, -be, at, -av, -bd, as, -aw, -bc, ar, -ax, -bb, aq, -ay, -ba, ap, -az, -az, ap, -ba, -ay, aq, -bb, -ax, ar, -bc, -aw, as, -bd, -av, at, -be, -au, au, be, -at, av, bd, -as, aw, bc, -ar, ax, bb, -aq, ay, ba, -ap, az }, \ + { ca, -bf, bz, cb, -bg, by, cc, -bh, bx, cd, -bi, bw, ce, -bj, bv, cf, -bk, bu, cg, -bl, bt, ch, -bm, bs, ci, -bn, br, cj, -bo, bq, ck, -bp, bp, -ck, -bq, bo, -cj, -br, bn, -ci, -bs, bm, -ch, -bt, bl, -cg, -bu, bk, -cf, -bv, bj, -ce, -bw, bi, -cd, -bx, bh, -cc, -by, bg, -cb, -bz, bf, -ca }, \ + { am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am, am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am }, \ + { cb, -bi, bu, ci, -bp, bn, -cg, -bw, bg, -bz, -cd, bk, -bs, -ck, br, -bl, ce, by, -bf, bx, cf, -bm, bq, -cj, -bt, bj, -cc, -ca, bh, -bv, -ch, bo, -bo, ch, bv, -bh, ca, cc, -bj, bt, cj, -bq, bm, -cf, -bx, bf, -by, -ce, bl, -br, ck, bs, -bk, cd, bz, -bg, bw, cg, -bn, bp, -ci, -bu, bi, -cb }, \ + { ba, -ar, av, -be, -aw, aq, -az, -bb, as, -au, bd, ax, -ap, ay, bc, -at, at, -bc, -ay, ap, -ax, -bd, au, -as, bb, az, -aq, aw, be, -av, ar, -ba, -ba, ar, -av, be, aw, -aq, az, bb, -as, au, -bd, -ax, ap, -ay, -bc, at, -at, bc, ay, -ap, ax, bd, -au, as, -bb, -az, aq, -aw, -be, av, -ar, ba }, \ 
+ { cc, -bl, bp, -cg, -by, bh, -bt, ck, bu, -bg, bx, ch, -bq, bk, -cb, -cd, bm, -bo, cf, bz, -bi, bs, -cj, -bv, bf, -bw, -ci, br, -bj, ca, ce, -bn, bn, -ce, -ca, bj, -br, ci, bw, -bf, bv, cj, -bs, bi, -bz, -cf, bo, -bm, cd, cb, -bk, bq, -ch, -bx, bg, -bu, -ck, bt, -bh, by, cg, -bp, bl, -cc }, \ + { ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac }, \ + { cd, -bo, bk, -bz, -ch, bs, -bg, bv, -ck, -bw, bh, -br, cg, ca, -bl, bn, -cc, -ce, bp, -bj, by, ci, -bt, bf, -bu, cj, bx, -bi, bq, -cf, -cb, bm, -bm, cb, cf, -bq, bi, -bx, -cj, bu, -bf, bt, -ci, -by, bj, -bp, ce, cc, -bn, bl, -ca, -cg, br, -bh, bw, ck, -bv, bg, -bs, ch, bz, -bk, bo, -cd }, \ + { bb, -au, aq, -ax, be, ay, -ar, at, -ba, -bc, av, -ap, aw, -bd, -az, as, -as, az, bd, -aw, ap, -av, bc, ba, -at, ar, -ay, -be, ax, -aq, au, -bb, -bb, au, -aq, ax, -be, -ay, ar, -at, ba, bc, -av, ap, -aw, bd, az, -as, as, -az, -bd, aw, -ap, av, -bc, -ba, at, -ar, ay, be, -ax, aq, -au, bb }, \ + { ce, -br, bf, -bs, cf, cd, -bq, bg, -bt, cg, cc, -bp, bh, -bu, ch, cb, -bo, bi, -bv, ci, ca, -bn, bj, -bw, cj, bz, -bm, bk, -bx, ck, by, -bl, bl, -by, -ck, bx, -bk, bm, -bz, -cj, bw, -bj, bn, -ca, -ci, bv, -bi, bo, -cb, -ch, bu, -bh, bp, -cc, -cg, bt, -bg, bq, -cd, -cf, bs, -bf, br, -ce }, \ + { an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an, an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an }, \ + { cf, -bu, bj, -bl, bw, -ch, -cd, bs, -bh, bn, -by, cj, cb, -bq, bf, -bp, ca, ck, -bz, bo, -bg, br, -cc, -ci, bx, -bm, bi, -bt, ce, cg, -bv, bk, -bk, bv, -cg, -ce, bt, -bi, bm, -bx, ci, cc, -br, bg, -bo, bz, -ck, -ca, bp, -bf, bq, -cb, -cj, by, -bn, bh, -bs, cd, ch, -bw, bl, -bj, bu, -cf }, \ + { bc, -ax, as, -aq, av, -ba, -be, az, -au, ap, -at, ay, -bd, -bb, aw, -ar, ar, -aw, bb, bd, -ay, at, -ap, au, -az, be, ba, -av, aq, -as, ax, -bc, -bc, ax, -as, aq, -av, ba, be, -az, au, -ap, at, -ay, bd, bb, -aw, ar, -ar, aw, -bb, -bd, ay, -at, ap, -au, az, -be, -ba, av, -aq, as, -ax, bc }, \ + { cg, -bx, bo, -bf, bn, -bw, cf, ch, -by, bp, -bg, bm, -bv, ce, ci, -bz, bq, -bh, bl, -bu, cd, cj, -ca, br, -bi, bk, -bt, cc, ck, -cb, bs, -bj, bj, -bs, cb, -ck, -cc, bt, -bk, bi, -br, ca, -cj, -cd, bu, -bl, bh, -bq, bz, -ci, -ce, bv, -bm, bg, -bp, by, -ch, -cf, bw, -bn, bf, -bo, bx, -cg }, \ + { ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag }, \ + { ch, -ca, bt, -bm, bf, -bl, bs, -bz, cg, ci, -cb, bu, -bn, bg, -bk, br, -by, cf, cj, -cc, bv, -bo, bh, -bj, bq, -bx, ce, ck, -cd, bw, -bp, bi, -bi, bp, -bw, cd, -ck, -ce, bx, -bq, bj, -bh, bo, -bv, cc, -cj, -cf, by, -br, bk, -bg, bn, -bu, cb, -ci, -cg, bz, -bs, bl, -bf, bm, -bt, ca, -ch }, \ + { bd, -ba, ax, -au, ar, -ap, as, -av, ay, -bb, be, bc, -az, aw, -at, aq, -aq, at, -aw, az, -bc, -be, bb, -ay, av, -as, ap, -ar, au, -ax, ba, -bd, -bd, ba, -ax, au, -ar, ap, -as, av, -ay, bb, -be, -bc, az, -aw, at, -aq, aq, -at, aw, -az, bc, be, -bb, ay, -av, as, -ap, ar, -au, ax, -ba, bd }, \ + 
{ ci, -cd, by, -bt, bo, -bj, bf, -bk, bp, -bu, bz, -ce, cj, ch, -cc, bx, -bs, bn, -bi, bg, -bl, bq, -bv, ca, -cf, ck, cg, -cb, bw, -br, bm, -bh, bh, -bm, br, -bw, cb, -cg, -ck, cf, -ca, bv, -bq, bl, -bg, bi, -bn, bs, -bx, cc, -ch, -cj, ce, -bz, bu, -bp, bk, -bf, bj, -bo, bt, -by, cd, -ci }, \ + { ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao, ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao }, \ + { cj, -cg, cd, -ca, bx, -bu, br, -bo, bl, -bi, bf, -bh, bk, -bn, bq, -bt, bw, -bz, cc, -cf, ci, ck, -ch, ce, -cb, by, -bv, bs, -bp, bm, -bj, bg, -bg, bj, -bm, bp, -bs, bv, -by, cb, -ce, ch, -ck, -ci, cf, -cc, bz, -bw, bt, -bq, bn, -bk, bh, -bf, bi, -bl, bo, -br, bu, -bx, ca, -cd, cg, -cj }, \ + { be, -bd, bc, -bb, ba, -az, ay, -ax, aw, -av, au, -at, as, -ar, aq, -ap, ap, -aq, ar, -as, at, -au, av, -aw, ax, -ay, az, -ba, bb, -bc, bd, -be, -be, bd, -bc, bb, -ba, az, -ay, ax, -aw, av, -au, at, -as, ar, -aq, ap, -ap, aq, -ar, as, -at, au, -av, aw, -ax, ay, -az, ba, -bb, bc, -bd, be }, \ + { ck, -cj, ci, -ch, cg, -cf, ce, -cd, cc, -cb, ca, -bz, by, -bx, bw, -bv, bu, -bt, bs, -br, bq, -bp, bo, -bn, bm, -bl, bk, -bj, bi, -bh, bg, -bf, bf, -bg, bh, -bi, bj, -bk, bl, -bm, bn, -bo, bp, -bq, br, -bs, bt, -bu, bv, -bw, bx, -by, bz, -ca, cb, -cc, cd, -ce, cf, -cg, ch, -ci, cj, -ck }, \ + } + +// DCT-8 +#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d,}, \ + { b, 0, -b, -b,}, \ + { c, -b, -d, a,}, \ + { d, -b, a, -c,}, \ +} + +#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { b, e, h, -g, -d, -a, -c, -f,}, \ + { c, h, -e, -a, -f, g, b, d,}, \ + { d, -g, -a, -h, c, e, -f, -b,}, \ + { e, -d, -f, c, g, -b, -h, a,}, \ + { f, -a, g, e, -b, h, d, -c,}, \ + { g, -c, b, -f, -h, d, -a, e,}, \ + { h, -f, d, -b, a, -c, e, -g,}, \ +} + +#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n,}, \ + { c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l,}, \ + { d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j,}, \ + { e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h,}, \ + { f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f,}, \ + { g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d,}, \ + { h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b,}, \ + { i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a,}, \ + { j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c,}, \ + { k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e,}, \ + { l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g,}, \ + { m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i,}, \ + { n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k,}, \ + { o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m,}, \ + { p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o,}, \ +} + +#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D,}, \ + { c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B,}, \ + { d, k, r, y, F, -A, -t, -m, -f, 
-b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z,}, \ + { e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x,}, \ + { f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v,}, \ + { g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t,}, \ + { h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r,}, \ + { i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p,}, \ + { j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n,}, \ + { k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l,}, \ + { l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j,}, \ + { m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h,}, \ + { n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f,}, \ + { o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d,}, \ + { p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b,}, \ + { q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a,}, \ + { r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c,}, \ + { s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, -g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e,}, \ + { t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g,}, \ + { u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i,}, \ + { v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k,}, \ + { w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m,}, \ + { x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o,}, \ + { y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q,}, \ + { z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s,}, \ + { A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u,}, \ + { B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w,}, \ + { C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y,}, \ + { D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A,}, \ + { E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C,}, \ + { F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E,}, \ +} + + +// DST-7 +#define DEFINE_DST7_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d }, \ + { c, c, 0, -c }, \ + { d, -a, -c, b }, 
\ + { b, -d, c, -a }, \ +} + +#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { c, f, h, e, b, -a, -d, -g,}, \ + { e, g, b, -c, -h, -d, a, f,}, \ + { g, c, -d, -f, a, h, b, -e,}, \ + { h, -a, -g, b, f, -c, -e, d,}, \ + { f, -e, -a, g, -d, -b, h, -c,}, \ + { d, -h, e, -a, -c, g, -f, b,}, \ + { b, -d, f, -h, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o,}, \ + { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n,}, \ + { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m,}, \ + { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l,}, \ + { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k,}, \ + { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j,}, \ + { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i,}, \ + { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h,}, \ + { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g,}, \ + { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f,}, \ + { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e,}, \ + { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d,}, \ + { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c,}, \ + { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E,}, \ + { e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D,}, \ + { g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C,}, \ + { i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B,}, \ + { k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A,}, \ + { m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z,}, \ + { o, D, t, e, -j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y,}, \ + { q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x,}, \ + { s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w,}, \ + { u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v,}, \ + { w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u,}, \ + { y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t,}, \ + { A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s,}, \ + { C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r,}, \ + { E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q,}, \ + { F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p,}, \ + { 
D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o,}, \ + { B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n,}, \ + { z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m,}, \ + { x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l,}, \ + { v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k,}, \ + { t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j,}, \ + { r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i,}, \ + { p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h,}, \ + { n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g,}, \ + { l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f,}, \ + { j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e,}, \ + { h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d,}, \ + { f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c,}, \ + { d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \ +} + + +#endif // DCT_AVX2_TABLES_H diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c new file mode 100644 index 00000000..b393bce6 --- /dev/null +++ b/src/strategies/avx2/depquant-avx2.c @@ -0,0 +1,1544 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +/* +* \file +*/ + +#include "strategies/avx2/depquant-avx2.h" +#include "strategyselector.h" + + +#if COMPILE_INTEL_AVX2 && defined X86_64 +#include "dep_quant.h" + +#include <immintrin.h> +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "transform.h" +#include "generic/quant-generic.h" +#include "uvg_math.h" +static const int32_t g_goRiceBits[4][RICEMAX] = { + { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + +static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) +{ + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + + __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); + __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); + + __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); + __m256i rd_cost_b = rd_cost_a; + __m256i rd_cost_z = rd_cost_a; + + rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); + rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); + + + if (state->all_gte_four) { + // pqDataA + // In case both levels are smaller than 4, or both are >= 4, AVX2 can be used + if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + // The coeffFracBits arrays are 6 elements long, so we need to offset the indices, and gather is the only efficient way to load the data + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + 
pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + // RD costs are 64-bit, so we need to extend the 32-bit values + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); + } + + else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + // In the original implementation the goRiceTab is selected beforehand, but since we need to load from + // potentially four different locations, we need to calculate the offsets and use gather + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i *)&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; + // AVX2 cannot be used, so we have to loop over the values normally + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + } + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); + } + + // pqDataB, same handling as for pqDataA + if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); + } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqBs[4] = {2, 2, 1, 1}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqB = pqBs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqB] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + } + rd_cost_b = + _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); + } + + if (spt == SCAN_ISCSBB) { + // This loads the values such that they are laid out as + // |State 0 Flag 0|State 0 Flag 1|State 1 Flag 0|State 1 Flag 1|State 2 Flag 0|State 2 Flag 1|State 3 Flag 0|State 3 Flag 1| + // By setting the flag 1 bits to zero we get the flag 0 values as 64-bit integers (even positions), which can be summed into the rd_cost + // The flag 1 values can be shifted right by 32, and again we have 64-bit integers holding the values (odd positions), which can be summed into the rd_cost + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i odd = _mm256_srli_epi64(original, 32); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even); + } else if (spt == SCAN_SOCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + + // Same here + __m256i m_sigFracBits_0 = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i m_sigFracBits_1 = _mm256_srli_epi64(original, 32); + + original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); + __m256i m_sbbFracBits_1 = _mm256_srli_epi64(original, 32); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); + } + else { + int num_sig_sbb; + memcpy(&num_sig_sbb, &state->m_numSigSbb[start], 4); + // numSigSbb only has values one or zero, so if all 4 values are 1 the complete value is 0x01010101 + if (num_sig_sbb == 0x01010101) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even = _mm256_and_si256(original, _mm256_set1_epi64x(0xffffffff)); + __m256i odd = _mm256_srli_epi64(original, 32); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even); + } + else if (num_sig_sbb == 0) { + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); + } + + else { + const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; + _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a); + _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b); + _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z); + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + if (state->m_numSigSbb[state_offset]) { + temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; + } else { + temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; + } + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + } + } else if (state->all_lt_four) { + __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); + __m128i max_rice = _mm_set1_epi32(31); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); + // RD cost A + 
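+ // The A and B costs below vectorize the scalar expression used in the fallback path: cost += (1 << SCALE_BITS) + goRiceTab[absLevel <= goRiceZero ? absLevel - 1 : min(absLevel, RICEMAX - 1)]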
{ + __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + // Calculate mask for pqDataA->absLevel <= state->m_goRiceZero + // The mask is the reverse of the one used in the scalar code, so the operands are in the opposite order in the blendv + __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); + + // pqDataA->absLevel < RICEMAX ? pqDataA->absLevel : RICEMAX - 1 + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + // pqDataA->absLevel - 1 + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + // Again calculate the offset for the different go_rice_tabs + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + //(1 << SCALE_BITS) + goRiceTab[selected] + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } + // RD cost B, same as RD cost A + { + __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } + // RD cost Z + { + // This time the go_rice_tab is offset only by the go_rice_zero + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); + } + } else { + const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; + const int decision_a[4] = {0, 2, 1, 3}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + const int pqA = pqAs[i]; + const int pqB = pqBs[i]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else { + rdCostZ = decisions->rdCost[decision_a[i]]; + } + } else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + temp_rd_cost_a[i] = rdCostA; + temp_rd_cost_b[i] = rdCostB; + temp_rd_cost_z[i] = rdCostZ; + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + // Reorder the costs so that the cost of state 0 is in the first element, state 1 in the second, etc. + // (216 == 0b11011000 selects the 64-bit lanes {0, 2, 1, 3}, 141 == 0b10001101 the lanes {1, 3, 0, 2}) + rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); + rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); + rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); + __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); + + __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); + __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); + __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); + __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + + // Store the data for all of the costs so that the lower 32 bits hold the coefficient magnitude and the upper 32 bits the previous state + decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); + __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); + __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); + __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); + + __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); + __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); + __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); + + __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_decision, rd_cost_z); + __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_decision, rd_cost_z, z_vs_decision); + __m256i cheaper_second_data = _mm256_blendv_epi8(decision_data, z_data, z_vs_decision); + 
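+ // Pick the overall winner per state: cheaper_first holds min(A, B) and cheaper_second holds min(Z, previous decision), so one more compare-and-blend yields the cheapest of the four candidates together with its matching data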
__m256i final_decision = _mm256_cmpgt_epi64(cheaper_second, cheaper_first); + __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_second, cheaper_first, final_decision); + __m256i final_data = _mm256_blendv_epi8(cheaper_second_data, cheaper_first_data, final_decision); + + _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); + final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); +} + + +static INLINE void checkRdCostSkipSbbZeroOut( + Decision* decision, + const all_depquant_states* const state, + int decision_id, + int skip_offset) +{ + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; +} + + +static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) +{ + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; + if (rdCost < decisions->rdCost[decision_id]) + { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; + } +} + +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int + decision_id) +{ + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; + } + else { + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block * qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, const uint32_t height_in_sbb, + const uint32_t next_sbb_right, const uint32_t next_sbb_below, + const Decision* decisions) +{ + all_depquant_states* state = &ctxs->m_allStates; + bool all_above_minus_two = true; + bool all_between_zero_and_three = true; + bool all_above_four = true; + + + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + for (int i = 0; i < 4; 
++i) { + all_above_minus_two &= decisions->prevId[i] > -2; + all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; + all_above_four &= decisions->prevId[i] >= 4; + } + if (all_above_minus_two) { + bool all_have_previous_state = true; + __m128i prev_state; + __m128i prev_state_no_offset; + __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12); + if (all_above_four) { + prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); + prev_state = _mm_add_epi32( + prev_state, + prev_state_no_offset + ); + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(state->m_absLevels[state_offset >> 2], 0, 64 * sizeof(uint8_t)); + + } else if (all_between_zero_and_three) { + prev_state_no_offset = _mm_load_si128((const __m128i*)decisions->prevId); + prev_state = _mm_add_epi32( + prev_state_no_offset, + _mm_set1_epi32(ctxs->m_prev_state_offset) + ); + // Set the high bytes to 0xff so that the shuffle will set them to zero and it won't cause problems with the min_epi32 + __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); + __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); + num_sig_sbb = _mm_or_si128( + num_sig_sbb, + _mm_min_epi32(abs_level, _mm_set1_epi32(1)) + ); + + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); + int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); + memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); + + // Shuffle so that temp_prev_state has the four previous-state bytes packed into the first 4 bytes and duplicated across the register + __m128i temp_prev_state = _mm_shuffle_epi8(prev_state_no_offset, control); + __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + // Duplicate the state all over the vector so that all 32 bytes hold the previous states + prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); + // Increment the second set by four, the third by eight and the fourth by twelve, and repeat for the second lane + __m256i temp_add = _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c, 0, 0x04040404, 0x08080808, 0x0c0c0c0c); + prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_256); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); + } + } else { + // TODO: it would be possible to do the absLevels update with AVX2 even here; we would just need to set the shuffle mask to + // 0xff for the states that don't have a previous state or whose previous state is a skip state + int prev_state_s[4] = {-1, -1, -1, -1}; + for (int i = 0; i < 4; ++i) { + const int decision_id = i; + const int curr_state_offset = state_offset + i; + if (decisions->prevId[decision_id] >= 4) { + prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } + } else if (decisions->prevId[decision_id] >= 0) { + prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; 
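+ // Regular previous state: carry over its significance flag (or set it if this decision has a nonzero level) and copy this state's lane of the interleaved absLevels array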
+ state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id]; + for (int j = 0; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][j + decisions->prevId[decision_id]]; + } + } else { + state->m_numSigSbb[curr_state_offset] = 1; + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } + all_have_previous_state = false; + } + } + prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs, 4); + + + // Update common context + __m128i last; + { + const uint32_t numSbb = width_in_sbb * height_in_sbb; + common_context* cc = &ctxs->m_common_context; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scan_pos * 4; + uint8_t* levels_in = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels + scan_pos * 4; + int previous_state_array[4]; + _mm_storeu_si128((__m128i*)previous_state_array, prev_state); + + if (all_have_previous_state) { + __m128i temp_p_state = _mm_shuffle_epi8(prev_state, control); + // Similarly to how the abs level was handled earlier, set the previous state duplicated across the lane + __m128i ref_sbb_ctx_offset = _mm_load_si128((__m128i*)ctxs->m_allStates.m_refSbbCtxId); + ref_sbb_ctx_offset = _mm_shuffle_epi8(ref_sbb_ctx_offset, temp_p_state); + // numSbb is two or four; in case it is one, this function is never called + if (numSbb <= 4) { + __m128i incremented_ref_sbb_ctx_offset = _mm_add_epi8( + ref_sbb_ctx_offset, + _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12) + ); + // In case the ref_sbb_ctx is minus one, the values need to be set to zero. This is achieved by + // first finding which states have the minus one, and then using a blend after the load to + // set the corresponding values to zero + __m128i blend_mask = _mm_cmpeq_epi8(ref_sbb_ctx_offset, _mm_set1_epi32(0xffffffff)); + __m128i sbb_flags = _mm_loadu_si128((__m128i*)cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags); + sbb_flags = _mm_shuffle_epi8(sbb_flags, incremented_ref_sbb_ctx_offset); + sbb_flags = _mm_blendv_epi8(sbb_flags, _mm_set1_epi64x(0), blend_mask); + if (numSbb == 2) { + uint64_t temp = _mm_extract_epi64(sbb_flags, 0); + memcpy(sbbFlags, &temp, 8); + } else { + _mm_storeu_si128((__m128i*)sbbFlags, sbb_flags); + } + } else { + __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); + extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); + __m256i inc_ref_state = _mm256_add_epi8( + extended_ref_state, + _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 0x08080808, 0x0c0c0c0c) + ); + // Unlike the case for two or four sbb, the blendv is used to set the shuffle mask to -1 so that + // the shuffle will set the values to zero. 
It's better to do it this way here so that the blendv is + // not called in the loop; the other case is done the other way because I implemented it first + // and only realized afterwards that this order is better + __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); + inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); + for (int i = 0; i < numSbb * 4; i += 32) { + __m256i sbb_flags = _mm256_loadu_si256((__m256i*)(&cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i])); + sbb_flags = _mm256_shuffle_epi8(sbb_flags, inc_ref_state); + _mm256_store_si256((__m256i*)&sbbFlags[i], sbb_flags); + } + } + // The first 16 values will be loaded from the previous state, so this can start from 16 + int levels_start = 16; + // Do the AVX2-optimized version for the part that is divisible by 8 (four states of 8 one-byte values) + const uint64_t limit = setCpSize & ~(8 - 1); + if (levels_start < limit) { + // Overall this is the same as the numSbb > 4 case + __m256i extended_ref_state = _mm256_zextsi128_si256(ref_sbb_ctx_offset); + extended_ref_state = _mm256_permute4x64_epi64(extended_ref_state, 0); + __m256i inc_ref_state = _mm256_add_epi8( + extended_ref_state, + _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c,0, 0x04040404, 0x08080808, 0x0c0c0c0c) + ); + __m256i blend_mask = _mm256_cmpeq_epi8(extended_ref_state, _mm256_set1_epi32(0xffffffff)); + inc_ref_state = _mm256_blendv_epi8(inc_ref_state, _mm256_set1_epi32(0xffffffff), blend_mask); + for (; levels_start < limit; levels_start += 8) { + __m256i levels_v = _mm256_loadu_si256((__m256i*)(&levels_in[levels_start * 4])); + levels_v = _mm256_shuffle_epi8(levels_v, inc_ref_state); + _mm256_store_si256((__m256i*)&levels[levels_start * 4], levels_v); + } + } + uint8_t ref_sbb[4]; + int temp_sbb_ref = _mm_extract_epi32(ref_sbb_ctx_offset, 0); + memcpy(ref_sbb, &temp_sbb_ref, 4); + // Do the excess that is not divisible by 8 + for (;levels_start < setCpSize; ++levels_start) { + uint8_t new_values[4]; + new_values[0] = ref_sbb[0] != 0xff ? levels_in[levels_start * 4 + ref_sbb[0]] : 0; + new_values[1] = ref_sbb[1] != 0xff ? levels_in[levels_start * 4 + ref_sbb[1]] : 0; + new_values[2] = ref_sbb[2] != 0xff ? levels_in[levels_start * 4 + ref_sbb[2]] : 0; + new_values[3] = ref_sbb[3] != 0xff ? levels_in[levels_start * 4 + ref_sbb[3]] : 0; + memcpy(&levels[levels_start * 4], new_values, 4); + } + + } + else { + // TODO: This could also be done using AVX2; we would just need to check both whether the previous state + // is minus one and whether the ref_sbb_ctx_id is minus one. + for (int curr_state = 0; curr_state < 4; ++curr_state) { + const int p_state = previous_state_array[curr_state]; + if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { + const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state]; + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[i * 4 + curr_state] = levels_in[i * 4 + prev_sbb]; + } + } else { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[ i * 4 + curr_state] = 0; + } + } + } + } + memcpy(levels, ctxs->m_allStates.m_absLevels[state_offset / 4], 64); + memcpy(&sbbFlags[cg_pos * 4], &ctxs->m_allStates.m_numSigSbb[state_offset], 4); + + __m128i sbb_right = next_sbb_right ? 
+ _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_right * 4])) : + _mm_set1_epi32(0); + + __m128i sbb_below = next_sbb_below ? + _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_below * 4])) : + _mm_set1_epi32(0); + + __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); + sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); + // Gather is not necessary here, but it would require at least five operations to do the same thing, + // so in my opinion the performance gain is not worth the readability loss + __m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long int *)cc->m_sbbFlagBits[0], sig_sbb, 8); + _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); + + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(&state->m_goRicePar[state_offset], 0, 4); + + uint8_t states[4] = {0, 1, 2, 3}; + memcpy(&state->m_refSbbCtxId[state_offset], states, 4); + if (all_have_previous_state) { + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); + } else { + const int temp = (state->effWidth * state->effHeight * 28) / 16; + for (int i = 0; i < 4; ++i) { + if (previous_state_array[i] != -1) { + state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; + } else { + state->m_remRegBins[i + state_offset] = temp; + } + } + } + + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg * 4; + + __m128i ones = _mm_set1_epi32(1); + __m128i fours = _mm_set1_epi32(4); + __m256i all[4]; + uint64_t temp[4]; + + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num == 0) { + temp[id % 4] = 0; + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + } + continue; + } + __m128i sum_abs = _mm_set1_epi32(0); + __m128i sum_abs_1 = _mm_set1_epi32(0); + __m128i sum_num = _mm_set1_epi32(0); + switch (nbOut->num) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[4] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones) + ) + ); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 4: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[3] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 3: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[2] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 2: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[1] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + 
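+ // Note: the cases above fall through on purpose, so each neighbour count accumulates all the positions below it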
case 1: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[0] * 4]))); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + break; + default: + assert(0); + } + sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); + sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); + __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); + template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); + __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); + temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + last = template_ctx_init; + } + } + + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][0]), all[0]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][16]), all[1]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][32]), all[2]); + _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]); + + memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); + } + // End update common context + + __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); + __m128i sum_abs1 = _mm_and_si128( + _mm_srli_epi32(last, 3), + _mm_set1_epi32(31)); + + __m128i sum_abs_min = _mm_min_epi32( + _mm_set1_epi32(3), + _mm_srli_epi32( + _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), + 1)); + + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + offsets = _mm_add_epi32(offsets, sum_abs_min); + __m256i sig_frac_bits = _mm256_i32gather_epi64((long long const*)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); + // These are 192 bits so no benefit from using avx2 + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); + } + } + else { + for (int i = 0; i < 4; i++) { + uvg_dep_quant_update_state_eos( + ctxs, + scan_pos, + cg_pos, + sigCtxOffsetNext, + gtxCtxOffsetNext, + width_in_sbb, + height_in_sbb, + next_sbb_right, + next_sbb_below, + decisions, + i); + } + } +} + +static INLINE void update_states_avx2( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag) +{ + all_depquant_states* state = &ctxs->m_allStates; + + bool all_non_negative = true; + bool all_above_minus_two = true; + bool all_minus_one = true; + for (int i = 0; i < 4; ++i) { + all_non_negative &= decisions->prevId[i] >= 0; + all_above_minus_two &= decisions->prevId[i] > -2; + all_minus_one &= decisions->prevId[i] == -1; + } + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + 
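// The four RD costs chosen by xDecide become the starting RD costs of the four new states
+ 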
_mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
+ if (all_above_minus_two) {
+
+ bool rem_reg_all_gte_4 = true;
+ bool rem_reg_all_lt4 = true;
+ __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
+
+ __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel);
+ if (all_non_negative) {
+ __m128i prv_states_o = _mm_load_si128((__m128i const*)decisions->prevId);
+ __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
+ __m128i prv_states = _mm_add_epi32(prv_states_o, prev_offset);
+ __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control);
+
+ // Only zero vs. non-zero matters for the sig_sbb values, so make sure they stay at one or zero,
+ // which allows some optimizations when handling the values in update_state_eos_avx2
+ __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb);
+ sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states);
+ __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1));
+ has_coeff = _mm_shuffle_epi8(has_coeff, control);
+ sig_sbb = _mm_or_si128(sig_sbb, has_coeff);
+ int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0);
+ memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4);
+
+ // The following two are just shuffled, and then the 4 bytes that store the values are extracted
+ __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId);
+ ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states);
+ int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0);
+ memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4);
+
+ __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar);
+ go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states);
+ int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+ memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+
+ // Again, gather is not necessary but it is easier to read and shouldn't have too large a performance hit
+ // Should be true for all gathers here
+ __m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sbbFracBits[0], prv_states, 8);
+ _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
+
+ // Next three lines: state->m_remRegBins = prvState->m_remRegBins - 1;
+ __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4);
+ __m128i ones = _mm_set1_epi32(1);
+ rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones);
+
+ __m128i reg_bins_sub = _mm_set1_epi32(0);
+ // Next two lines: (decision->absLevel < 2 ? (unsigned)decision->absLevel : 3)
+ __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2));
+ __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two);
+
+ // Depending on whether the rem_reg_bins are smaller than four or not,
+ // reg_bins_sub is either 0 or the result of the above operation
+ __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+ reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four);
+ rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub);
+ _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins);
+
+ // Save whether all rem_reg_bins are >= 4 and whether all are < 4, as both facts
+ // are needed in multiple places
+ __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+ int bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+ mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+ bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+ // This is the same as in update_state_eos_avx2
+ __m128i temp_prev_state = _mm_shuffle_epi8(prv_states_o, control);
+ __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state);
+ prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0);
+ __m256i temp_add = _mm256_setr_epi32(
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c,
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c);
+ prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add);
+ for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) {
+ __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]);
+ data = _mm256_shuffle_epi8(data, prev_state_256);
+ _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data);
+ }
+
+ // This is overall the same as for absLevels, but since the ctx values are two bytes, all of the
+ // masks have to account for that
+ __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
+ __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4, 8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16, 16);
+ prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
+ prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
+ prev_state_full = _mm256_slli_epi16(prev_state_full, 1);
+ temp_add = _mm256_setr_epi8(
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 8, 9, 8, 9, 8, 9, 8, 9,
+ 16, 17, 16, 17, 16, 17, 16, 17,
+ 24, 25, 24, 25, 24, 25, 24, 25);
+ prev_state_full = _mm256_add_epi8(prev_state_full, temp_add);
+
+ for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint16_t)))) {
+ __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i]));
+ data = _mm256_shuffle_epi8(data, prev_state_full);
+ _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data);
+ }
+ }
+ else if (all_minus_one) {
+ memset(&state->m_numSigSbb[state_offset], 1, 4);
+ memset(&state->m_refSbbCtxId[state_offset], -1, 4);
+
+ const int a = (state->effWidth * state->effHeight * 28) / 16;
+
+ __m128i rem_reg_bins = _mm_set1_epi32(a);
+ __m128i sub = _mm_blendv_epi8(
+ _mm_set1_epi32(3),
+ abs_level,
+ _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2))
+ );
+ rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub);
+ _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
+
+ __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+ int bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+ mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+ bit_mask = _mm_movemask_epi8(mask);
+ rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+ memset(state->m_absLevels[state_offset >> 2], 0, 16 * sizeof(uint8_t) * 4);
+ memset(state->m_ctxInit[state_offset >> 2], 0, 16 * sizeof(uint16_t) * 4);
+
+ }
+ else {
+ for (int i = 0; i < 4; ++i) {
+ const int decision_id = i;
+ const int state_id = state_offset + i;
+ if (decisions->prevId[decision_id] >= 0) {
+ const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
+ state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id];
+ state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState];
+ state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0];
+ state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1];
+ state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1;
+ state->m_goRicePar[state_id] = state->m_goRicePar[prvState];
+ if (state->m_remRegBins[state_id] >= 4) {
+ state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+ }
+ } else {
+ state->m_numSigSbb[state_id] = 1;
+ state->m_refSbbCtxId[state_id] = -1;
+ int ctxBinSampleRatio = 28;
+ state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+ }
+ rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4;
+ rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4;
+ }
+ {
+ // Same as for the all_non_negative case, but use blendv to set the shuffle mask to -1 for the states that do not have a previous state
+ __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
+ __m256i shuffle_mask = _mm256_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+ prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
+ prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
+ __m256i temp_add = _mm256_setr_epi32(
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c,
+ 0,
+ 0x04040404,
+ 0x08080808,
+ 0x0c0c0c0c);
+ __m256i comp_mask = _mm256_cmpeq_epi8(prev_state_full, _mm256_set1_epi64x(-1));
+ prev_state_full = _mm256_add_epi8(prev_state_full, temp_add);
+ prev_state_full = _mm256_blendv_epi8(prev_state_full, _mm256_set1_epi64x(-1), comp_mask);
+ for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) {
+ __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]);
+ data = _mm256_shuffle_epi8(data, prev_state_full);
+ _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data);
+ }
+ }
+
+ {
+ __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
+ __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4, 8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+ prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
+ prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
+ __m256i comp_mask = _mm256_cmpeq_epi8(prev_state_full, _mm256_set1_epi64x(-1));
+ prev_state_full = _mm256_slli_epi16(prev_state_full, 1);
+ __m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17, 16, 17, 16, 17, 24, 25, 24, 25, 24, 25, 24, 25);
+
+ prev_state_full = 
_mm256_add_epi8(prev_state_full, temp_add); + prev_state_full = _mm256_blendv_epi8(prev_state_full, _mm256_set1_epi64x(-1), comp_mask); + + for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { + __m256i data = _mm256_load_si256((__m256i*)(&state->m_ctxInit[(ctxs->m_prev_state_offset >> 2)][i])); + data = _mm256_shuffle_epi8(data, prev_state_full); + _mm256_store_si256((__m256i*)(&state->m_ctxInit[(state_offset >> 2)][i]), data); + } + } + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs,4); + + state->all_gte_four = rem_reg_all_gte_4; + state->all_lt_four = rem_reg_all_lt4; + + if (rem_reg_all_gte_4) { + const __m128i ones = _mm_set1_epi32(1); + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); + __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); + tinit = _mm_cvtepi16_epi32(tinit); + __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); + __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); + + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; + switch (numIPos) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32( + sum_num, + _mm_min_epi32(t, ones)); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } + case 1: { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + __m128i min_arg = _mm_min_epi32( + _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), + t + ); + sum_abs1 = _mm_add_epi32( + sum_abs1, + min_arg + ); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + } break; + default: + assert(0); + } + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + __m128i temp = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), + _mm_set1_epi32(3)); + offsets = _mm_add_epi32(offsets, temp); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); + 
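// Per lane, offsets picks entry sigCtxOffsetNext + MIN((sum_abs1 + 1) >> 1, 3) from that
+ // state's own fractional-bit table (the 12 * lane term), matching the scalar lookup further below
+ 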
_mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); + } + + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(255)); + switch (numIPos) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + // int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + __m128i sum_all = _mm_max_epi32( + _mm_min_epi32( + _mm_set1_epi32(31), + _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), + _mm_set1_epi32(0)); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + } + } + + else if (rem_reg_all_lt4) { + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; + const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); + __m128i tinit = _mm_loadu_si128((__m128i*)(&state->m_ctxInit[state_offset >> 2][tinit_offset * 4])); + tinit = _mm_cvtepi16_epi32(tinit); + __m128i sum_abs = _mm_srli_epi32(tinit, 8); + sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(255)); + switch (numIPos) { + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; + default: + assert(0); + } + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs); + __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i go_rice_par = _mm_shuffle_epi8(temp, control); + int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); + memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); + + // This cannot be vectorized because there is no way to dynamically shift values + for (int i = 0; i < 4; ++i) { + state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i]; + } + } + + } + else { + for (int i = 0; i < 4; ++i) { + const int state_id = state_offset + i; + uint8_t* levels = (uint8_t*)(state->m_absLevels[state_offset >> 2]); + if (state->m_remRegBins[state_id] >= 4) { + coeff_t tinit = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]; + coeff_t sumAbs1 = (tinit >> 3) & 31; + coeff_t sumNum = tinit & 7; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ + sumAbs1 += MIN(4 + (t & 1), t); \ + sumNum += !!t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + coeff_t sumGt1 = sumAbs1 - sumNum; + state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; + state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; + memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); + + + coeff_t sumAbs = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i] >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; + } + } else { + coeff_t sumAbs = (state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]) >> 8; +#define UPDATE(k) \ + { \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ + sumAbs += t; \ + } + switch (numIPos) { + case 5: UPDATE(4); + case 4: UPDATE(3); + case 3: UPDATE(2); + case 2: UPDATE(1); + case 1: UPDATE(0); break; + default: assert(0); + } +#undef UPDATE + if (extRiceFlag) { + assert(0 && "Not implemented for avx2"); + } else { + sumAbs = MIN(31, sumAbs); + state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; + } + state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; + } + } + } + } else { + for (int i = 0; i < 4; ++i) { + state->all_gte_four = true; + state->all_lt_four = true; + uvg_dep_quant_update_state( + ctxs, + numIPos, + scan_pos, + decisions, + sigCtxOffsetNext, + gtxCtxOffsetNext, + next_nb_info_ssb, + baseLevel, + extRiceFlag, + i); + } + } +} + +void uvg_dep_quant_decide_and_update_avx2( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +void uvg_find_first_non_zero_avx2(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, const int width, const int height) +{ + const int default_quant_coeff = dep_quant_context->m_quant->m_QScale; + const int32_t thres = dep_quant_context->m_quant->m_thresLast; + int temp = *firstTestPos; + if (enableScalingLists) { + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } else { + coeff_t thresTmp = thres / (4 * default_quant_coeff); + if (temp >= 16 && height >= 4) { + __m256i th = _mm256_set1_epi16(thresTmp); + temp -= 15; + for (; temp >= 0; temp -= 16) { + __m256i sbb_data; + if (width <= 4) { + sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); + } else if (width == 8) { + uint32_t i = scan[temp]; + __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); + __m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i+ 12]); + sbb_data = _mm256_blend_epi32(first, second, 204); + } else { + int16_t temp_d[16]; + uint32_t i = scan[temp]; + memcpy(temp_d, &srcCoeff[i], 8); + i += width; + 
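// For blocks wider than 8 the rows of a 4x4 scan group are a full stride apart,
+ // so the remaining rows are gathered into temp_d one row (four coefficients) at a time
+ 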
memcpy(temp_d + 4, &srcCoeff[i], 8);
+ i += width;
+ memcpy(temp_d + 8, &srcCoeff[i], 8);
+ i += width;
+ memcpy(temp_d + 12, &srcCoeff[i], 8);
+
+ sbb_data = _mm256_loadu_si256((__m256i const*)temp_d);
+ }
+ sbb_data = _mm256_abs_epi16(sbb_data);
+
+ __m256i a = _mm256_cmpgt_epi16(sbb_data, th);
+ if (!_mm256_testz_si256(a, a))
+ {
+ if (temp >= 0) {
+ temp += 15;
+ }
+ break;
+ }
+ }
+ }
+ for (;temp >= 0; temp--) {
+ if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
+ break;
+ }
+ }
+ }
+
+ *firstTestPos = temp;
+}
+
+
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
+{
+ bool success = true;
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+ success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2);
+ success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "avx2", 40, &uvg_find_first_non_zero_avx2);
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+
+ return success;
+}
diff --git a/src/strategies/avx2/depquant-avx2.h b/src/strategies/avx2/depquant-avx2.h
new file mode 100644
index 00000000..e6db110c
--- /dev/null
+++ b/src/strategies/avx2/depquant-avx2.h
@@ -0,0 +1,46 @@
+#ifndef STRATEGIES_DEPQUANT_AVX2_H_
+#define STRATEGIES_DEPQUANT_AVX2_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Optimizations for AVX2.
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_DEPQUANT_AVX2_H_
diff --git a/src/strategies/avx2/encode_coding_tree-avx2.h b/src/strategies/avx2/encode_coding_tree-avx2.h
index ae1845c8..ea7f077e 100644
--- a/src/strategies/avx2/encode_coding_tree-avx2.h
+++ b/src/strategies/avx2/encode_coding_tree-avx2.h
@@ -38,13 +38,14 @@
 * Functions for writing the coding quadtree and related syntax.
 */
+#include "cu.h"
 #include "encoderstate.h"
 #include "global.h"
 void uvg_encode_coeff_nxn_avx2(encoder_state_t * const state,
 cabac_data_t * const cabac,
 const coeff_t *coeff,
- uint8_t width,
+ const cu_loc_t *loc,
 uint8_t type,
 int8_t scan_mode,
 int8_t tr_skip,
diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index 53282e87..838bad91 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -42,10 +42,9 @@
 #include "strategyselector.h"
 #include "strategies/missing-intel-intrinsics.h"
-
 /**
 * \brief Generate angular predictions.
- * \param log2_width Log2 of width, range 2..5.
+ * \param cu_loc CU location and size data.
 * \param intra_mode Angular mode in range 2..34.
 * \param channel_type Color channel.
 * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1.
@@ -54,20 +53,28 @@
 * \param multi_ref_idx Reference line index for use with MRL.
 */
 static void uvg_angular_pred_avx2(
- const int_fast8_t log2_width,
+ const cu_loc_t* const cu_loc,
 const int_fast8_t intra_mode,
 const int_fast8_t channel_type,
 const uvg_pixel *const in_ref_above,
 const uvg_pixel *const in_ref_left,
 uvg_pixel *const dst,
- const uint8_t multi_ref_idx)
+ const uint8_t multi_ref_idx,
+ const uint8_t isp_mode,
+ const int cu_dim)
 {
-
- assert(log2_width >= 2 && log2_width <= 5);
+ // ISP_TODO: non-square block implementation, height is passed but not used
+ const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+ const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+ const int log2_width = uvg_g_convert_to_log2[width];
+ const int log2_height = uvg_g_convert_to_log2[height];
+
+ assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
 assert(intra_mode >= 2 && intra_mode <= 66);
 // TODO: implement handling of MRL
 uint8_t multi_ref_index = channel_type == COLOR_Y ? multi_ref_idx : 0;
+ uint8_t isp = isp_mode;
 __m256i p_shuf_01 = _mm256_setr_epi8(
 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04,
@@ -142,7 +149,6 @@ static void uvg_angular_pred_avx2(
 //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
 uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
 uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
- const int_fast32_t width = 1 << log2_width;
 int32_t pred_mode = intra_mode; // ToDo: handle WAIP
@@ -345,13 +351,13 @@ static void uvg_angular_pred_avx2(
 // PDPC
- bool PDPC_filter = (width >= 4 || channel_type != 0);
+ bool PDPC_filter = ((width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) || channel_type != 0);
 if (pred_mode > 1 && pred_mode < 67) {
 if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL.
 PDPC_filter = false;
 } else if (mode_disp > 0) {
- PDPC_filter = (scale >= 0);
+ PDPC_filter &= (scale >= 0);
 }
 }
 if(PDPC_filter) {
@@ -497,20 +503,27 @@
 /**
 * \brief Generate planar prediction.
- * \param log2_width Log2 of width, range 2..5.
+ * \param cu_loc CU location and size data. + * \param color Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. */ static void uvg_intra_pred_planar_avx2( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uint8_t *const ref_top, const uint8_t *const ref_left, uint8_t *const dst) { - assert(log2_width >= 2 && log2_width <= 5); + // ISP_TODO: non-square block implementation, height is passed but not used + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5)); - const int_fast8_t width = 1 << log2_width; const uint8_t top_right = ref_top[width + 1]; const uint8_t bottom_left = ref_left[width + 1]; @@ -964,12 +977,17 @@ static void uvg_intra_pred_filtered_dc_avx2( */ static void uvg_pdpc_planar_dc_avx2( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { + // ISP_TODO: non-square block implementation, height is passed but not used assert(mode == 0 || mode == 1); // planar or DC + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; __m256i shuf_mask_byte = _mm256_setr_epi8( 0, -1, 0, -1, 0, -1, 0, -1, diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index df90f149..26eb535e 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -716,8 +716,9 @@ SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4) static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec, const int ref_stride, const int rec_stride, - const int width) + const int width, const int height) { + assert(width == height && "Non square not yet implemented"); __m256i ssd_part; __m256i diff = _mm256_setzero_si256(); __m128i sum; @@ -1743,40 +1744,32 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t* return diff; } -static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) { - +static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride) { + // ISP_TODO: non-square block implementation, height is passed but not used __m128i diff = _mm_setzero_si128(); switch (width) { case 4: - diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[0]), diff); - diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[4]), diff); - diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[8]), diff); - diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[12]), 
diff); + for (int y = 0; y < height; y+=4) { + diff = get_residual_4x1_avx2(ref_in + y * ref_stride, pred_in + y * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 1) * ref_stride, pred_in + (y + 1) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 2) * ref_stride, pred_in + (y + 2) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 8]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 3) * ref_stride, pred_in + (y + 3) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 12]), diff); + } break; case 8: - diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[0]), diff); - diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[8]), diff); - diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[16]), diff); - diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[24]), diff); - diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[32]), diff); - diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[40]), diff); - diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[48]), diff); - diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[56]), diff); + for (int y = 0; y < height; y += 2) { + diff = get_residual_8x1_avx2(&ref_in[y * ref_stride], &pred_in[y * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y * 8]), diff); + diff = get_residual_8x1_avx2(&ref_in[(y + 1) * ref_stride], &pred_in[(y + 1) * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y*8 + 8]), diff); + } break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]); _mm_storeu_si128((__m128i*) & residual[x + y * width], diff); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 5c39fe11..cada96f1 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -380,20 +380,24 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? 
MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - uint32_t log2_tr_width = uvg_math_floor_log2(height); - uint32_t log2_tr_height = uvg_math_floor_log2(width); + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift); + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + uint32_t ac_sum = 0; int32_t last_cg = -1; @@ -402,7 +406,7 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr // Loading once is enough if scaling lists are not off __m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256(); if (!(state->encoder_control->scaling_list.enable)) { - low_b = _mm256_set1_epi32(quant_coeff[0]); + low_b = _mm256_set1_epi32(default_quant_coeff); high_b = low_b; } @@ -579,33 +583,60 @@ static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec)); } -static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){ +static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width, int height){ - switch (width) { + if (height == width || width >= 16) { + switch (width) { case 4: - *(int32_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); - *(int32_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); - *(int32_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); - *(int32_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); + *(int32_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); + *(int32_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); + *(int32_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); + *(int32_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_4x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); break; case 8: - *(int64_t*)&(rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); - *(int64_t*)&(rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); - *(int64_t*)&(rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); - *(int64_t*)&(rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * 
width, pred_in + 3 * in_stride); - *(int64_t*)&(rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride); - *(int64_t*)&(rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride); - *(int64_t*)&(rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride); - *(int64_t*)&(rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride); + *(int64_t*) & (rec_out[0 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 0 * width, pred_in + 0 * in_stride); + *(int64_t*)& (rec_out[1 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 1 * width, pred_in + 1 * in_stride); + *(int64_t*)& (rec_out[2 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 2 * width, pred_in + 2 * in_stride); + *(int64_t*)& (rec_out[3 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 3 * width, pred_in + 3 * in_stride); + *(int64_t*)& (rec_out[4 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 4 * width, pred_in + 4 * in_stride); + *(int64_t*)& (rec_out[5 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 5 * width, pred_in + 5 * in_stride); + *(int64_t*)& (rec_out[6 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 6 * width, pred_in + 6 * in_stride); + *(int64_t*)& (rec_out[7 * out_stride]) = get_quantized_recon_8x1_avx2(residual + 7 * width, pred_in + 7 * in_stride); break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { - *(int64_t*)&(rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride); - *(int64_t*)&(rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride); + *(int64_t*)& (rec_out[x + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + x + y * width, pred_in + x + y * in_stride); + *(int64_t*)& (rec_out[(x + 8) + y * out_stride]) = get_quantized_recon_8x1_avx2(residual + (x + 8) + y * width, pred_in + (x + 8) + y * in_stride); } } break; + } + } + else { + switch (width) { + case 4: + for (int y = 0; y < height; y += 4) { + *(int32_t*)& (rec_out[(y + 0) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 0) * width, pred_in + (y + 0) * in_stride); + *(int32_t*)& (rec_out[(y + 1) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 1) * width, pred_in + (y + 1) * in_stride); + *(int32_t*)& (rec_out[(y + 2) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 2) * width, pred_in + (y + 2) * in_stride); + *(int32_t*)& (rec_out[(y + 3) * out_stride]) = get_quantized_recon_4x1_avx2(residual + (y + 3) * width, pred_in + (y + 3) * in_stride); + } + break; + case 8: + for (int y = 0; y < height; ++y) { + *(int32_t*)& (rec_out[y * out_stride]) = get_quantized_recon_8x1_avx2(residual + y * width, pred_in + y * in_stride); + } + break; + default: + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int16_t val = residual[x + y * width] + pred_in[x + y * in_stride]; + rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val); + } + } + break; + } } } @@ -626,7 +657,7 @@ static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, * \returns Whether coeff_out contains any non-zero coefficients. 
*/ int uvg_quantize_residual_avx2(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uint8_t *const ref_in, const uint8_t *const pred_in, @@ -637,15 +668,15 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Temporary arrays to pass data to and from uvg_quant and transform functions. ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - - const int height = width; // TODO: height for non-square blocks + // ISP_TODO: non-square block implementation, height is passed but not used + int has_coeffs = 0; assert(width <= TR_MAX_WIDTH); assert(width >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; @@ -662,40 +693,51 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } const uint16_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform - uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } // Quantize coeffs. (coeff -> coeff_out) - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if(!use_trskip && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + color, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, color, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); + uvg_rdoq(state, coeff, coeff_out, width, height, color, + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0); } else if (state->encoder_control->cfg.rdoq_enable && use_trskip) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, color, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, color, + uvg_quant(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index); } // Check if there are any non-zero coefficients. 
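+ // (Equivalent to the scalar check "if (coeff_out[i] != 0) { has_coeffs = 1; break; }" done
+ // eight 16-bit coefficients at a time: _mm_testz_si128 is 1 only when all loaded bits are zero.)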
- for (int i = 0; i < width * width; i += 8) {
+ for (int i = 0; i < width * height; i += 8) {
 __m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(coeff_out[i]));
 has_coeffs = !_mm_testz_si128(_mm_set1_epi8(0xFF), v_quant_coeff);
 if(has_coeffs) break;
@@ -705,25 +747,25 @@
 // rec_out.
 if (has_coeffs && !early_skip) {
 // Get quantized residual. (coeff_out -> coeff -> residual)
- uvg_dequant(state, coeff_out, coeff, width, width, color,
+ uvg_dequant(state, coeff_out, coeff, width, height, color,
 cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
 if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) {
 // Inverse low frequency non-separable transform
- uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type);
+ uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode);
 }
 if (use_trskip) {
- uvg_itransformskip(state->encoder_control, residual, coeff, width);
+ uvg_itransformskip(state->encoder_control, residual, coeff, width, height);
 } else {
- uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu);
+ uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu);
 }
 if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
 int y, x;
 int sign, absval;
 int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1;
- for (y = 0; y < width; ++y) {
+ for (y = 0; y < height; ++y) {
 for (x = 0; x < width; ++x) {
 residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
 sign = residual[x + y * width] >= 0 ? 1 : -1;
@@ -739,14 +781,14 @@
 }
 // Get quantized reconstruction. (residual + pred_in -> rec_out)
- get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width);
+ get_quantized_recon_avx2(residual, pred_in, in_stride, rec_out, out_stride, width, height);
 } else if (rec_out != pred_in) {
 // With no coeffs and rec_out == pred_in we skip copying the coefficients
 // because the reconstruction is just the prediction.
int y, x;
+
- for (y = 0; y < width; ++y) {
+ for (y = 0; y < height; ++y) {
 for (x = 0; x < width; ++x) {
 rec_out[x + y * out_stride] = pred_in[x + y * in_stride];
 }
@@ -763,20 +805,26 @@
 void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
 {
 const encoder_control_t * const encoder = state->encoder_control;
+ if (encoder->cfg.dep_quant && !transform_skip) {
+ uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list);
+ return;
+ }
 int32_t shift,add,coeff_q;
 int32_t n;
- int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
+ const uint32_t log2_tr_width = uvg_g_convert_to_log2[width];
+ const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
+ int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
+ bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+ needs_block_size_trafo_scale |= 0; // Non log2 block size
 int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
 qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
- shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+ shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
 if (encoder->scaling_list.enable) {
- uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2;
- uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2;
 int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)(color); const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6]; @@ -797,7 +845,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef } } } else { - int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); + int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); __m256i v_scale = _mm256_set1_epi32(scale); @@ -845,8 +893,9 @@ static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length) return parts[0] + parts[1] + parts[2] + parts[3]; } -static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64_t weights) +static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights) { + assert((width == height) && "Non-square block handling not implemented for this function."); const __m256i zero = _mm256_setzero_si256(); const __m256i threes = _mm256_set1_epi16(3); const __m256i negate_hibytes = _mm256_set1_epi16(0xff00); @@ -863,7 +912,7 @@ static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, uint64 __m256i wts_lo = _mm256_broadcastsi128_si256(wts_lo_128); __m256i wts_hi = _mm256_broadcastsi128_si256(wts_hi_128); - for (int i = 0; i < width * width; i += 32) { + for (int i = 0; i < width * height; i += 32) { __m256i curr_lo = _mm256_loadu_si256 ((const __m256i *)(coeff + i)); __m256i curr_abs_lo = _mm256_abs_epi16 (curr_lo); __m256i curr_max3_lo = _mm256_min_epu16 (curr_abs_lo, threes); diff --git a/src/strategies/generic/dct-generic.c b/src/strategies/generic/dct-generic.c index cd05a01f..ccddf17a 100644 --- a/src/strategies/generic/dct-generic.c +++ b/src/strategies/generic/dct-generic.c @@ -771,6 +771,12 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input, // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + a, a, \ + a, -a \ +} + #define DEFINE_DCT2_P4_MATRIX(a,b,c) \ { \ a, a, a, a, \ @@ -1002,6 +1008,7 @@ static void fast_inverse_dst_4x4_generic(int8_t bitdepth, const int16_t* input, } // DCT-2 +const int16_t uvg_g_DCT2P2[4] = DEFINE_DCT2_P2_MATRIX(64); const int16_t uvg_g_DCT2P4[16] = DEFINE_DCT2_P4_MATRIX(64, 83, 36); const int16_t uvg_g_DCT2P8[64] = DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18); const int16_t uvg_g_DCT2P16[256] = DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9); @@ -1020,6 +1027,68 @@ const int16_t uvg_g_DCT8P16[256] = DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77 const int16_t uvg_g_DCT8P32[1024] = DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4); // ********************************** DCT-2 ********************************** +static void fastForwardDCT2_B2(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int32_t j; + int32_t E, O; + int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + + const int16_t* iT = uvg_g_DCT2P2; + + int16_t *p_coef = dst; + const int reduced_line = line - skip_line; + for (j = 0; j < reduced_line; j++) + { + /* E and O */ + E = src[0] + src[1]; + O = src[0] - src[1]; + + dst[0] = (iT[0] * E + add) >> shift; + dst[line] = (iT[2] * O + add) >> shift; + + + src += 2; + dst++; + } + if (skip_line) + { + dst = p_coef + reduced_line; + for (j = 0; j < 2; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_line); + dst += line; + } + } +} + +static void fastInverseDCT2_B2(const int16_t* src, int16_t* dst, int shift, int line, int skip_line, int skip_line2) +{ + int32_t j; + int32_t E, O; + int32_t add = 1 << (shift - 1); + + const int16_t* iT = uvg_g_DCT2P2; + + const int reduced_line = line - skip_line; + for (j = 0; j < reduced_line; j++) + { + /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ + E = iT[0] * (src[0] + src[line]); + O = iT[2] * (src[0] - src[line]); + + /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */ + dst[0] = (short)CLIP(-32768, 32767, (E + add) >> shift); + dst[1] = (short)CLIP(-32768, 32767, (O + add) >> shift); + + src++; + dst += 2; + } + if (skip_line) + { + memset(dst, 0, (skip_line << 1) * sizeof(int16_t)); + } +} + static void fastForwardDCT2_B4(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) { int32_t j; @@ -1366,11 +1435,6 @@ static void fastForwardDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, dst += line; } } - if (skip_line2) { - const int reduced_line = line - skip_line2; - dst = p_coef + reduced_line * 32; - memset(dst, 0, skip_line2 * 32 * sizeof(coeff_t)); - } } static void fastInverseDCT2_B32(const int16_t* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) @@ -2417,16 +2481,16 @@ DCT_MTS_NXN_GENERIC(DST1, 32); typedef void partial_tr_func(const int16_t*, int16_t*, int32_t, int, int, int); // ToDo: Enable MTS 2x2 and 64x64 transforms -static partial_tr_func* dct_table[3][5] = { - { fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL }, - { fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL }, - { fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL }, +static partial_tr_func* dct_table[3][6] = { + { fastForwardDCT2_B2, fastForwardDCT2_B4, fastForwardDCT2_B8, fastForwardDCT2_B16, fastForwardDCT2_B32, NULL }, + { NULL, fastForwardDCT8_B4, fastForwardDCT8_B8, fastForwardDCT8_B16, fastForwardDCT8_B32, NULL }, + { NULL, fastForwardDST7_B4, fastForwardDST7_B8, fastForwardDST7_B16, fastForwardDST7_B32, NULL }, }; -static partial_tr_func* idct_table[3][5] = { - { fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ }, - { fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL }, - { fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL }, +static partial_tr_func* idct_table[3][6] = { + { fastInverseDCT2_B2, fastInverseDCT2_B4, fastInverseDCT2_B8, fastInverseDCT2_B16, fastInverseDCT2_B32, NULL/*fastInverseDCT2_B64*/ }, + { NULL, fastInverseDCT8_B4, fastInverseDCT8_B8, fastInverseDCT8_B16, fastInverseDCT8_B32, NULL }, + { NULL, fastInverseDST7_B4, fastInverseDST7_B8, fastInverseDST7_B16, fastInverseDST7_B32, NULL }, }; @@ -2436,11 +2500,12 @@ static const tr_type_t mts_subset_intra[4][2] = { { DST7, DST7 }, 
{ DCT8, DST7 } void uvg_get_tr_type( int8_t width, + int8_t height, color_t color, const cu_info_t* tu, tr_type_t* hor_out, tr_type_t* ver_out, - const int8_t mts_idx) + const int8_t mts_type) { *hor_out = DCT2; *ver_out = DCT2; @@ -2450,13 +2515,19 @@ void uvg_get_tr_type( return; } - const int height = width; - const bool explicit_mts = mts_idx == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_idx == UVG_MTS_INTRA : (mts_idx == UVG_MTS_INTER && tu->type == CU_INTER)); - const bool implicit_mts = tu->type == CU_INTRA && (mts_idx == UVG_MTS_IMPLICIT || mts_idx == UVG_MTS_INTER); + const bool explicit_mts = mts_type == UVG_MTS_BOTH || (tu->type == CU_INTRA ? mts_type == UVG_MTS_INTRA : (mts_type == UVG_MTS_INTER && tu->type == CU_INTER)); + const bool implicit_mts = tu->type == CU_INTRA && (mts_type == UVG_MTS_IMPLICIT || mts_type == UVG_MTS_INTER); assert(!(explicit_mts && implicit_mts)); + const bool is_isp = tu->type == CU_INTRA && tu->intra.isp_mode && color == COLOR_Y ? tu->intra.isp_mode : 0; + const int8_t lfnst_idx = color == COLOR_Y ? tu->lfnst_idx : tu->cr_lfnst_idx; + // const bool is_sbt = cu->type == CU_INTER && tu->sbt && color == COLOR_Y; // TODO: check SBT here when implemented - if (implicit_mts) + if (is_isp && lfnst_idx) { + return; + } + + if (implicit_mts || (is_isp && explicit_mts)) { bool width_ok = width >= 4 && width <= 16; bool height_ok = height >= 4 && height <= 16; @@ -2472,6 +2543,10 @@ void uvg_get_tr_type( return; } + /* + TODO: SBT HANDLING + */ + if (explicit_mts) { if (tu->tr_idx > MTS_SKIP) { @@ -2487,27 +2562,31 @@ static void mts_dct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); - if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { - dct_func *dct_func = uvg_get_dct_func(width, color, tu->type); + dct_func *dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } else { - const int height = width; int skip_width = (type_hor != DCT2 && width == 32) ? 16 : (width > 32 ? width - 32 : 0); int skip_height = (type_ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; - if(tu->lfnst_idx || tu->cr_lfnst_idx) { + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + //const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + //const int log2_height_minus2 = uvg_g_convert_to_bit[height]; + + if((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) { if ((width == 4 && height > 4) || (width > 4 && height == 4)) { skip_width = width - 4; @@ -2520,15 +2599,20 @@ static void mts_dct_generic( } } - partial_tr_func* dct_hor = dct_table[type_hor][log2_width_minus2]; - partial_tr_func* dct_ver = dct_table[type_ver][log2_width_minus2]; + partial_tr_func* dct_hor = width != 1 ? dct_table[type_hor][log2_width_minus1] : NULL; + partial_tr_func* dct_ver = height != 1 ? 
dct_table[type_ver][log2_height_minus1] : NULL; int16_t tmp[32 * 32]; - const int32_t shift_1st = log2_width_minus2 + bitdepth - 7; - const int32_t shift_2nd = log2_width_minus2 + 8; - - dct_hor(input, tmp, shift_1st, height, 0, skip_width); - dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); + const int32_t shift_1st = log2_width_minus1 + bitdepth - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + if (height == 1) { + dct_hor(input, output, shift_1st, height, 0, skip_width); + } else if (width == 1) { + dct_ver(input, output, log2_height_minus1 + 1 + bitdepth + 6 - 15, width, 0, skip_height); + } else { + dct_hor(input, tmp, shift_1st, height, 0, skip_width); + dct_ver(tmp, output, shift_2nd, width, skip_width, skip_height); + } } } @@ -2538,36 +2622,57 @@ static void mts_idct_generic( const color_t color, const cu_info_t* tu, const int8_t width, + const int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx) + const int8_t mts_type) { tr_type_t type_hor; tr_type_t type_ver; - uvg_get_tr_type(width, color, tu, &type_hor, &type_ver, mts_idx); + uvg_get_tr_type(width, height, color, tu, &type_hor, &type_ver, mts_type); - if (type_hor == DCT2 && type_ver == DCT2) + if (type_hor == DCT2 && type_ver == DCT2 && !tu->lfnst_idx && !tu->cr_lfnst_idx && width == height) { - dct_func *idct_func = uvg_get_idct_func(width, color, tu->type); + dct_func *idct_func = uvg_get_idct_func(width, height, color, tu->type); idct_func(bitdepth, input, output); } else { - const int height = width; - const int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; - const int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0; - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + int skip_width = (type_hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (type_ver != DCT2 && height == 32) ? 16 : height > 32 ? height - 32 : 0; + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - partial_tr_func* idct_hor = idct_table[type_hor][log2_width_minus2]; - partial_tr_func* idct_ver = idct_table[type_ver][log2_width_minus2]; + if ((tu->lfnst_idx && color == COLOR_Y) || (tu->cr_lfnst_idx && color != COLOR_Y)) { + if ((width == 4 && height > 4) || (width > 4 && height == 4)) { + skip_width = width - 4; + skip_height = height - 4; + } + else if ((width >= 8 && height >= 8)) { + skip_width = width - 8; + skip_height = height - 8; + } + } + + partial_tr_func* idct_hor = width != 1 ? idct_table[type_hor][log2_width_minus1] : NULL; + partial_tr_func* idct_ver = height != 1 ? 
idct_table[type_ver][log2_height_minus1] : NULL;

     int16_t tmp[32 * 32];
-    const int32_t shift_1st = 7;
-    const int32_t shift_2nd = 20 - bitdepth;
+    const int max_log2_tr_dynamic_range = 15;
+    const int transform_matrix_shift = 6;

-    idct_ver(input, tmp, shift_1st, width, skip_width, skip_height);
-    idct_hor(tmp, output, shift_2nd, height, 0, skip_width);
+    const int32_t shift_1st = transform_matrix_shift + 1;
+    const int32_t shift_2nd = (transform_matrix_shift + max_log2_tr_dynamic_range - 1) - bitdepth;
+
+    if (height == 1) {
+      idct_hor(input, output, shift_2nd + 1, height, 0, skip_width);
+    } else if (width == 1) {
+      idct_ver(input, output, shift_2nd + 1, width, 0, skip_height);
+    } else {
+      idct_ver(input, tmp, shift_1st, width, skip_width, skip_height);
+      idct_hor(tmp, output, shift_2nd, height, 0, skip_width);
+    }
   }
 }
@@ -2582,6 +2687,7 @@ int uvg_strategy_register_dct_generic(void* opaque, uint8_t bitdepth)
   success &= uvg_strategyselector_register(opaque, "dct_8x8", "generic", 0, &dct_8x8_generic);
   success &= uvg_strategyselector_register(opaque, "dct_16x16", "generic", 0, &dct_16x16_generic);
   success &= uvg_strategyselector_register(opaque, "dct_32x32", "generic", 0, &dct_32x32_generic);
+  //success &= uvg_strategyselector_register(opaque, "dct_non_square", "generic", 0, &dct_non_square_generic);
   success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "generic", 0, &fast_inverse_dst_4x4_generic);
diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c
new file mode 100644
index 00000000..b15ef52b
--- /dev/null
+++ b/src/strategies/generic/depquant-generic.c
@@ -0,0 +1,252 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+#include "strategies/generic/depquant-generic.h"
+
+#include "dep_quant.h"
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "strategyselector.h"
+#include "transform.h"
+#include "uvg_math.h"
+#include "generic/quant-generic.h"
+static const int32_t g_goRiceBits[4][RICEMAX] = {
+  {32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144,
+   327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216,
+   393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752,
+   458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+  {65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840,
+   196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912,
+   360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448,
+   425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+  {98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072,
+   163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144,
+   327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
+  {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072,
+   163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840,
+   196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
+};
+
+
+static INLINE void checkRdCostSkipSbbZeroOut(
+  Decision* decision,
+  const all_depquant_states* const state,
+  int decision_id,
+  int skip_offset) {
+  int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0];
+  decision->rdCost[decision_id] = rdCost;
+  decision->absLevel[decision_id] = 0;
+  decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset];
+}
+
+static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
+{
+  int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0];
+  if (rdCost < decisions->rdCost[decision_id])
+  {
+    decisions->rdCost[decision_id] = rdCost;
+    decisions->absLevel[decision_id] = 0;
+    decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id];
+  }
+}
+
+static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int
+  decision_id)
+{
+  int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset;
+  if (pqData->absLevel[decision_id] < 4) {
+    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
+  }
+  else {
+    const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1;
+    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] +
+      g_goRiceBits[state->m_goRicePar][value < RICEMAX ?
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block* qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void uvg_dep_quant_decide_and_update_generic( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* 
decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 0); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 1); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 2); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 3); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, const context_store * const dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height) +{ + const int default_quant_coeff = dep_quant_context->m_quant->m_QScale; + const int32_t thres = dep_quant_context->m_quant->m_thresLast; + int temp = *firstTestPos; + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = (enableScalingLists) ? 
(thres / (4 * q_coeff[scan[(temp)]])) : (thres / (4 * default_quant_coeff));
+    if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
+      break;
+    }
+  }
+  *firstTestPos = temp;
+}
+
+int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
+{
+  bool success = true;
+
+  success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic);
+  success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic);
+
+  return success;
+}
diff --git a/src/strategies/generic/depquant-generic.h b/src/strategies/generic/depquant-generic.h
new file mode 100644
index 00000000..488963be
--- /dev/null
+++ b/src/strategies/generic/depquant-generic.h
@@ -0,0 +1,50 @@
+#ifndef STRATEGIES_DEPQUANT_GENERIC_H_
+#define STRATEGIES_DEPQUANT_GENERIC_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Generic C implementations of optimized functions.
+ */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "tables.h" + + +int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_DEPQUANT_GENERIC_H_ diff --git a/src/strategies/generic/encode_coding_tree-generic.c b/src/strategies/generic/encode_coding_tree-generic.c index 67685f2f..c3065903 100644 --- a/src/strategies/generic/encode_coding_tree-generic.c +++ b/src/strategies/generic/encode_coding_tree-generic.c @@ -54,11 +54,16 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, + const cu_loc_t * const cu_loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, - double* bits_out) { + double* bits_out) +{ + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; //const encoder_control_t * const encoder = state->encoder_control; //int c1 = 1; @@ -75,12 +80,12 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, // CONSTANTS - const int height = width; // TODO: height for non-square blocks. - const uint32_t log2_block_size = uvg_g_convert_to_bit[width]+2; - const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_size][log2_block_size][0] + uvg_g_log2_sbb_size[log2_block_size][log2_block_size][1]; - const uint32_t *scan = - uvg_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 1][scan_mode]; + const uint8_t log2_block_width = uvg_g_convert_to_log2[width]; + const uint8_t log2_block_height = uvg_g_convert_to_log2[height]; + + const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height); + const uint32_t* const scan_cg = uvg_get_scan_order_table(SCAN_GROUP_UNGROUPED, scan_mode, log2_block_width, log2_block_height); // Init base contexts according to block type @@ -90,12 +95,13 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, unsigned scan_cg_last = (unsigned)-1; unsigned scan_pos_last = (unsigned)-1; - for (int i = 0; i < width * width; i++) { + for (int i = 0; i < (width * height); ++i) { if (coeff[scan[i]]) { scan_pos_last = i; sig_coeffgroup_flag[scan_cg[i >> log2_cg_size]] = 1; } } + scan_cg_last = scan_pos_last >> log2_cg_size; int pos_last = scan[scan_pos_last]; @@ -120,28 +126,33 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, last_coeff_x, last_coeff_y, width, - width, + height, color, scan_mode, bits_out); - uint32_t quant_state_transition_table = 0; //ToDo: dep quant enable changes this + uint32_t quant_state_transition_table = state->encoder_control->cfg.dep_quant ? 
32040 : 0; int32_t quant_state = 0; uint8_t ctx_offset[16]; int32_t temp_diag = -1; int32_t temp_sum = -1; - int32_t reg_bins = (width*width * 28) >> 4; //8 for 2x2 + int32_t reg_bins = (width * height * 28) >> 4; //8 for 2x2 // significant_coeff_flag for (i = scan_cg_last; i >= 0; i--) { //int32_t abs_coeff[64*64]; + const uint32_t log2_cg_width = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0]; + const uint32_t log2_cg_height = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1]; + const uint32_t cg_width = (MIN((uint8_t)TR_MAX_WIDTH, width) >> log2_cg_width); + const uint32_t cg_height = (MIN((uint8_t)TR_MAX_WIDTH, height) >> log2_cg_height); int32_t cg_blk_pos = scan_cg[i]; - int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> (log2_cg_size / 2)); - int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); + int32_t cg_pos_y = cg_blk_pos / (MIN((uint8_t)32, width) >> log2_cg_width); + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * (MIN((uint8_t)32, width) >> log2_cg_width)); + // !!! residual_coding_subblock() !!! @@ -151,7 +162,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, } else { uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); uint32_t ctx_sig = uvg_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, (MIN((uint8_t)32, width) >> (log2_cg_size / 2))); + cg_pos_y, cg_width, cg_height); CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "significant_coeffgroup_flag"); } @@ -182,7 +193,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, sig = (coeff[blk_pos] != 0) ? 1 : 0; if (num_non_zero || next_sig_pos != infer_sig_pos) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); cabac_ctx_t* sig_ctx_luma = &(cabac->ctx.cu_sig_model_luma[MAX(0, (quant_state - 1))][ctx_sig]); cabac_ctx_t* sig_ctx_chroma = &(cabac->ctx.cu_sig_model_chroma[MAX(0, (quant_state - 1))][MIN(ctx_sig,7)]); @@ -190,7 +201,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, reg_bins--; } else if (next_sig_pos != scan_pos_last) { - ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, width, color, &temp_diag, &temp_sum); + ctx_sig = uvg_context_get_sig_ctx_idx_abs(coeff, pos_x, pos_y, width, height, color, &temp_diag, &temp_sum); } @@ -256,7 +267,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, blk_pos = scan[scan_pos]; pos_y = blk_pos / width; pos_x = blk_pos - (pos_y * width); - int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 4); + int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 4); rice_param = g_go_rice_pars[abs_sum]; uint32_t second_pass_abs_coeff = abs(coeff[blk_pos]); @@ -274,7 +285,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, pos_y = blk_pos / width; pos_x = blk_pos - (pos_y * width); uint32_t coeff_abs = abs(coeff[blk_pos]); - int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, width, 0); + int32_t abs_sum = uvg_abs_sum(coeff, pos_x, pos_y, width, height, 0); rice_param = g_go_rice_pars[abs_sum]; pos0 = ((quant_state<2)?1:2) << rice_param; uint32_t remainder = (coeff_abs == 0 ? pos0 : coeff_abs <= pos0 ? 
coeff_abs - 1 : coeff_abs); @@ -291,7 +302,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, uint32_t num_signs = num_non_zero; - if (state->encoder_control->cfg.signhide_enable && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) { + if (state->encoder_control->cfg.signhide_enable && !state->encoder_control->cfg.dep_quant && (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4)) { num_signs--; coeff_signs >>= 1; } diff --git a/src/strategies/generic/encode_coding_tree-generic.h b/src/strategies/generic/encode_coding_tree-generic.h index 8cfe497d..0de02e3c 100644 --- a/src/strategies/generic/encode_coding_tree-generic.h +++ b/src/strategies/generic/encode_coding_tree-generic.h @@ -44,7 +44,7 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, + const cu_loc_t * const loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 35494b99..398388fc 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -34,6 +34,7 @@ #include +#include "cu.h" #include "intra.h" #include "uvg266.h" #include "strategyselector.h" @@ -42,25 +43,32 @@ /** * \brief Generate angular predictions. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. * \param intra_mode Angular mode in range 2..34. + * \param channel_type Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. - * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. + * \param in_ref_left Pointer to -1 index of left reference, length=height*2+1. * \param dst Buffer of size width*width. * \param multi_ref_idx Multi reference line index for use with MRL. */ static void uvg_angular_pred_generic( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx) + const uint8_t multi_ref_idx, + const uint8_t isp_mode, + const int cu_dim) { + int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; - assert(log2_width >= 2 && log2_width <= 5); - assert(intra_mode >= 2 && intra_mode <= 66); + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); + // assert(intra_mode >= 2 && intra_mode <= 66); static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp @@ -105,126 +113,105 @@ static void uvg_angular_pred_generic( // Temporary buffer for modes 11-25. // It only needs to be big enough to hold indices from -width to width-1. 
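Reviewer note on the scratch buffers introduced just below: with rectangular blocks, a horizontal-mode prediction can no longer be transposed in place inside dst, so this hunk renders through a `work` pointer that aliases `dst` for square or vertical cases and points at the `temp_dst` scratch area for rectangular horizontal modes. The final flip, extracted verbatim from the tail of this function for readability (width and height have already been swapped for horizontal modes at this point; square blocks instead transpose in place with SWAP):

    /* Rectangular horizontal mode: `work` holds the prediction with the
     * axes swapped; write it into dst transposed. */
    for (int y = 0; y < width; ++y) {
      for (int x = 0; x < height; ++x) {
        dst[x + y * height] = work[y + x * width];
      }
    }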
+ uvg_pixel temp_dst[TR_MAX_WIDTH * TR_MAX_WIDTH]; + + // TODO: check the correct size for these arrays when MRL is used //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; - const int_fast32_t width = 1 << log2_width; + uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; + uvg_pixel temp_left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 }; uint32_t pred_mode = intra_mode; // ToDo: handle WAIP uint8_t multi_ref_index = multi_ref_idx; + uint8_t isp = isp_mode; // Whether to swap references to always project on the left reference row. const bool vertical_mode = intra_mode >= 34; // Modes distance to horizontal or vertical mode. const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -((int32_t)pred_mode - 18); - //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode; // Sample displacement per column in fractions of 32. - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; + const int16_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - // TODO: replace latter width with height - int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]); + const int side_size = vertical_mode ? log2_height : log2_width; + int scale = MIN(2, side_size - pre_scale[abs(mode_disp)]); // Pointer for the reference we are interpolating from. uvg_pixel *ref_main; // Pointer for the other reference. const uvg_pixel *ref_side; + uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst; + + const int top_ref_length = isp_mode == ISP_MODE_VER ? width + cu_dim : width << 1; + const int left_ref_length = isp_mode == ISP_MODE_HOR ? height + cu_dim : height << 1; // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { + memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel)); - // TODO: for non-square blocks, separate loops for x and y is needed - for (int i = 0; i <= width + 1 + multi_ref_index; i++) { - temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); + ref_main = vertical_mode ? temp_above + height : temp_left + width; + ref_side = vertical_mode ? temp_left + width : temp_above + height; + + int size_side = vertical_mode ? height : width; + for (int i = -size_side; i <= -1; i++) { + ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; } - - // TODO: take into account non-square blocks - ref_main = temp_main + width; - ref_side = temp_side + width; - - // TODO: for non square blocks, need to check if width or height is used for reference extension - for (int i = -width; i <= -1; i++) { - ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)]; - } - - //const uint32_t index_offset = width + 1; - //const int32_t last_index = width; - //const int_fast32_t most_negative_index = (width * sample_disp) >> 5; - //// Negative sample_disp means, we need to use both references. - - //// TODO: update refs to take into account variating block size and shapes - //// (height is not always equal to width) - //ref_side = (vertical_mode ? 
in_ref_left : in_ref_above) + 1; - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - - //// Move the reference pixels to start from the middle to the later half of - //// the tmp_ref, so there is room for negative indices. - //for (int_fast32_t x = -1; x < width; ++x) { - // tmp_ref[x + index_offset] = ref_main[x]; - //} - //// Get a pointer to block index 0 in tmp_ref. - //ref_main = &tmp_ref[index_offset]; - //tmp_ref[index_offset -1] = tmp_ref[index_offset]; - - //// Extend the side reference to the negative indices of main reference. - //int_fast32_t col_sample_disp = 128; // rounding for the ">> 8" - //int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)]; - //// TODO: add 'vertical_mode ? height : width' instead of 'width' - // - //for (int_fast32_t x = -1; x > most_negative_index; x--) { - // col_sample_disp += inv_abs_sample_disp; - // int_fast32_t side_index = col_sample_disp >> 8; - // tmp_ref[x + index_offset - 1] = ref_side[side_index - 1]; - //} - //tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1]; - //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset]; } else { - - // TODO: again, separate loop needed for non-square blocks - for (int i = 0; i <= (width << 1) + multi_ref_index; i++) { - temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - } + memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); + memcpy(&temp_left[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - // TODO: this code block will need to change also when non-square blocks are used - // const int log2_ratio = 0; - const int s = 0; + ref_main = vertical_mode ? temp_above : temp_left; + ref_side = vertical_mode ? temp_left : temp_above; + + const int log2_ratio = log2_width - log2_height; + const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); const int max_index = (multi_ref_index << s) + 2; - const int ref_length = width << 1; - const uvg_pixel val = temp_main[ref_length + multi_ref_index]; - for (int j = 1; j <= max_index; j++) { - temp_main[ref_length + multi_ref_index + j] = val; + int ref_length; + if (isp_mode) { + ref_length = vertical_mode ? top_ref_length : left_ref_length; + } + else { + ref_length = vertical_mode ? width << 1 : height << 1; + } + const uvg_pixel val = ref_main[ref_length + multi_ref_index]; + for (int j = 1; j <= max_index; j++) { + ref_main[ref_length + multi_ref_index + j] = val; } - - ref_main = temp_main; - ref_side = temp_side; - //// sample_disp >= 0 means we don't need to refer to negative indices, - //// which means we can just use the references as is. - //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1; - //ref_side = (vertical_mode ? 
in_ref_left : in_ref_above) + 1; - - //memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(uvg_pixel)); - //ref_main = &tmp_ref[width]; - //tmp_ref[width-1] = tmp_ref[width]; - //int8_t last_index = 1 + width*2; - //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1]; } + // compensate for line offset in reference line buffers ref_main += multi_ref_index; ref_side += multi_ref_index; + if (!vertical_mode) { SWAP(width, height, int) } if (sample_disp != 0) { + bool use_cubic = true; // Default to cubic filter + static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; + int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1]; + int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); + if (dist_from_vert_or_hor > filter_threshold) { + if ((abs(sample_disp) & 0x1F) != 0) + { + use_cubic = false; + } + } + // Cubic must be used if ref line != 0 or if isp mode is != 0 + if (multi_ref_index || isp) { + use_cubic = true; + } // The mode is not horizontal or vertical, we have to do interpolation. - for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < width; ++y, delta_pos += sample_disp) { + for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) { + int_fast32_t delta_int = delta_pos >> 5; int_fast32_t delta_fract = delta_pos & (32 - 1); + const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; + int16_t const* const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff; if ((abs(sample_disp) & 0x1F) != 0) { @@ -232,25 +219,7 @@ static void uvg_angular_pred_generic( if (channel_type == 0) { int32_t ref_main_index = delta_int; uvg_pixel p[4]; - bool use_cubic = true; // Default to cubic filter - static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; - int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); - if (dist_from_vert_or_hor > filter_threshold) { - static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - if ((abs(sample_disp) & 0x1F) != 0) - { - use_cubic = false; - } - } - // Cubic must be used if ref line != 0 - if (multi_ref_index) { - use_cubic = true; - } - const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 }; - int16_t const * const f = use_cubic ? 
cubic_filter[delta_fract] : filter_coeff; + // Do 4-tap intra interpolation filtering for (int_fast32_t x = 0; x < width; x++, ref_main_index++) { p[0] = ref_main[ref_main_index]; @@ -258,7 +227,7 @@ static void uvg_angular_pred_generic( p[2] = ref_main[ref_main_index + 2]; p[3] = ref_main[ref_main_index + 3]; - dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); + work[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6); } } @@ -268,99 +237,79 @@ static void uvg_angular_pred_generic( for (int_fast32_t x = 0; x < width; ++x) { uvg_pixel ref1 = ref_main[x + delta_int + 1]; uvg_pixel ref2 = ref_main[x + delta_int + 2]; - dst[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); + work[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5); } } } else { // Just copy the integer samples for (int_fast32_t x = 0; x < width; x++) { - dst[y * width + x] = ref_main[x + delta_int + 1]; + work[y * width + x] = ref_main[x + delta_int + 1]; } } // PDPC - bool PDPC_filter = (width >= 4 || channel_type != 0); + bool PDPC_filter = (width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH) && multi_ref_index == 0; if (pred_mode > 1 && pred_mode < 67) { if (mode_disp < 0 || multi_ref_index) { // Cannot be used with MRL. PDPC_filter = false; } else if (mode_disp > 0) { - PDPC_filter = (scale >= 0); + PDPC_filter &= (scale >= 0); } } if(PDPC_filter) { - int inv_angle_sum = 256; + int inv_angle_sum = 256; for (int x = 0; x < MIN(3 << scale, width); x++) { inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)]; int wL = 32 >> (2 * x >> scale); const uvg_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1]; - dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6); + work[y * width + x] = work[y * width + x] + ((wL * (left - work[y * width + x]) + 32) >> 6); } } - - /* - if (pred_mode == 2 || pred_mode == 66) { - int wT = 16 >> MIN(31, ((y << 1) >> scale)); - for (int x = 0; x < width; x++) { - int wL = 16 >> MIN(31, ((x << 1) >> scale)); - if (wT + wL == 0) break; - int c = x + y + 1; - if (c >= 2 * width) { wL = 0; } - if (c >= 2 * width) { wT = 0; } - const uvg_pixel left = (wL != 0) ? ref_side[c] : 0; - const uvg_pixel top = (wT != 0) ? ref_main[c] : 0; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6); - } - } else if (sample_disp == 0 || sample_disp >= 12) { - int inv_angle_sum_0 = 2; - for (int x = 0; x < width; x++) { - inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)]; - int delta_pos_0 = inv_angle_sum_0 >> 2; - int delta_frac_0 = delta_pos_0 & 63; - int delta_int_0 = delta_pos_0 >> 6; - int delta_y = y + delta_int_0 + 1; - // TODO: convert to JVET_K0500_WAIP - if (delta_y > width + width - 1) break; - - int wL = 32 >> MIN(31, ((x << 1) >> scale)); - if (wL == 0) break; - const uvg_pixel *p = ref_side + delta_y - 1; - uvg_pixel left = p[delta_frac_0 >> 5]; - dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6); - } - }*/ } } else { // Mode is horizontal or vertical, just copy the pixels. 
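For orientation before the pure horizontal/vertical branch below: that branch derives scale as (log2_width + log2_height - 2) >> 2 and a left-reference blend weight wL = 32 >> ((2 * x) >> scale). A worked example, assuming an 8x8 luma block (the block size is an assumption for illustration only):

    /* 8x8 block: scale = (3 + 3 - 2) >> 2 = 1, so only the first
     * MIN(3 << scale, width) = 6 columns are filtered, and the weight
     * halves per column:
     *   x : 0   1   2   3   4   5    (columns 6..7 untouched)
     *   wL: 32  16  8   4   2   1
     * Each filtered sample p then becomes
     *   p + ((wL * (left - top_left) + 32) >> 6). */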
+ + // Do not apply PDPC if multi ref line index is other than 0 + // TODO: do not do PDPC if block is in BDPCM mode + bool do_pdpc = ((width >= 4 && height >= 4) && sample_disp >= 0 && multi_ref_index == 0 /*&& !bdpcm*/); - // TODO: update outer loop to use height instead of width - for (int_fast32_t y = 0; y < width; ++y) { - for (int_fast32_t x = 0; x < width; ++x) { - dst[y * width + x] = ref_main[x + 1]; - } - // Do not apply PDPC if multi ref line index is other than 0 - if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) { - int scale = (log2_width + log2_width - 2) >> 2; - const uvg_pixel top_left = ref_main[0]; + if (do_pdpc) { + int scale = (log2_width + log2_height - 2) >> 2; + const uvg_pixel top_left = ref_main[0]; + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel)); const uvg_pixel left = ref_side[1 + y]; - for (int i = 0; i < MIN(3 << scale, width); i++) { - const int wL = 32 >> (2 * i >> scale); - const uvg_pixel val = dst[y * width + i]; - dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); + for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) { + const int wL = 32 >> (2 * x >> scale); + const uvg_pixel val = work[y * width + x]; + work[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6)); } } + } else { + for (int_fast32_t y = 0; y < height; ++y) { + memcpy(&work[y * width], &ref_main[1], width * sizeof(uvg_pixel)); + } } } // Flip the block if this is was a horizontal mode. if (!vertical_mode) { - for (int_fast32_t y = 0; y < width - 1; ++y) { - for (int_fast32_t x = y + 1; x < width; ++x) { - SWAP(dst[y * width + x], dst[x * width + y], uvg_pixel); + if(width == height) { + for (int_fast32_t y = 0; y < height - 1; ++y) { + for (int_fast32_t x = y + 1; x < width; ++x) { + SWAP(work[y * height + x], work[x * width + y], uvg_pixel); + } + } + } else { + for(int y = 0; y < width; ++y) { + for(int x = 0; x < height; ++x) { + dst[x + y * height] = work[y + x * width]; + } } } } @@ -369,23 +318,32 @@ static void uvg_angular_pred_generic( /** * \brief Generate planar prediction. - * \param log2_width Log2 of width, range 2..5. + * \param cu_loc CU location and size data. + * \param color Color channel. * \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. * \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. * \param dst Buffer of size width*width. */ static void uvg_intra_pred_planar_generic( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const dst) { - // TODO: Add height - assert(log2_width >= 2 && log2_width <= 5); + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; + + const int offset = 1 << (log2_width + log2_height); + const int final_shift = 1 + log2_width + log2_height; + + // If ISP is enabled log_dim 1 is possible (limit was previously 2) + assert((log2_width >= 2 && log2_width <= 5) && log2_height <= 5); - const int_fast8_t width = 1 << log2_width; const uvg_pixel top_right = ref_top[width + 1]; - const uvg_pixel bottom_left = ref_left[width + 1]; + const uvg_pixel bottom_left = ref_left[height + 1]; #if 0 // Unoptimized version for reference. 
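The hunk that follows generalizes planar prediction to rectangular blocks. As a condensed restatement of what its incremental loop computes (T/L are the above/left reference samples, TR/BL the top-right and bottom-left corner samples; this mirrors the VVC planar definition visible in the code, not new behavior):

    /* ver(x, y)  = (H - 1 - y) * T[x] + (y + 1) * BL
     * hor(x, y)  = (W - 1 - x) * L[y] + (x + 1) * TR
     * pred(x, y) = (ver * W + hor * H + W * H) >> (log2_width + log2_height + 1)
     *
     * The patch keeps these terms as running sums: top[x] gains
     * bottom[x] = BL - T[x] once per row and hor gains
     * right[y] = TR - L[y] once per column, so the inner loop needs only
     * adds and shifts, no per-pixel multiplies. */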
@@ -397,18 +355,27 @@ static void uvg_intra_pred_planar_generic( } } #else - int_fast16_t top[32]; + // TODO: get rid of magic numbers. Make a define for this + int_fast16_t top[64]; + int_fast16_t bottom[64]; + int_fast16_t left[64]; + int_fast16_t right[64]; for (int i = 0; i < width; ++i) { - top[i] = ref_top[i + 1] << log2_width; + bottom[i] = bottom_left - ref_top[i + 1]; + top[i] = ref_top[i + 1] << log2_height; } - for (int y = 0; y < width; ++y) { - int_fast16_t hor = (ref_left[y + 1] << log2_width) + width; + for (int j = 0; j < height; ++j) { + right[j] = top_right - ref_left[j + 1]; + left[j] = ref_left[j + 1] << log2_width; + } + + for (int y = 0; y < height; ++y) { + int_fast16_t hor = left[y]; for (int x = 0; x < width; ++x) { - hor += top_right - ref_left[y + 1]; - top[x] += bottom_left - ref_top[x + 1]; - dst[y * width + x] = (hor + top[x]) >> (log2_width + 1); - // + hor += right[y]; + top[x] += bottom[x]; + dst[y * width + x] = ((hor << log2_height) + (top[x] << log2_width) + offset) >> final_shift; } } #endif @@ -461,25 +428,26 @@ static void uvg_intra_pred_filtered_dc_generic( /** * \brief Position Dependent Prediction Combination for Planar and DC modes. -* \param log2_width Log2 of width, range 2..5. -* \param width Block width matching log2_width. +* \param cu_loc CU location and size data. * \param used_ref Pointer used reference pixel struct. * \param dst Buffer of size width*width. */ static void uvg_pdpc_planar_dc_generic( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst) { assert(mode == 0 || mode == 1); // planar or DC + const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int height = color == COLOR_Y ? 
cu_loc->height : cu_loc->chroma_height; + const int log2_width = uvg_g_convert_to_log2[width]; + const int log2_height = uvg_g_convert_to_log2[height]; - // TODO: replace latter log2_width with log2_height - const int scale = ((log2_width - 2 + log2_width - 2 + 2) >> 2); + const int scale = (log2_width + log2_height - 2) >> 2; - // TODO: replace width with height - for (int y = 0; y < width; y++) { + for (int y = 0; y < height; y++) { int wT = 32 >> MIN(31, ((y << 1) >> scale)); for (int x = 0; x < width; x++) { int wL = 32 >> MIN(31, ((x << 1) >> scale)); diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c index 817befed..5e06ebbe 100644 --- a/src/strategies/generic/picture-generic.c +++ b/src/strategies/generic/picture-generic.c @@ -32,6 +32,7 @@ #include "strategies/generic/picture-generic.h" +#include #include #include "strategies/strategies-picture.h" @@ -474,6 +475,577 @@ SATD_DUAL_NXN(64, uvg_pixel) SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4) +static uint64_t xCalcHADs2x2(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + uint64_t satd = 0; + coeff_t diff[4], m[4]; + + diff[0] = piOrg[0] - piCur[0]; + diff[1] = piOrg[1] - piCur[1]; + diff[2] = piOrg[iStrideOrg] - piCur[0 + iStrideCur]; + diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur]; + m[0] = diff[0] + diff[2]; + m[1] = diff[1] + diff[3]; + m[2] = diff[0] - diff[2]; + m[3] = diff[1] - diff[3]; + + satd += abs(m[0] + m[1]) >> 2; + satd += abs(m[0] - m[1]); + satd += abs(m[2] + m[3]); + satd += abs(m[2] - m[3]); + + return satd; +} + + +static uint64_t xCalcHADs16x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ //need to add SIMD implementation ,JCA + int k, i, j, jj, sad = 0; + int diff[128], m1[8][16], m2[8][16]; + for (k = 0; k < 128; k += 16) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + diff[k + 8] = piOrg[8] - piCur[8]; + diff[k + 9] = piOrg[9] - piCur[9]; + diff[k + 10] = piOrg[10] - piCur[10]; + diff[k + 11] = piOrg[11] - piCur[11]; + diff[k + 12] = piOrg[12] - piCur[12]; + diff[k + 13] = piOrg[13] - piCur[13]; + diff[k + 14] = piOrg[14] - piCur[14]; + diff[k + 15] = piOrg[15] - piCur[15]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) + { + jj = j << 4; + + m2[j][0] = diff[jj] + diff[jj + 8]; + m2[j][1] = diff[jj + 1] + diff[jj + 9]; + m2[j][2] = diff[jj + 2] + diff[jj + 10]; + m2[j][3] = diff[jj + 3] + diff[jj + 11]; + m2[j][4] = diff[jj + 4] + diff[jj + 12]; + m2[j][5] = diff[jj + 5] + diff[jj + 13]; + m2[j][6] = diff[jj + 6] + diff[jj + 14]; + m2[j][7] = diff[jj + 7] + diff[jj + 15]; + m2[j][8] = diff[jj] - diff[jj + 8]; + m2[j][9] = diff[jj + 1] - diff[jj + 9]; + m2[j][10] = diff[jj + 2] - diff[jj + 10]; + m2[j][11] = diff[jj + 3] - diff[jj + 11]; + m2[j][12] = diff[jj + 4] - diff[jj + 12]; + m2[j][13] = diff[jj + 5] - diff[jj + 13]; + m2[j][14] = diff[jj + 6] - diff[jj + 14]; + m2[j][15] = diff[jj + 7] - diff[jj + 15]; + + m1[j][0] = m2[j][0] + m2[j][4]; + m1[j][1] = m2[j][1] + m2[j][5]; + m1[j][2] = m2[j][2] + m2[j][6]; + m1[j][3] = m2[j][3] + m2[j][7]; + m1[j][4] = m2[j][0] - m2[j][4]; + m1[j][5] = m2[j][1] - m2[j][5]; + m1[j][6] = m2[j][2] - m2[j][6]; + m1[j][7] = m2[j][3] - m2[j][7]; + 
m1[j][8] = m2[j][8] + m2[j][12]; + m1[j][9] = m2[j][9] + m2[j][13]; + m1[j][10] = m2[j][10] + m2[j][14]; + m1[j][11] = m2[j][11] + m2[j][15]; + m1[j][12] = m2[j][8] - m2[j][12]; + m1[j][13] = m2[j][9] - m2[j][13]; + m1[j][14] = m2[j][10] - m2[j][14]; + m1[j][15] = m2[j][11] - m2[j][15]; + + m2[j][0] = m1[j][0] + m1[j][2]; + m2[j][1] = m1[j][1] + m1[j][3]; + m2[j][2] = m1[j][0] - m1[j][2]; + m2[j][3] = m1[j][1] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][6]; + m2[j][5] = m1[j][5] + m1[j][7]; + m2[j][6] = m1[j][4] - m1[j][6]; + m2[j][7] = m1[j][5] - m1[j][7]; + m2[j][8] = m1[j][8] + m1[j][10]; + m2[j][9] = m1[j][9] + m1[j][11]; + m2[j][10] = m1[j][8] - m1[j][10]; + m2[j][11] = m1[j][9] - m1[j][11]; + m2[j][12] = m1[j][12] + m1[j][14]; + m2[j][13] = m1[j][13] + m1[j][15]; + m2[j][14] = m1[j][12] - m1[j][14]; + m2[j][15] = m1[j][13] - m1[j][15]; + + m1[j][0] = m2[j][0] + m2[j][1]; + m1[j][1] = m2[j][0] - m2[j][1]; + m1[j][2] = m2[j][2] + m2[j][3]; + m1[j][3] = m2[j][2] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][5]; + m1[j][5] = m2[j][4] - m2[j][5]; + m1[j][6] = m2[j][6] + m2[j][7]; + m1[j][7] = m2[j][6] - m2[j][7]; + m1[j][8] = m2[j][8] + m2[j][9]; + m1[j][9] = m2[j][8] - m2[j][9]; + m1[j][10] = m2[j][10] + m2[j][11]; + m1[j][11] = m2[j][10] - m2[j][11]; + m1[j][12] = m2[j][12] + m2[j][13]; + m1[j][13] = m2[j][12] - m2[j][13]; + m1[j][14] = m2[j][14] + m2[j][15]; + m1[j][15] = m2[j][14] - m2[j][15]; + } + + //vertical + for (i = 0; i < 16; i++) + { + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 16; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(16.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs8x16(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[128], m1[16][8], m2[16][8]; + for (k = 0; k < 128; k += 8) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 16; j++) + { + jj = j << 3; + + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - 
m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) + { + m1[0][i] = m2[0][i] + m2[8][i]; + m1[1][i] = m2[1][i] + m2[9][i]; + m1[2][i] = m2[2][i] + m2[10][i]; + m1[3][i] = m2[3][i] + m2[11][i]; + m1[4][i] = m2[4][i] + m2[12][i]; + m1[5][i] = m2[5][i] + m2[13][i]; + m1[6][i] = m2[6][i] + m2[14][i]; + m1[7][i] = m2[7][i] + m2[15][i]; + m1[8][i] = m2[0][i] - m2[8][i]; + m1[9][i] = m2[1][i] - m2[9][i]; + m1[10][i] = m2[2][i] - m2[10][i]; + m1[11][i] = m2[3][i] - m2[11][i]; + m1[12][i] = m2[4][i] - m2[12][i]; + m1[13][i] = m2[5][i] - m2[13][i]; + m1[14][i] = m2[6][i] - m2[14][i]; + m1[15][i] = m2[7][i] - m2[15][i]; + + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + m2[8][i] = m1[8][i] + m1[12][i]; + m2[9][i] = m1[9][i] + m1[13][i]; + m2[10][i] = m1[10][i] + m1[14][i]; + m2[11][i] = m1[11][i] + m1[15][i]; + m2[12][i] = m1[8][i] - m1[12][i]; + m2[13][i] = m1[9][i] - m1[13][i]; + m2[14][i] = m1[10][i] - m1[14][i]; + m2[15][i] = m1[11][i] - m1[15][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + m1[8][i] = m2[8][i] + m2[10][i]; + m1[9][i] = m2[9][i] + m2[11][i]; + m1[10][i] = m2[8][i] - m2[10][i]; + m1[11][i] = m2[9][i] - m2[11][i]; + m1[12][i] = m2[12][i] + m2[14][i]; + m1[13][i] = m2[13][i] + m2[15][i]; + m1[14][i] = m2[12][i] - m2[14][i]; + m1[15][i] = m2[13][i] - m2[15][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + m2[8][i] = m1[8][i] + m1[9][i]; + m2[9][i] = m1[8][i] - m1[9][i]; + m2[10][i] = m1[10][i] + m1[11][i]; + m2[11][i] = m1[10][i] - m1[11][i]; + m2[12][i] = m1[12][i] + m1[13][i]; + m2[13][i] = m1[12][i] - m1[13][i]; + m2[14][i] = m1[14][i] + m1[15][i]; + m2[15][i] = m1[14][i] - m1[15][i]; + } + + for (i = 0; i < 16; i++) + { + for (j = 0; j < 8; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(16.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs4x8(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[32], m1[8][4], m2[8][4]; + for (k = 0; k < 32; k += 4) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) + { + jj = j << 2; + m2[j][0] = diff[jj] + diff[jj + 2]; + m2[j][1] = diff[jj + 1] + diff[jj + 3]; + m2[j][2] = diff[jj] - 
diff[jj + 2]; + m2[j][3] = diff[jj + 1] - diff[jj + 3]; + + m1[j][0] = m2[j][0] + m2[j][1]; + m1[j][1] = m2[j][0] - m2[j][1]; + m1[j][2] = m2[j][2] + m2[j][3]; + m1[j][3] = m2[j][2] - m2[j][3]; + } + + //vertical + for (i = 0; i < 4; i++) + { + m2[0][i] = m1[0][i] + m1[4][i]; + m2[1][i] = m1[1][i] + m1[5][i]; + m2[2][i] = m1[2][i] + m1[6][i]; + m2[3][i] = m1[3][i] + m1[7][i]; + m2[4][i] = m1[0][i] - m1[4][i]; + m2[5][i] = m1[1][i] - m1[5][i]; + m2[6][i] = m1[2][i] - m1[6][i]; + m2[7][i] = m1[3][i] - m1[7][i]; + + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + m1[4][i] = m2[4][i] + m2[6][i]; + m1[5][i] = m2[5][i] + m2[7][i]; + m1[6][i] = m2[4][i] - m2[6][i]; + m1[7][i] = m2[5][i] - m2[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) + { + for (j = 0; j < 4; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(4.0 * 8) * 2); + + return sad; +} + +static uint64_t xCalcHADs8x4(const uvg_pixel* piOrg, const uvg_pixel* piCur, int iStrideOrg, int iStrideCur) +{ + int k, i, j, jj, sad = 0; + int diff[32], m1[4][8], m2[4][8]; + for (k = 0; k < 32; k += 8) + { + diff[k + 0] = piOrg[0] - piCur[0]; + diff[k + 1] = piOrg[1] - piCur[1]; + diff[k + 2] = piOrg[2] - piCur[2]; + diff[k + 3] = piOrg[3] - piCur[3]; + diff[k + 4] = piOrg[4] - piCur[4]; + diff[k + 5] = piOrg[5] - piCur[5]; + diff[k + 6] = piOrg[6] - piCur[6]; + diff[k + 7] = piOrg[7] - piCur[7]; + + piCur += iStrideCur; + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 4; j++) + { + jj = j << 3; + + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) + { + m1[0][i] = m2[0][i] + m2[2][i]; + m1[1][i] = m2[1][i] + m2[3][i]; + m1[2][i] = m2[0][i] - m2[2][i]; + m1[3][i] = m2[1][i] - m2[3][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + } + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 8; j++) + { + sad += abs(m2[i][j]); + } + } + + sad -= abs(m2[0][0]); + sad += abs(m2[0][0]) >> 2; + sad = (int)(sad / sqrt(4.0 * 8) * 2); + + return sad; +} + + +static uint64_t xGetHADs(int width, int height, const uvg_pixel* ref_in, int ref_stride, const uvg_pixel* pred_in, int pred_stride) +{ + const uvg_pixel* piOrg = ref_in; + const uvg_pixel* piCur = pred_in; + const int iRows = height; + 
const int iCols = width; + const int iStrideOrg = ref_stride; + const int iStrideCur = pred_stride; + + int x = 0, y = 0; + + uint64_t uiSum = 0; + + if (iCols > iRows && (iRows & 7) == 0 && (iCols & 15) == 0) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 16) + { + uiSum += xCalcHADs16x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 8; + piCur += iStrideCur * 8; + } + } + else if (iCols < iRows && (iCols & 7) == 0 && (iRows & 15) == 0) + { + for (y = 0; y < iRows; y += 16) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += xCalcHADs8x16(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 16; + piCur += iStrideCur * 16; + } + } + else if (iCols > iRows && (iRows & 3) == 0 && (iCols & 7) == 0) + { + for (y = 0; y < iRows; y += 4) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += xCalcHADs8x4(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 4; + piCur += iStrideCur * 4; + } + } + else if (iCols < iRows && (iCols & 3) == 0 && (iRows & 7) == 0) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 4) + { + uiSum += xCalcHADs4x8(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += iStrideOrg * 8; + piCur += iStrideCur * 8; + } + } + else if ((iRows % 8 == 0) && (iCols % 8 == 0)) + { + for (y = 0; y < iRows; y += 8) + { + for (x = 0; x < iCols; x += 8) + { + uiSum += satd_8x8_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur); + } + piOrg += 8 * iStrideOrg; + piCur += 8 * iStrideCur; + } + } + else if ((iRows % 4 == 0) && (iCols % 4 == 0)) + { + for (y = 0; y < iRows; y += 4) + { + for (x = 0; x < iCols; x += 4) + { + uiSum += uvg_satd_4x4_subblock_generic(&piOrg[x], iStrideOrg, &piCur[x], iStrideCur); + } + piOrg += 4 * iStrideOrg; + piCur += 4 * iStrideCur; + } + } + else if ((iRows % 2 == 0) && (iCols % 2 == 0)) + { + for (y = 0; y < iRows; y += 2) + { + for (x = 0; x < iCols; x += 2) + { + uiSum += xCalcHADs2x2(&piOrg[x], &piCur[x], iStrideOrg, iStrideCur); + } + piOrg += 2 * iStrideOrg; + piCur += 2 * iStrideCur; + } + } + + // TODO: 10 bit + return (uiSum >> 0); +} + +
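Aside: xGetHADs above mirrors the VTM rate-distortion helper of the same name. It tiles the block with the largest Hadamard kernel whose dimensions divide the block, preferring the rectangular 16x8/8x16 and 8x4/4x8 kernels when width and height differ, then falling back to the square 8x8, 4x4 and 2x2 kernels; the rectangular kernels down-weight the DC term and rescale their transform sum by 2/sqrt(rows*cols) so costs stay comparable across kernel shapes. Below is a minimal caller sketch, assuming only that uvg_strategyselector_init() has filled in the satd_any_size_vtm pointer (registered further below) and that the strategy keeps xGetHADs' (width, height, ref, ref_stride, pred, pred_stride) argument order; the wrapper name is hypothetical:

/* Hypothetical wrapper: Hadamard SATD of one rectangular block through
 * the strategy pointer registered as "satd_any_size_vtm". */
static unsigned example_had_cost(int width, int height,
                                 const uvg_pixel *orig, int orig_stride,
                                 const uvg_pixel *pred, int pred_stride)
{
  /* E.g. a 32x8 block takes the first branch above and is covered by two
   * xCalcHADs16x8 tiles; an 8x8 block falls through to the plain 8x8 SATD. */
  return uvg_satd_any_size_vtm(width, height, orig, orig_stride, pred, pred_stride);
}

// Function macro for defining SAD calculating functions // for fixed size blocks.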
#define SAD_NXN(n, pixel_type) \ @@ -539,12 +1111,12 @@ SAD_DUAL_NXN(64, uvg_pixel) static unsigned pixels_calc_ssd_generic(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, - const int width) + const int width, const int height) { int ssd = 0; int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride]; ssd += diff * diff; @@ -783,10 +1355,10 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len) static void generate_residual_generic(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, - int width, int ref_stride, int pred_stride) + int width, int height, int ref_stride, int pred_stride) { int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]); } @@ -897,6 +1469,7 @@ int uvg_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic); success &= uvg_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic); + success &= uvg_strategyselector_register(opaque, "satd_any_size_vtm", "generic", 0, &xGetHADs); success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic); diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 96d2567a..e39b6c52 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -44,7 +44,6 @@ #include "fast_coeff_cost.h" #include "reshape.h" -#define QUANT_SHIFT 14 /** * \brief quantize transformed coefficents * @@ -62,22 +61,28 @@ void uvg_quant_generic( uint8_t lfnst_idx) { const encoder_control_t * const encoder = state->encoder_control; - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t * const scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t * const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - uint32_t log2_tr_width = uvg_math_floor_log2(height); - uint32_t log2_tr_height = uvg_math_floor_log2(width); + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size + const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color; const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6]; - const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform - const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 
0 : transform_shift); + const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform + const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift ); const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; + const int32_t default_quant_coeff = uvg_g_quant_scales[needs_block_size_trafo_scale][qp_scaled % 6]; + uint32_t ac_sum = 0; + const bool use_scaling_list = state->encoder_control->cfg.scaling_list != UVG_SCALING_LIST_OFF; + if(lfnst_idx == 0){ for (int32_t n = 0; n < width * height; n++) { int32_t level = coef[n]; @@ -86,7 +91,7 @@ void uvg_quant_generic( sign = (level < 0 ? -1 : 1); - int32_t curr_quant_coeff = quant_coeff[n]; + int32_t curr_quant_coeff = use_scaling_list ? quant_coeff[n] : default_quant_coeff; level = (int32_t)((abs_level * curr_quant_coeff + add) >> q_bits); ac_sum += level; @@ -237,6 +242,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -247,28 +253,28 @@ int uvg_quant_cbcr_residual_generic( uvg_pixel* v_rec_out, coeff_t* coeff_out, bool early_skip, - int lmcs_chroma_adj, enum uvg_tree_type tree_type - ) { + int lmcs_chroma_adj, enum uvg_tree_type tree_type) +{ ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; - + // TODO: this function is not fully converted to handle non-square blocks { int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]); v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]); } } } - uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride); - uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride); + uvg_generate_residual(u_ref_in, u_pred_in, u_residual, width, height, in_stride, in_stride); + uvg_generate_residual(v_ref_in, v_pred_in, v_residual, width, height, in_stride, in_stride); const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1); - for (int y = 0; y < width; y++) + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { @@ -305,33 +311,44 @@ int uvg_quant_cbcr_residual_generic( } - uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); - if(cur_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + uvg_transform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uint8_t lfnst_idx = tree_type == UVG_CHROMA_T ? 
cur_cu->cr_lfnst_idx : cur_cu->lfnst_idx; + if(lfnst_idx) { + uvg_fwd_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } - - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!false && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, - cur_cu->cr_lfnst_idx); + uvg_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + scan_order, cur_cu->type, cur_cu->cbf, lfnst_idx, 0); } else if (state->encoder_control->cfg.rdoq_enable && false) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, - scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, cur_cu->lfnst_idx); + uvg_quant(state, coeff, coeff_out, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false, lfnst_idx); } int8_t has_coeffs = 0; { int i; - for (i = 0; i < width * width; ++i) { + for (i = 0; i < width * height; ++i) { if (coeff_out[i] != 0) { has_coeffs = 1; break; @@ -342,13 +359,13 @@ int uvg_quant_cbcr_residual_generic( if (has_coeffs && !early_skip) { // Get quantized residual. (coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, + uvg_dequant(state, coeff_out, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); - if (cur_cu->cr_lfnst_idx) { - uvg_inv_lfnst(cur_cu, width, width, COLOR_UV, cur_cu->cr_lfnst_idx, coeff, tree_type); + if (lfnst_idx) { + uvg_inv_lfnst(cur_cu, width, height, COLOR_UV, lfnst_idx, coeff, tree_type, state->collocated_luma_mode); } - uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); + uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, height, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu); //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { @@ -371,7 +388,7 @@ int uvg_quant_cbcr_residual_generic( //} const int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1); // Get quantized reconstruction. 
(residual + pred_in -> rec_out) - for (int y = 0; y < width; y++) { + for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { if (temp == 2) { u_residual[x + y * width] = combined_residual[x + y * width]; @@ -400,7 +417,7 @@ int uvg_quant_cbcr_residual_generic( } } } - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride]; u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val); @@ -413,7 +430,7 @@ int uvg_quant_cbcr_residual_generic( // With no coeffs and rec_out == pred_int we skip copying the coefficients // because the reconstruction is just the prediction. - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride]; v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride]; @@ -441,7 +458,7 @@ int uvg_quant_cbcr_residual_generic( * \returns Whether coeff_out contains any non-zero coefficients. */ int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -454,19 +471,19 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int has_coeffs = 0; - assert(width <= TR_MAX_WIDTH); - assert(width >= TR_MIN_WIDTH); - - const int height = width; // TODO: height for non-square blocks + // With ISP these checks no longer apply, since width and height 2 is now possible + // With MTT even 1x16 and 16x1 ISP splits are possible + //assert(width <= TR_MAX_WIDTH && height <= TR_MAX_WIDTH); + //assert(width >= TR_MIN_WIDTH && height >= TR_MIN_WIDTH); // Get residual. (ref_in - pred_in -> residual) - uvg_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride); + uvg_generate_residual(ref_in, pred_in, residual, width, height, in_stride, in_stride); if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { sign = residual[x + y * width] >= 0 ? 1 : -1; absval = sign * residual[x + y * width]; @@ -477,43 +494,54 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // Transform residual. (residual -> coeff) if (use_trskip) { - uvg_transformskip(state->encoder_control, residual, coeff, width); + uvg_transformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_transform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } - const uint8_t lfnst_index = color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; + const uint8_t lfnst_index = tree_type != UVG_CHROMA_T || color == COLOR_Y ? cur_cu->lfnst_idx : cur_cu->cr_lfnst_idx; if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Forward low frequency non-separable transform - uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_fwd_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } // Quantize coeffs. 
(coeff -> coeff_out) - if (state->encoder_control->cfg.rdoq_enable && + int abs_sum = 0; + if (!use_trskip && state->encoder_control->cfg.dep_quant) { + uvg_dep_quant( + state, + cur_cu, + width, + height, + coeff, + coeff_out, + color, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list); + } + else if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { - int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - uvg_rdoq(state, coeff, coeff_out, width, width, color, - scan_order, cur_cu->type, tr_depth, cur_cu->cbf, - lfnst_index); + uvg_rdoq(state, coeff, coeff_out, width, height, color, + scan_order, cur_cu->type, cur_cu->cbf, lfnst_index, color == 0 ? cur_cu->tr_idx : 0); } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) { - uvg_ts_rdoq(state, coeff, coeff_out, width, width, color, + uvg_ts_rdoq(state, coeff, coeff_out, width, height, color, scan_order); } else { - uvg_quant(state, coeff, coeff_out, width, width, color, + uvg_quant(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y, lfnst_index); } // Check if there are any non-zero coefficients. { int i; - for (i = 0; i < width * width; ++i) { + for (i = 0; i < width * height; ++i) { if (coeff_out[i] != 0) { has_coeffs = 1; break; @@ -527,25 +555,25 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, int y, x; // Get quantized residual. (coeff_out -> coeff -> residual) - uvg_dequant(state, coeff_out, coeff, width, width, color, + uvg_dequant(state, coeff_out, coeff, width, height, color, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y); if (state->encoder_control->cfg.lfnst && cur_cu->type == CU_INTRA) { // Inverse low frequency non-separable transform - uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type); + uvg_inv_lfnst(cur_cu, width, height, color, lfnst_index, coeff, tree_type, state->collocated_luma_mode); } if (use_trskip) { - uvg_itransformskip(state->encoder_control, residual, coeff, width); + uvg_itransformskip(state->encoder_control, residual, coeff, width, height); } else { - uvg_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu); + uvg_itransform2d(state->encoder_control, residual, coeff, width, height, color, cur_cu); } if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { int y, x; int sign, absval; int maxAbsclipBD = (1 << UVG_BIT_DEPTH) - 1; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]); sign = residual[x + y * width] >= 0 ? 1 : -1; @@ -561,7 +589,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, } // Get quantized reconstruction. (residual + pred_in -> rec_out) - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int16_t val = residual[x + y * width] + pred_in[x + y * in_stride]; rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, val); @@ -573,7 +601,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, // because the reconstruction is just the prediction. 
int y, x; - for (y = 0; y < width; ++y) { + for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { rec_out[x + y * out_stride] = pred_in[x + y * in_stride]; } @@ -590,23 +618,29 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip) { const encoder_control_t * const encoder = state->encoder_control; + if(encoder->cfg.dep_quant && !transform_skip) { + uvg_dep_quant_dequant(state, block_type, width, height, color, q_coef, coef, encoder->cfg.scaling_list); + return; + } int32_t shift,add,coeff_q; int32_t n; - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height)) >> 1); // Represents scaling through forward transform + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform + bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); + needs_block_size_trafo_scale |= 0; // Non log2 block size int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]); qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled; - shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift); + shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale); if (encoder->scaling_list.enable) { - uint32_t log2_tr_width = uvg_math_floor_log2(height) + 2; - uint32_t log2_tr_height = uvg_math_floor_log2(width) + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)(color); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6]; + const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled%6]; shift += 4; if (shift >qp_scaled / 6) { @@ -624,10 +658,10 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c } } } else { - int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6); + int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6); add = 1 << (shift-1); - for (n = 0; n < width*height; n++) { + for (n = 0; n < width * height; n++) { coeff_q = (q_coef[n] * scale + add) >> shift; coef[n] = (coeff_t)CLIP(-32768, 32767, coeff_q); } @@ -651,14 +685,15 @@ static INLINE void get_coeff_weights(uint64_t wts_packed, uint16_t *weights) weights[3] = (wts_packed >> 48) & 0xffff; } -static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, uint64_t weights) +static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights) { + assert((width == height) && "Non-square block handling not implemented for this function."); uint32_t sum = 0; uint16_t weights_unpacked[4]; get_coeff_weights(weights, weights_unpacked); - for (int32_t i = 0; i < width * width; i++) { + for (int32_t i = 0; i < width * height; i++) { int16_t curr = coeff[i]; uint32_t curr_abs = abs(curr); if (curr_abs > 3) { diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h index da2b05ae..665e0863 100644 --- a/src/strategies/generic/quant-generic.h +++ b/src/strategies/generic/quant-generic.h @@ -44,8 +44,6 @@ #include "uvg266.h" #include "tables.h" -#define QUANT_SHIFT 14 - int uvg_strategy_register_quant_generic(void* opaque, uint8_t bitdepth); void uvg_quant_generic( const encoder_state_t * const state, @@ -60,7 +58,7 @@ void uvg_quant_generic( uint8_t lfnst_idx); int uvg_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -71,6 +69,7 @@ int uvg_quant_cbcr_residual_generic( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, diff --git a/src/strategies/strategies-dct.c b/src/strategies/strategies-dct.c index 4ba2a37b..64b72eb9 100644 --- a/src/strategies/strategies-dct.c +++ b/src/strategies/strategies-dct.c @@ -44,6 +44,7 @@ dct_func * uvg_dct_4x4 = 0; dct_func * uvg_dct_8x8 = 0; dct_func * uvg_dct_16x16 = 0; dct_func * uvg_dct_32x32 = 0; +dct_func * uvg_dct_non_square = 0; dct_func * uvg_fast_inverse_dst_4x4 = 0; @@ -56,16 +57,19 @@ void(*uvg_mts_dct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, - const int8_t mts_idx); + const int8_t mts_type); + void(*uvg_mts_idct)(int8_t bitdepth, color_t color, const cu_info_t *tu, int8_t width, + int8_t height, const int16_t *input, int16_t *output, - const int8_t mts_idx); + const int8_t mts_type); int uvg_strategy_register_dct(void* opaque, uint8_t 
bitdepth) { @@ -90,8 +94,13 @@ int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth) { * * \returns Pointer to the function. */ -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. Return generic dct for non-square blocks. + assert(false && "This should never be called at this point. Non-square stuff is done inside mts_dct function."); + //return uvg_dct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { @@ -119,8 +128,13 @@ dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type) * * \returns Pointer to the function. */ -dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type) +dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type) { + if (width != height) { + // Non-square block. Return generic dct for non-square blocks. + assert(false && "This should never be called at this point. Non-square stuff is done inside mts_idct function."); + //return uvg_idct_non_square; + } switch (width) { case 4: //if (color == COLOR_Y && type == CU_INTRA) { diff --git a/src/strategies/strategies-dct.h b/src/strategies/strategies-dct.h index d58bf5a9..0ad3c8c4 100644 --- a/src/strategies/strategies-dct.h +++ b/src/strategies/strategies-dct.h @@ -51,6 +51,7 @@ extern dct_func * uvg_dct_4x4; extern dct_func * uvg_dct_8x8; extern dct_func * uvg_dct_16x16; extern dct_func * uvg_dct_32x32; +extern dct_func * uvg_dct_non_square; extern dct_func * uvg_fast_inverse_dst_4x4; @@ -64,9 +65,10 @@ typedef void (mts_dct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx); + const int8_t mts_type); extern mts_dct_func* uvg_mts_dct; @@ -75,15 +77,16 @@ typedef void (mts_idct_func)( color_t color, const cu_info_t* tu, int8_t width, + int8_t height, const int16_t* input, int16_t* output, - const int8_t mts_idx); + const int8_t mts_type); extern mts_idct_func* uvg_mts_idct; int uvg_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * uvg_get_dct_func(int8_t width, color_t color, cu_type_t type); -dct_func * uvg_get_idct_func(int8_t width, color_t color, cu_type_t type); +dct_func * uvg_get_dct_func(int8_t width, int8_t height, color_t color, cu_type_t type); +dct_func * uvg_get_idct_func(int8_t width, int8_t height, color_t color, cu_type_t type); diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c new file mode 100644 index 00000000..d0eac087 --- /dev/null +++ b/src/strategies/strategies-depquant.c @@ -0,0 +1,55 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution.
+ * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#include "strategies/strategies-depquant.h" + +#include "strategies/avx2/depquant-avx2.h" +#include "strategies/generic/depquant-generic.h" +#include "strategyselector.h" + + +// Define function pointers. +dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; + + +int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= uvg_strategy_register_depquant_generic(opaque, bitdepth); + + if (uvg_g_hardware_flags.intel_flags.avx2) { + success &= uvg_strategy_register_depquant_avx2(opaque, bitdepth); + } + return success; +} diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h new file mode 100644 index 00000000..5a58a3c7 --- /dev/null +++ b/src/strategies/strategies-depquant.h @@ -0,0 +1,88 @@ +#ifndef STRATEGIES_DEPQUANT_H_ +#define STRATEGIES_DEPQUANT_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for dependent quantization (dep quant) functions. + */ + +#include "encoder.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "dep_quant.h" + + +// Declare function pointer types. +typedef int(dep_quant_decide_and_update_func)( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma); + +typedef void (find_first_non_zero_coeff_func)( + const coeff_t* srcCoeff, + const bool enableScalingLists, + const context_store* const dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + int width, + int height); + + +// Declare function pointers. +extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; + +int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth); + + +#define STRATEGIES_DEPQUANT_EXPORTS \ + {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \ + {"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \ + + + +#endif //STRATEGIES_DEPQUANT_H_ diff --git a/src/strategies/strategies-encode.h b/src/strategies/strategies-encode.h index 8743a6ed..969dfb57 100644 --- a/src/strategies/strategies-encode.h +++ b/src/strategies/strategies-encode.h @@ -49,7 +49,7 @@ typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, cabac_data_t * const cabac, const coeff_t *coeff, - uint8_t width, + const cu_loc_t * const loc, uint8_t color, int8_t scan_mode, cu_info_t* cur_cu, diff --git a/src/strategies/strategies-intra.h b/src/strategies/strategies-intra.h index 0f7228a0..52f5e519 100644 --- a/src/strategies/strategies-intra.h +++ b/src/strategies/strategies-intra.h @@ -38,22 +38,26 @@ * Interface for intra prediction functions.
*/ +#include "cu.h" #include "global.h" // IWYU pragma: keep #include "intra.h" #include "uvg266.h" typedef void (angular_pred_func)( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, const int_fast8_t intra_mode, const int_fast8_t channel_type, const uvg_pixel *const in_ref_above, const uvg_pixel *const in_ref_left, uvg_pixel *const dst, - const uint8_t multi_ref_idx); + const uint8_t multi_ref_idx, + const uint8_t isp_mode, + const int cu_dim); typedef void (intra_pred_planar_func)( - const int_fast8_t log2_width, + const cu_loc_t* const cu_loc, + color_t color, const uvg_pixel *const ref_top, const uvg_pixel *const ref_left, uvg_pixel *const dst); @@ -67,8 +71,8 @@ typedef void (intra_pred_filtered_dc_func)( typedef void (pdpc_planar_dc_func)( const int mode, - const int width, - const int log2_width, + const cu_loc_t* const cu_loc, + const color_t color, const uvg_intra_ref *const used_ref, uvg_pixel *const dst); diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c index 00ad9ccb..649af2d6 100644 --- a/src/strategies/strategies-picture.c +++ b/src/strategies/strategies-picture.c @@ -37,6 +37,7 @@ #include "strategies/generic/picture-generic.h" #include "strategies/sse2/picture-sse2.h" #include "strategies/sse41/picture-sse41.h" +#include "strategies/sse42/picture-sse42.h" #include "strategyselector.h" @@ -70,6 +71,7 @@ cost_pixel_nxn_multi_func * uvg_satd_32x32_dual = 0; cost_pixel_nxn_multi_func * uvg_satd_64x64_dual = 0; cost_pixel_any_size_func * uvg_satd_any_size = 0; +cost_pixel_any_size_func * uvg_satd_any_size_vtm = 0; cost_pixel_any_size_multi_func * uvg_satd_any_size_quad = 0; pixels_calc_ssd_func * uvg_pixels_calc_ssd = 0; @@ -115,103 +117,116 @@ int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth) { /** * \brief Get a function that calculates SATD for NxN block. * -* \param n Width of the region for which SATD is calculated. +* \param width Width of the region for which SATD is calculated. * * \returns Pointer to cost_16bit_nxn_func. */ -cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned n) +cost_pixel_nxn_func * uvg_pixels_get_satd_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_satd_4x4; - case 8: - return uvg_satd_8x8; - case 16: - return uvg_satd_16x16; - case 32: - return uvg_satd_32x32; - case 64: - return uvg_satd_64x64; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_satd_4x4; + case 8: + return uvg_satd_8x8; + case 16: + return uvg_satd_16x16; + case 32: + return uvg_satd_32x32; + case 64: + return uvg_satd_64x64; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SAD for NxN block. * -* \param n Width of the region for which SAD is calculated. +* \param width Width of the region for which SAD is calculated. * * \returns Pointer to cost_16bit_nxn_func. 
*/ -cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned n) +cost_pixel_nxn_func * uvg_pixels_get_sad_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_sad_4x4; - case 8: - return uvg_sad_8x8; - case 16: - return uvg_sad_16x16; - case 32: - return uvg_sad_32x32; - case 64: - return uvg_sad_64x64; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_sad_4x4; + case 8: + return uvg_sad_8x8; + case 16: + return uvg_sad_16x16; + case 32: + return uvg_sad_32x32; + case 64: + return uvg_sad_64x64; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SATDs for 2 NxN blocks. * -* \param n Width of the region for which SATD is calculated. +* \param width Width of the region for which SATD is calculated. +* \param height Height of the region for which SATD is calculated. * * \returns Pointer to cost_pixel_nxn_multi_func. */ -cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n) +cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_satd_4x4_dual; - case 8: - return uvg_satd_8x8_dual; - case 16: - return uvg_satd_16x16_dual; - case 32: - return uvg_satd_32x32_dual; - case 64: - return uvg_satd_64x64_dual; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_satd_4x4_dual; + case 8: + return uvg_satd_8x8_dual; + case 16: + return uvg_satd_16x16_dual; + case 32: + return uvg_satd_32x32_dual; + case 64: + return uvg_satd_64x64_dual; + default: + return NULL; + } } + return NULL; } /** * \brief Get a function that calculates SADs for 2 NxN blocks. * -* \param n Width of the region for which SAD is calculated. +* \param width Width of the region for which SAD is calculated. * * \returns Pointer to cost_pixel_nxn_multi_func. 
*/ -cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n) +cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height) { - switch (n) { - case 4: - return uvg_sad_4x4_dual; - case 8: - return uvg_sad_8x8_dual; - case 16: - return uvg_sad_16x16_dual; - case 32: - return uvg_sad_32x32_dual; - case 64: - return uvg_sad_64x64_dual; - default: - return NULL; + if(width == height) { + switch (width) { + case 4: + return uvg_sad_4x4_dual; + case 8: + return uvg_sad_8x8_dual; + case 16: + return uvg_sad_16x16_dual; + case 32: + return uvg_sad_32x32_dual; + case 64: + return uvg_sad_64x64_dual; + default: + return NULL; + } } + return NULL; } // Precomputed CRC32C lookup table for polynomial 0x04C11DB7 diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h index 88f52cfc..cd4e2ec5 100644 --- a/src/strategies/strategies-picture.h +++ b/src/strategies/strategies-picture.h @@ -124,7 +124,7 @@ typedef unsigned (cost_pixel_any_size_func)( typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const uvg_pixel *orig, unsigned num_modes, unsigned *costs_out); typedef void (cost_pixel_any_size_multi_func)(int width, int height, const uvg_pixel **preds, const int stride, const uvg_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); -typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width); +typedef unsigned (pixels_calc_ssd_func)(const uvg_pixel *const ref, const uvg_pixel *const rec, const int ref_stride, const int rec_stride, const int width, const int height); typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); typedef uint32_t (ver_sad_func)(const uvg_pixel *pic_data, const uvg_pixel *ref_data, int32_t block_width, int32_t block_height, @@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu, typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len); -typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride); +typedef void (generate_residual_func)(const uvg_pixel* ref_in, const uvg_pixel* pred_in, int16_t* residual, int width, int height, int ref_stride, int pred_stride); extern const uint32_t uvg_crc_table[256]; @@ -175,6 +175,7 @@ extern cost_pixel_nxn_func * uvg_satd_16x16; extern cost_pixel_nxn_func * uvg_satd_32x32; extern cost_pixel_nxn_func * uvg_satd_64x64; extern cost_pixel_any_size_func *uvg_satd_any_size; +extern cost_pixel_any_size_func *uvg_satd_any_size_vtm; extern cost_pixel_nxn_multi_func * uvg_sad_4x4_dual; extern cost_pixel_nxn_multi_func * uvg_sad_8x8_dual; @@ -203,8 +204,8 @@ extern pixel_var_func *uvg_pixel_var; extern generate_residual_func* uvg_generate_residual; int uvg_strategy_register_picture(void* opaque, uint8_t bitdepth); -cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned n); -cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); +cost_pixel_nxn_multi_func * uvg_pixels_get_satd_dual_func(unsigned width, unsigned height); +cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned width, unsigned height); #define STRATEGIES_PICTURE_EXPORTS \ {"crc32c_4x4", (void**) &uvg_crc32c_4x4}, \ @@ -221,6 +222,7 @@ cost_pixel_nxn_multi_func * uvg_pixels_get_sad_dual_func(unsigned n); {"satd_32x32", (void**) &uvg_satd_32x32}, \ {"satd_64x64", (void**) &uvg_satd_64x64}, \ 
{"satd_any_size", (void**) &uvg_satd_any_size}, \ + {"satd_any_size_vtm", (void**) &uvg_satd_any_size_vtm}, \ {"sad_4x4_dual", (void**) &uvg_sad_4x4_dual}, \ {"sad_8x8_dual", (void**) &uvg_sad_8x8_dual}, \ {"sad_16x16_dual", (void**) &uvg_sad_16x16_dual}, \ diff --git a/src/strategies/strategies-quant.c b/src/strategies/strategies-quant.c index 89baf86e..62c75d6f 100644 --- a/src/strategies/strategies-quant.c +++ b/src/strategies/strategies-quant.c @@ -38,15 +38,16 @@ // Define function pointers. -quant_func *uvg_quant; -quant_cbcr_func *uvg_quant_cbcr_residual; -quant_residual_func *uvg_quantize_residual; -dequant_func *uvg_dequant; -coeff_abs_sum_func *uvg_coeff_abs_sum; +quant_func *uvg_quant; +quant_cbcr_func *uvg_quant_cbcr_residual; +quant_residual_func *uvg_quantize_residual; +dequant_func *uvg_dequant; +coeff_abs_sum_func *uvg_coeff_abs_sum; fast_coeff_cost_func *uvg_fast_coeff_cost; -int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) { +int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth) +{ bool success = true; success &= uvg_strategy_register_quant_generic(opaque, bitdepth); diff --git a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h index a6c9a3d4..b0e75046 100644 --- a/src/strategies/strategies-quant.h +++ b/src/strategies/strategies-quant.h @@ -45,12 +45,23 @@ #include "tables.h" // Declare function pointers. -typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip, uint8_t lfnst_idx); +typedef unsigned (quant_func)( + const encoder_state_t * const state, + coeff_t *coef, + coeff_t *q_coef, + int32_t width, + int32_t height, + color_t color, + int8_t scan_idx, + int8_t block_type, + int8_t transform_skip, + uint8_t lfnst_idx); + typedef unsigned (quant_cbcr_func)( encoder_state_t* const state, const cu_info_t* const cur_cu, const int width, + const int height, const coeff_scan_order_t scan_order, const int in_stride, const int out_stride, const uvg_pixel* const u_ref_in, @@ -63,16 +74,19 @@ typedef unsigned (quant_cbcr_func)( bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (quant_residual_func)(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, uvg_pixel *rec_out, coeff_t *coeff_out, bool early_skip, int lmcs_chroma_adj, enum uvg_tree_type tree_type); + typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, color_t color, int8_t block_type, int8_t transform_skip); -typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights); + +typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t height, uint64_t weights); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); diff --git a/src/strategyselector.c b/src/strategyselector.c index 477604a9..d6dffa4e 100644 --- a/src/strategyselector.c +++ b/src/strategyselector.c @@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) { fprintf(stderr, "uvg_strategy_register_encode failed!\n"); return 0; } + if 
(!uvg_strategy_register_depquant(&strategies, bitdepth)) { + fprintf(stderr, "uvg_strategy_register_depquant failed!\n"); + return 0; + } while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); diff --git a/src/strategyselector.h b/src/strategyselector.h index caadfda9..8bbdfbed 100644 --- a/src/strategyselector.h +++ b/src/strategyselector.h @@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st #include "strategies/strategies-intra.h" #include "strategies/strategies-sao.h" #include "strategies/strategies-encode.h" +#include "strategies/strategies-depquant.h" #include "strategies/strategies-alf.h" static const strategy_to_select_t strategies_to_select[] = { @@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = { STRATEGIES_SAO_EXPORTS STRATEGIES_ENCODE_EXPORTS STRATEGIES_ALF_EXPORTS + STRATEGIES_DEPQUANT_EXPORTS { NULL, NULL }, }; diff --git a/src/tables.c b/src/tables.c index 422fd714..c98ecf79 100644 --- a/src/tables.c +++ b/src/tables.c @@ -7,6 +7,8 @@ #endif // 4 8 16 32 64 const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1] = {-1, -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4}; +// 0 1 2 4 8 16 32 64 +const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2] = //===== luma/chroma ===== @@ -82,3 +84,2543 @@ const uint32_t* const uvg_g_sig_last_scan[3][5] = { {g_sig_last_scan_1_0, g_sig_last_scan_1_1, g_sig_last_scan_1_2, g_sig_last_scan_1_3, g_sig_last_scan_1_4}, {g_sig_last_scan_2_0, g_sig_last_scan_2_1, g_sig_last_scan_2_2, g_sig_last_scan_2_3, g_sig_last_scan_2_4} }; + +// Holds scan order indices for all possible block sizes for diagonal scan order and coefficient group scan order +static const uint32_t const g_scan_order_buffer[32258] = { + 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, // UNGROUPED 1xN, 1x2, 1x4, 1x8 + 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // 1x16 + 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 1x32 + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, // 1x64 + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 0, 1, 0, // 2xN, 2x2 + 2, 1, 3, 0, 2, 1, 4, 3, 6, 5, 7, 0, 2, // 2x4, 2x8 + 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, + 15, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, // 2x16 + 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23, + 26, 25, 28, 27, 30, 29, 31, 0, 2, 1, 4, 3, 6, // 2x32 + 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, + 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, + 31, 34, 33, 36, 35, 38, 37, 40, 39, 42, 41, 44, 43, + 46, 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, + 57, 60, 59, 62, 61, 63, 0, 2, 1, 4, 3, 6, 5, // 2x64 + 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, + 19, 22, 21, 24, 23, 26, 25, 28, 27, 30, 29, 32, 31, + 34, 33, 
36, 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, + 45, 48, 47, 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, + 60, 59, 62, 61, 64, 63, 66, 65, 68, 67, 70, 69, 72, + 71, 74, 73, 76, 75, 78, 77, 80, 79, 82, 81, 84, 83, + 86, 85, 88, 87, 90, 89, 92, 91, 94, 93, 96, 95, 98, + 97, 100, 99, 102, 101, 104, 103, 106, 105, 108, 107, 110, 109, + 112, 111, 114, 113, 116, 115, 118, 117, 120, 119, 122, 121, 124, + 123, 126, 125, 127, 0, 1, 2, 3, 0, 4, 1, 5, 2, // 4xN, 4x2 + 6, 3, 7, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, // 4x4 + 13, 10, 7, 14, 11, 15, 0, 4, 1, 8, 5, 2, 12, // 4x8 + 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, 11, 24, 21, + 18, 15, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 0, // 4x16 + 4, 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, + 20, 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, + 29, 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, + 38, 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, + 47, 60, 57, 54, 51, 61, 58, 55, 62, 59, 63, 0, 4, // 4x32 + 1, 8, 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, + 17, 14, 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, + 26, 23, 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, + 35, 48, 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, 47, + 60, 57, 54, 51, 64, 61, 58, 55, 68, 65, 62, 59, 72, + 69, 66, 63, 76, 73, 70, 67, 80, 77, 74, 71, 84, 81, + 78, 75, 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, + 87, 100, 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, + 112, 109, 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, + 121, 118, 115, 125, 122, 119, 126, 123, 127, 0, 4, 1, 8, // 4x64 + 5, 2, 12, 9, 6, 3, 16, 13, 10, 7, 20, 17, 14, + 11, 24, 21, 18, 15, 28, 25, 22, 19, 32, 29, 26, 23, + 36, 33, 30, 27, 40, 37, 34, 31, 44, 41, 38, 35, 48, + 45, 42, 39, 52, 49, 46, 43, 56, 53, 50, 47, 60, 57, + 54, 51, 64, 61, 58, 55, 68, 65, 62, 59, 72, 69, 66, + 63, 76, 73, 70, 67, 80, 77, 74, 71, 84, 81, 78, 75, + 88, 85, 82, 79, 92, 89, 86, 83, 96, 93, 90, 87, 100, + 97, 94, 91, 104, 101, 98, 95, 108, 105, 102, 99, 112, 109, + 106, 103, 116, 113, 110, 107, 120, 117, 114, 111, 124, 121, 118, + 115, 128, 125, 122, 119, 132, 129, 126, 123, 136, 133, 130, 127, + 140, 137, 134, 131, 144, 141, 138, 135, 148, 145, 142, 139, 152, + 149, 146, 143, 156, 153, 150, 147, 160, 157, 154, 151, 164, 161, + 158, 155, 168, 165, 162, 159, 172, 169, 166, 163, 176, 173, 170, + 167, 180, 177, 174, 171, 184, 181, 178, 175, 188, 185, 182, 179, + 192, 189, 186, 183, 196, 193, 190, 187, 200, 197, 194, 191, 204, + 201, 198, 195, 208, 205, 202, 199, 212, 209, 206, 203, 216, 213, + 210, 207, 220, 217, 214, 211, 224, 221, 218, 215, 228, 225, 222, + 219, 232, 229, 226, 223, 236, 233, 230, 227, 240, 237, 234, 231, + 244, 241, 238, 235, 248, 245, 242, 239, 252, 249, 246, 243, 253, + 250, 247, 254, 251, 255, 0, 1, 2, 3, 4, 5, 6, 7, // 8xN + 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, // 8x2 + 14, 7, 15, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, // 8x4 + 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, + 21, 14, 7, 29, 22, 15, 30, 23, 31, 0, 8, 1, 16, // 8x8 + 9, 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, + 26, 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, + 42, 35, 28, 21, 14, 7, 57, 50, 43, 36, 29, 22, 15, + 58, 51, 44, 37, 30, 23, 59, 52, 45, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, 0, 8, 1, 16, 9, // 8x16 + 2, 24, 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, + 19, 12, 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, + 35, 28, 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, + 72, 65, 58, 51, 44, 37, 30, 23, 80, 73, 66, 59, 52, + 45, 38, 31, 88, 81, 74, 67, 60, 53, 46, 39, 96, 89, + 82, 75, 68, 61, 54, 47, 104, 97, 90, 83, 76, 69, 62, + 55, 112, 105, 98, 91, 84, 
77, 70, 63, 120, 113, 106, 99, + 92, 85, 78, 71, 121, 114, 107, 100, 93, 86, 79, 122, 115, + 108, 101, 94, 87, 123, 116, 109, 102, 95, 124, 117, 110, 103, + 125, 118, 111, 126, 119, 127, 0, 8, 1, 16, 9, 2, 24, // 8x32 + 17, 10, 3, 32, 25, 18, 11, 4, 40, 33, 26, 19, 12, + 5, 48, 41, 34, 27, 20, 13, 6, 56, 49, 42, 35, 28, + 21, 14, 7, 64, 57, 50, 43, 36, 29, 22, 15, 72, 65, + 58, 51, 44, 37, 30, 23, 80, 73, 66, 59, 52, 45, 38, + 31, 88, 81, 74, 67, 60, 53, 46, 39, 96, 89, 82, 75, + 68, 61, 54, 47, 104, 97, 90, 83, 76, 69, 62, 55, 112, + 105, 98, 91, 84, 77, 70, 63, 120, 113, 106, 99, 92, 85, + 78, 71, 128, 121, 114, 107, 100, 93, 86, 79, 136, 129, 122, + 115, 108, 101, 94, 87, 144, 137, 130, 123, 116, 109, 102, 95, + 152, 145, 138, 131, 124, 117, 110, 103, 160, 153, 146, 139, 132, + 125, 118, 111, 168, 161, 154, 147, 140, 133, 126, 119, 176, 169, + 162, 155, 148, 141, 134, 127, 184, 177, 170, 163, 156, 149, 142, + 135, 192, 185, 178, 171, 164, 157, 150, 143, 200, 193, 186, 179, + 172, 165, 158, 151, 208, 201, 194, 187, 180, 173, 166, 159, 216, + 209, 202, 195, 188, 181, 174, 167, 224, 217, 210, 203, 196, 189, + 182, 175, 232, 225, 218, 211, 204, 197, 190, 183, 240, 233, 226, + 219, 212, 205, 198, 191, 248, 241, 234, 227, 220, 213, 206, 199, + 249, 242, 235, 228, 221, 214, 207, 250, 243, 236, 229, 222, 215, + 251, 244, 237, 230, 223, 252, 245, 238, 231, 253, 246, 239, 254, + 247, 255, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 32, // 8x64 + 25, 18, 11, 4, 40, 33, 26, 19, 12, 5, 48, 41, 34, + 27, 20, 13, 6, 56, 49, 42, 35, 28, 21, 14, 7, 64, + 57, 50, 43, 36, 29, 22, 15, 72, 65, 58, 51, 44, 37, + 30, 23, 80, 73, 66, 59, 52, 45, 38, 31, 88, 81, 74, + 67, 60, 53, 46, 39, 96, 89, 82, 75, 68, 61, 54, 47, + 104, 97, 90, 83, 76, 69, 62, 55, 112, 105, 98, 91, 84, + 77, 70, 63, 120, 113, 106, 99, 92, 85, 78, 71, 128, 121, + 114, 107, 100, 93, 86, 79, 136, 129, 122, 115, 108, 101, 94, + 87, 144, 137, 130, 123, 116, 109, 102, 95, 152, 145, 138, 131, + 124, 117, 110, 103, 160, 153, 146, 139, 132, 125, 118, 111, 168, + 161, 154, 147, 140, 133, 126, 119, 176, 169, 162, 155, 148, 141, + 134, 127, 184, 177, 170, 163, 156, 149, 142, 135, 192, 185, 178, + 171, 164, 157, 150, 143, 200, 193, 186, 179, 172, 165, 158, 151, + 208, 201, 194, 187, 180, 173, 166, 159, 216, 209, 202, 195, 188, + 181, 174, 167, 224, 217, 210, 203, 196, 189, 182, 175, 232, 225, + 218, 211, 204, 197, 190, 183, 240, 233, 226, 219, 212, 205, 198, + 191, 248, 241, 234, 227, 220, 213, 206, 199, 256, 249, 242, 235, + 228, 221, 214, 207, 264, 257, 250, 243, 236, 229, 222, 215, 272, + 265, 258, 251, 244, 237, 230, 223, 280, 273, 266, 259, 252, 245, + 238, 231, 288, 281, 274, 267, 260, 253, 246, 239, 296, 289, 282, + 275, 268, 261, 254, 247, 304, 297, 290, 283, 276, 269, 262, 255, + 312, 305, 298, 291, 284, 277, 270, 263, 320, 313, 306, 299, 292, + 285, 278, 271, 328, 321, 314, 307, 300, 293, 286, 279, 336, 329, + 322, 315, 308, 301, 294, 287, 344, 337, 330, 323, 316, 309, 302, + 295, 352, 345, 338, 331, 324, 317, 310, 303, 360, 353, 346, 339, + 332, 325, 318, 311, 368, 361, 354, 347, 340, 333, 326, 319, 376, + 369, 362, 355, 348, 341, 334, 327, 384, 377, 370, 363, 356, 349, + 342, 335, 392, 385, 378, 371, 364, 357, 350, 343, 400, 393, 386, + 379, 372, 365, 358, 351, 408, 401, 394, 387, 380, 373, 366, 359, + 416, 409, 402, 395, 388, 381, 374, 367, 424, 417, 410, 403, 396, + 389, 382, 375, 432, 425, 418, 411, 404, 397, 390, 383, 440, 433, + 426, 419, 412, 405, 398, 391, 448, 441, 434, 427, 420, 413, 406, + 399, 456, 449, 442, 435, 428, 421, 414, 407, 464, 
457, 450, 443, + 436, 429, 422, 415, 472, 465, 458, 451, 444, 437, 430, 423, 480, + 473, 466, 459, 452, 445, 438, 431, 488, 481, 474, 467, 460, 453, + 446, 439, 496, 489, 482, 475, 468, 461, 454, 447, 504, 497, 490, + 483, 476, 469, 462, 455, 505, 498, 491, 484, 477, 470, 463, 506, + 499, 492, 485, 478, 471, 507, 500, 493, 486, 479, 508, 501, 494, + 487, 509, 502, 495, 510, 503, 511, 0, 1, 2, 3, 4, 5, // 16xN + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 16, 1, // 16x2 + 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, + 30, 15, 31, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, // 16x4 + 49, 34, 19, 4, 50, 35, 20, 5, 51, 36, 21, 6, 52, + 37, 22, 7, 53, 38, 23, 8, 54, 39, 24, 9, 55, 40, + 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, 28, + 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, + 47, 63, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, // 16x8 + 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, + 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 113, + 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, 54, 39, + 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, + 71, 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, + 118, 103, 88, 73, 58, 43, 28, 13, 119, 104, 89, 74, 59, + 44, 29, 14, 120, 105, 90, 75, 60, 45, 30, 15, 121, 106, + 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, 47, 123, 108, + 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, // 16x16 + 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, + 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, 98, + 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, 54, + 39, 24, 9, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, + 10, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, + 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, + 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, + 13, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, + 44, 29, 14, 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, + 90, 75, 60, 45, 30, 15, 241, 226, 211, 196, 181, 166, 151, + 136, 121, 106, 91, 76, 61, 46, 31, 242, 227, 212, 197, 182, + 167, 152, 137, 122, 107, 92, 77, 62, 47, 243, 228, 213, 198, + 183, 168, 153, 138, 123, 108, 93, 78, 63, 244, 229, 214, 199, + 184, 169, 154, 139, 124, 109, 94, 79, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 246, 231, 216, 201, 186, 171, 156, + 141, 126, 111, 247, 232, 217, 202, 187, 172, 157, 142, 127, 248, + 233, 218, 203, 188, 173, 158, 143, 249, 234, 219, 204, 189, 174, + 159, 250, 235, 220, 205, 190, 175, 251, 236, 221, 206, 191, 252, + 237, 222, 207, 253, 238, 223, 254, 239, 255, 0, 16, 1, 32, // 16x32 + 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, 65, + 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, + 82, 67, 52, 37, 22, 7, 128, 113, 98, 83, 68, 53, 38, + 23, 8, 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 160, + 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, 176, 161, 146, + 131, 116, 101, 86, 71, 56, 41, 26, 11, 192, 177, 162, 147, + 132, 117, 102, 87, 72, 57, 42, 27, 12, 208, 193, 178, 163, + 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, 224, 209, 194, + 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 240, + 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, + 30, 15, 256, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, + 91, 76, 61, 46, 31, 272, 257, 242, 227, 212, 197, 182, 167, + 152, 137, 122, 107, 92, 77, 62, 47, 288, 273, 258, 243, 228, + 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 304, 289, + 274, 259, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, + 79, 
320, 305, 290, 275, 260, 245, 230, 215, 200, 185, 170, 155, + 140, 125, 110, 95, 336, 321, 306, 291, 276, 261, 246, 231, 216, + 201, 186, 171, 156, 141, 126, 111, 352, 337, 322, 307, 292, 277, + 262, 247, 232, 217, 202, 187, 172, 157, 142, 127, 368, 353, 338, + 323, 308, 293, 278, 263, 248, 233, 218, 203, 188, 173, 158, 143, + 384, 369, 354, 339, 324, 309, 294, 279, 264, 249, 234, 219, 204, + 189, 174, 159, 400, 385, 370, 355, 340, 325, 310, 295, 280, 265, + 250, 235, 220, 205, 190, 175, 416, 401, 386, 371, 356, 341, 326, + 311, 296, 281, 266, 251, 236, 221, 206, 191, 432, 417, 402, 387, + 372, 357, 342, 327, 312, 297, 282, 267, 252, 237, 222, 207, 448, + 433, 418, 403, 388, 373, 358, 343, 328, 313, 298, 283, 268, 253, + 238, 223, 464, 449, 434, 419, 404, 389, 374, 359, 344, 329, 314, + 299, 284, 269, 254, 239, 480, 465, 450, 435, 420, 405, 390, 375, + 360, 345, 330, 315, 300, 285, 270, 255, 496, 481, 466, 451, 436, + 421, 406, 391, 376, 361, 346, 331, 316, 301, 286, 271, 497, 482, + 467, 452, 437, 422, 407, 392, 377, 362, 347, 332, 317, 302, 287, + 498, 483, 468, 453, 438, 423, 408, 393, 378, 363, 348, 333, 318, + 303, 499, 484, 469, 454, 439, 424, 409, 394, 379, 364, 349, 334, + 319, 500, 485, 470, 455, 440, 425, 410, 395, 380, 365, 350, 335, + 501, 486, 471, 456, 441, 426, 411, 396, 381, 366, 351, 502, 487, + 472, 457, 442, 427, 412, 397, 382, 367, 503, 488, 473, 458, 443, + 428, 413, 398, 383, 504, 489, 474, 459, 444, 429, 414, 399, 505, + 490, 475, 460, 445, 430, 415, 506, 491, 476, 461, 446, 431, 507, + 492, 477, 462, 447, 508, 493, 478, 463, 509, 494, 479, 510, 495, + 511, 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, // 16x64 + 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, + 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 128, 113, + 98, 83, 68, 53, 38, 23, 8, 144, 129, 114, 99, 84, 69, + 54, 39, 24, 9, 160, 145, 130, 115, 100, 85, 70, 55, 40, + 25, 10, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, + 11, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, + 28, 13, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, + 59, 44, 29, 14, 240, 225, 210, 195, 180, 165, 150, 135, 120, + 105, 90, 75, 60, 45, 30, 15, 256, 241, 226, 211, 196, 181, + 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 272, 257, 242, + 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, + 288, 273, 258, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, + 93, 78, 63, 304, 289, 274, 259, 244, 229, 214, 199, 184, 169, + 154, 139, 124, 109, 94, 79, 320, 305, 290, 275, 260, 245, 230, + 215, 200, 185, 170, 155, 140, 125, 110, 95, 336, 321, 306, 291, + 276, 261, 246, 231, 216, 201, 186, 171, 156, 141, 126, 111, 352, + 337, 322, 307, 292, 277, 262, 247, 232, 217, 202, 187, 172, 157, + 142, 127, 368, 353, 338, 323, 308, 293, 278, 263, 248, 233, 218, + 203, 188, 173, 158, 143, 384, 369, 354, 339, 324, 309, 294, 279, + 264, 249, 234, 219, 204, 189, 174, 159, 400, 385, 370, 355, 340, + 325, 310, 295, 280, 265, 250, 235, 220, 205, 190, 175, 416, 401, + 386, 371, 356, 341, 326, 311, 296, 281, 266, 251, 236, 221, 206, + 191, 432, 417, 402, 387, 372, 357, 342, 327, 312, 297, 282, 267, + 252, 237, 222, 207, 448, 433, 418, 403, 388, 373, 358, 343, 328, + 313, 298, 283, 268, 253, 238, 223, 464, 449, 434, 419, 404, 389, + 374, 359, 344, 329, 314, 299, 284, 269, 254, 239, 480, 465, 450, + 435, 420, 405, 390, 375, 360, 345, 330, 315, 300, 285, 270, 255, + 496, 481, 466, 451, 436, 421, 406, 391, 376, 361, 346, 331, 316, + 301, 286, 271, 512, 497, 482, 467, 452, 
437, 422, 407, 392, 377, + 362, 347, 332, 317, 302, 287, 528, 513, 498, 483, 468, 453, 438, + 423, 408, 393, 378, 363, 348, 333, 318, 303, 544, 529, 514, 499, + 484, 469, 454, 439, 424, 409, 394, 379, 364, 349, 334, 319, 560, + 545, 530, 515, 500, 485, 470, 455, 440, 425, 410, 395, 380, 365, + 350, 335, 576, 561, 546, 531, 516, 501, 486, 471, 456, 441, 426, + 411, 396, 381, 366, 351, 592, 577, 562, 547, 532, 517, 502, 487, + 472, 457, 442, 427, 412, 397, 382, 367, 608, 593, 578, 563, 548, + 533, 518, 503, 488, 473, 458, 443, 428, 413, 398, 383, 624, 609, + 594, 579, 564, 549, 534, 519, 504, 489, 474, 459, 444, 429, 414, + 399, 640, 625, 610, 595, 580, 565, 550, 535, 520, 505, 490, 475, + 460, 445, 430, 415, 656, 641, 626, 611, 596, 581, 566, 551, 536, + 521, 506, 491, 476, 461, 446, 431, 672, 657, 642, 627, 612, 597, + 582, 567, 552, 537, 522, 507, 492, 477, 462, 447, 688, 673, 658, + 643, 628, 613, 598, 583, 568, 553, 538, 523, 508, 493, 478, 463, + 704, 689, 674, 659, 644, 629, 614, 599, 584, 569, 554, 539, 524, + 509, 494, 479, 720, 705, 690, 675, 660, 645, 630, 615, 600, 585, + 570, 555, 540, 525, 510, 495, 736, 721, 706, 691, 676, 661, 646, + 631, 616, 601, 586, 571, 556, 541, 526, 511, 752, 737, 722, 707, + 692, 677, 662, 647, 632, 617, 602, 587, 572, 557, 542, 527, 768, + 753, 738, 723, 708, 693, 678, 663, 648, 633, 618, 603, 588, 573, + 558, 543, 784, 769, 754, 739, 724, 709, 694, 679, 664, 649, 634, + 619, 604, 589, 574, 559, 800, 785, 770, 755, 740, 725, 710, 695, + 680, 665, 650, 635, 620, 605, 590, 575, 816, 801, 786, 771, 756, + 741, 726, 711, 696, 681, 666, 651, 636, 621, 606, 591, 832, 817, + 802, 787, 772, 757, 742, 727, 712, 697, 682, 667, 652, 637, 622, + 607, 848, 833, 818, 803, 788, 773, 758, 743, 728, 713, 698, 683, + 668, 653, 638, 623, 864, 849, 834, 819, 804, 789, 774, 759, 744, + 729, 714, 699, 684, 669, 654, 639, 880, 865, 850, 835, 820, 805, + 790, 775, 760, 745, 730, 715, 700, 685, 670, 655, 896, 881, 866, + 851, 836, 821, 806, 791, 776, 761, 746, 731, 716, 701, 686, 671, + 912, 897, 882, 867, 852, 837, 822, 807, 792, 777, 762, 747, 732, + 717, 702, 687, 928, 913, 898, 883, 868, 853, 838, 823, 808, 793, + 778, 763, 748, 733, 718, 703, 944, 929, 914, 899, 884, 869, 854, + 839, 824, 809, 794, 779, 764, 749, 734, 719, 960, 945, 930, 915, + 900, 885, 870, 855, 840, 825, 810, 795, 780, 765, 750, 735, 976, + 961, 946, 931, 916, 901, 886, 871, 856, 841, 826, 811, 796, 781, + 766, 751, 992, 977, 962, 947, 932, 917, 902, 887, 872, 857, 842, + 827, 812, 797, 782, 767, 1008, 993, 978, 963, 948, 933, 918, 903, + 888, 873, 858, 843, 828, 813, 798, 783, 1009, 994, 979, 964, 949, + 934, 919, 904, 889, 874, 859, 844, 829, 814, 799, 1010, 995, 980, + 965, 950, 935, 920, 905, 890, 875, 860, 845, 830, 815, 1011, 996, + 981, 966, 951, 936, 921, 906, 891, 876, 861, 846, 831, 1012, 997, + 982, 967, 952, 937, 922, 907, 892, 877, 862, 847, 1013, 998, 983, + 968, 953, 938, 923, 908, 893, 878, 863, 1014, 999, 984, 969, 954, + 939, 924, 909, 894, 879, 1015, 1000, 985, 970, 955, 940, 925, 910, + 895, 1016, 1001, 986, 971, 956, 941, 926, 911, 1017, 1002, 987, 972, + 957, 942, 927, 1018, 1003, 988, 973, 958, 943, 1019, 1004, 989, 974, + 959, 1020, 1005, 990, 975, 1021, 1006, 991, 1022, 1007, 1023, 0, 1, // 32xN + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 0, 32, 1, 33, 2, 34, 3, 35, 4, // 32x2 + 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, + 49, 18, 50, 19, 51, 20, 
52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, + 62, 31, 63, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, // 32x4 + 97, 66, 35, 4, 98, 67, 36, 5, 99, 68, 37, 6, 100, + 69, 38, 7, 101, 70, 39, 8, 102, 71, 40, 9, 103, 72, + 41, 10, 104, 73, 42, 11, 105, 74, 43, 12, 106, 75, 44, + 13, 107, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 16, + 110, 79, 48, 17, 111, 80, 49, 18, 112, 81, 50, 19, 113, + 82, 51, 20, 114, 83, 52, 21, 115, 84, 53, 22, 116, 85, + 54, 23, 117, 86, 55, 24, 118, 87, 56, 25, 119, 88, 57, + 26, 120, 89, 58, 27, 121, 90, 59, 28, 122, 91, 60, 29, + 123, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, + 127, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, // 32x8 + 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, + 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 225, 194, + 163, 132, 101, 70, 39, 8, 226, 195, 164, 133, 102, 71, 40, + 9, 227, 196, 165, 134, 103, 72, 41, 10, 228, 197, 166, 135, + 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, 12, 230, + 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, + 140, 109, 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, + 235, 204, 173, 142, 111, 80, 49, 18, 236, 205, 174, 143, 112, + 81, 50, 19, 237, 206, 175, 144, 113, 82, 51, 20, 238, 207, + 176, 145, 114, 83, 52, 21, 239, 208, 177, 146, 115, 84, 53, + 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, 210, 179, 148, + 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, 243, + 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, + 58, 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, + 153, 122, 91, 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, + 248, 217, 186, 155, 124, 93, 62, 31, 249, 218, 187, 156, 125, + 94, 63, 250, 219, 188, 157, 126, 95, 251, 220, 189, 158, 127, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 0, 32, 1, // 32x16 + 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, + 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, + 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, + 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, + 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, 352, 321, + 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353, 322, + 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, + 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, + 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, + 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, + 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, + 265, 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, + 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 484, + 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, + 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, 175, + 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, + 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, + 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, + 23, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, + 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273, 242, + 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, + 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, + 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, + 90, 59, 28, 
494, 463, 432, 401, 370, 339, 308, 277, 246, 215, + 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, + 278, 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, + 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 497, + 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, + 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, + 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, + 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, + 159, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, + 471, 440, 409, 378, 347, 316, 285, 254, 223, 503, 472, 441, 410, + 379, 348, 317, 286, 255, 504, 473, 442, 411, 380, 349, 318, 287, + 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, + 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, // 32x32 + 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, + 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, + 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, + 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, + 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, + 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, + 43, 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, + 75, 44, 13, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, + 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, + 232, 201, 170, 139, 108, 77, 46, 15, 512, 481, 450, 419, 388, + 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 544, + 513, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, + 110, 79, 48, 17, 576, 545, 514, 483, 452, 421, 390, 359, 328, + 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 608, 577, 546, + 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, + 112, 81, 50, 19, 640, 609, 578, 547, 516, 485, 454, 423, 392, + 361, 330, 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 672, + 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, + 238, 207, 176, 145, 114, 83, 52, 21, 704, 673, 642, 611, 580, + 549, 518, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, + 146, 115, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, + 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, + 85, 54, 23, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, + 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, + 55, 24, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, + 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, + 56, 25, 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, + 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, + 88, 57, 26, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, + 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, + 151, 120, 89, 58, 27, 896, 865, 834, 803, 772, 741, 710, 679, + 648, 617, 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, + 245, 214, 183, 152, 121, 90, 59, 28, 928, 897, 866, 835, 804, + 773, 742, 711, 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, + 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 960, + 929, 898, 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, + 526, 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, + 123, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, + 713, 682, 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, + 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, + 900, 869, 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 
528, + 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, + 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, + 250, 219, 188, 157, 126, 95, 995, 964, 933, 902, 871, 840, 809, + 778, 747, 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, + 375, 344, 313, 282, 251, 220, 189, 158, 127, 996, 965, 934, 903, + 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 997, 966, + 935, 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, + 532, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 998, + 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, + 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 999, + 968, 937, 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, + 565, 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 1000, 969, + 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, + 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 908, + 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, + 474, 443, 412, 381, 350, 319, 1002, 971, 940, 909, 878, 847, 816, + 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, + 382, 351, 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, + 662, 631, 600, 569, 538, 507, 476, 445, 414, 383, 1004, 973, 942, + 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, + 508, 477, 446, 415, 1005, 974, 943, 912, 881, 850, 819, 788, 757, + 726, 695, 664, 633, 602, 571, 540, 509, 478, 447, 1006, 975, 944, + 913, 882, 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, + 510, 479, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, + 666, 635, 604, 573, 542, 511, 1008, 977, 946, 915, 884, 853, 822, + 791, 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 916, + 885, 854, 823, 792, 761, 730, 699, 668, 637, 606, 575, 1010, 979, + 948, 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 1011, + 980, 949, 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 1012, + 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, + 951, 920, 889, 858, 827, 796, 765, 734, 703, 1014, 983, 952, 921, + 890, 859, 828, 797, 766, 735, 1015, 984, 953, 922, 891, 860, 829, + 798, 767, 1016, 985, 954, 923, 892, 861, 830, 799, 1017, 986, 955, + 924, 893, 862, 831, 1018, 987, 956, 925, 894, 863, 1019, 988, 957, + 926, 895, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, // 32x64 + 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, + 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, + 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, + 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, + 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, + 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, + 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, + 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, + 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, + 139, 108, 77, 46, 15, 512, 481, 450, 419, 388, 357, 326, 295, + 264, 233, 202, 171, 140, 109, 78, 47, 16, 544, 513, 482, 451, + 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, + 17, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, 235, + 204, 173, 142, 111, 80, 49, 18, 608, 577, 546, 515, 484, 453, + 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, + 19, 640, 609, 578, 547, 516, 485, 
454, 423, 392, 361, 330, 299, + 268, 237, 206, 175, 144, 113, 82, 51, 20, 672, 641, 610, 579, + 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, + 145, 114, 83, 52, 21, 704, 673, 642, 611, 580, 549, 518, 487, + 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, + 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, + 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, + 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, 396, + 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 800, + 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, + 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 832, + 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, 429, + 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, + 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, 492, + 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, + 58, 27, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, 586, + 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, + 152, 121, 90, 59, 28, 928, 897, 866, 835, 804, 773, 742, 711, + 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339, 308, + 277, 246, 215, 184, 153, 122, 91, 60, 29, 960, 929, 898, 867, + 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, 464, + 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, + 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, 248, + 217, 186, 155, 124, 93, 62, 31, 1024, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, +1056, 1025, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, + 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, + 250, 219, 188, 157, 126, 95, 1088, 1057, 1026, 995, 964, 933, 902, + 871, 840, 809, 778, 747, 716, 685, 654, 623, 592, 561, 530, 499, + 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 1120, +1089, 1058, 1027, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, + 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, 314, + 283, 252, 221, 190, 159, 1152, 1121, 1090, 1059, 1028, 997, 966, 935, + 904, 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, + 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 1184, 1153, +1122, 1091, 1060, 1029, 998, 967, 936, 905, 874, 843, 812, 781, 750, + 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, + 316, 285, 254, 223, 1216, 1185, 1154, 1123, 1092, 1061, 1030, 999, 968, + 937, 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, + 534, 503, 472, 441, 410, 379, 348, 317, 286, 255, 1248, 1217, 1186, +1155, 1124, 1093, 1062, 1031, 1000, 969, 938, 907, 876, 845, 814, 783, + 752, 721, 690, 659, 628, 597, 566, 535, 504, 473, 442, 411, 380, + 349, 318, 287, 1280, 1249, 1218, 1187, 1156, 1125, 1094, 1063, 1032, 1001, + 970, 939, 908, 877, 846, 815, 784, 753, 722, 691, 660, 629, 598, + 567, 536, 505, 474, 443, 412, 381, 350, 319, 1312, 1281, 1250, 1219, +1188, 1157, 1126, 1095, 1064, 1033, 1002, 971, 940, 909, 878, 847, 816, + 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, + 382, 351, 1344, 1313, 1282, 1251, 1220, 1189, 1158, 1127, 1096, 1065, 1034, +1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 1376, 1345, 1314, 1283, 1252, +1221, 1190, 1159, 1128, 1097, 1066, 1035, 1004, 973, 942, 911, 880, 849, + 
818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, 446, + 415, 1408, 1377, 1346, 1315, 1284, 1253, 1222, 1191, 1160, 1129, 1098, 1067, +1036, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, 664, + 633, 602, 571, 540, 509, 478, 447, 1440, 1409, 1378, 1347, 1316, 1285, +1254, 1223, 1192, 1161, 1130, 1099, 1068, 1037, 1006, 975, 944, 913, 882, + 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479, +1472, 1441, 1410, 1379, 1348, 1317, 1286, 1255, 1224, 1193, 1162, 1131, 1100, +1069, 1038, 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, + 666, 635, 604, 573, 542, 511, 1504, 1473, 1442, 1411, 1380, 1349, 1318, +1287, 1256, 1225, 1194, 1163, 1132, 1101, 1070, 1039, 1008, 977, 946, 915, + 884, 853, 822, 791, 760, 729, 698, 667, 636, 605, 574, 543, 1536, +1505, 1474, 1443, 1412, 1381, 1350, 1319, 1288, 1257, 1226, 1195, 1164, 1133, +1102, 1071, 1040, 1009, 978, 947, 916, 885, 854, 823, 792, 761, 730, + 699, 668, 637, 606, 575, 1568, 1537, 1506, 1475, 1444, 1413, 1382, 1351, +1320, 1289, 1258, 1227, 1196, 1165, 1134, 1103, 1072, 1041, 1010, 979, 948, + 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 1600, 1569, +1538, 1507, 1476, 1445, 1414, 1383, 1352, 1321, 1290, 1259, 1228, 1197, 1166, +1135, 1104, 1073, 1042, 1011, 980, 949, 918, 887, 856, 825, 794, 763, + 732, 701, 670, 639, 1632, 1601, 1570, 1539, 1508, 1477, 1446, 1415, 1384, +1353, 1322, 1291, 1260, 1229, 1198, 1167, 1136, 1105, 1074, 1043, 1012, 981, + 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1664, 1633, 1602, +1571, 1540, 1509, 1478, 1447, 1416, 1385, 1354, 1323, 1292, 1261, 1230, 1199, +1168, 1137, 1106, 1075, 1044, 1013, 982, 951, 920, 889, 858, 827, 796, + 765, 734, 703, 1696, 1665, 1634, 1603, 1572, 1541, 1510, 1479, 1448, 1417, +1386, 1355, 1324, 1293, 1262, 1231, 1200, 1169, 1138, 1107, 1076, 1045, 1014, + 983, 952, 921, 890, 859, 828, 797, 766, 735, 1728, 1697, 1666, 1635, +1604, 1573, 1542, 1511, 1480, 1449, 1418, 1387, 1356, 1325, 1294, 1263, 1232, +1201, 1170, 1139, 1108, 1077, 1046, 1015, 984, 953, 922, 891, 860, 829, + 798, 767, 1760, 1729, 1698, 1667, 1636, 1605, 1574, 1543, 1512, 1481, 1450, +1419, 1388, 1357, 1326, 1295, 1264, 1233, 1202, 1171, 1140, 1109, 1078, 1047, +1016, 985, 954, 923, 892, 861, 830, 799, 1792, 1761, 1730, 1699, 1668, +1637, 1606, 1575, 1544, 1513, 1482, 1451, 1420, 1389, 1358, 1327, 1296, 1265, +1234, 1203, 1172, 1141, 1110, 1079, 1048, 1017, 986, 955, 924, 893, 862, + 831, 1824, 1793, 1762, 1731, 1700, 1669, 1638, 1607, 1576, 1545, 1514, 1483, +1452, 1421, 1390, 1359, 1328, 1297, 1266, 1235, 1204, 1173, 1142, 1111, 1080, +1049, 1018, 987, 956, 925, 894, 863, 1856, 1825, 1794, 1763, 1732, 1701, +1670, 1639, 1608, 1577, 1546, 1515, 1484, 1453, 1422, 1391, 1360, 1329, 1298, +1267, 1236, 1205, 1174, 1143, 1112, 1081, 1050, 1019, 988, 957, 926, 895, +1888, 1857, 1826, 1795, 1764, 1733, 1702, 1671, 1640, 1609, 1578, 1547, 1516, +1485, 1454, 1423, 1392, 1361, 1330, 1299, 1268, 1237, 1206, 1175, 1144, 1113, +1082, 1051, 1020, 989, 958, 927, 1920, 1889, 1858, 1827, 1796, 1765, 1734, +1703, 1672, 1641, 1610, 1579, 1548, 1517, 1486, 1455, 1424, 1393, 1362, 1331, +1300, 1269, 1238, 1207, 1176, 1145, 1114, 1083, 1052, 1021, 990, 959, 1952, +1921, 1890, 1859, 1828, 1797, 1766, 1735, 1704, 1673, 1642, 1611, 1580, 1549, +1518, 1487, 1456, 1425, 1394, 1363, 1332, 1301, 1270, 1239, 1208, 1177, 1146, +1115, 1084, 1053, 1022, 991, 1984, 1953, 1922, 1891, 1860, 1829, 1798, 1767, +1736, 1705, 1674, 1643, 1612, 1581, 1550, 1519, 1488, 1457, 1426, 1395, 1364, +1333, 1302, 
1271, 1240, 1209, 1178, 1147, 1116, 1085, 1054, 1023, 2016, 1985, +1954, 1923, 1892, 1861, 1830, 1799, 1768, 1737, 1706, 1675, 1644, 1613, 1582, +1551, 1520, 1489, 1458, 1427, 1396, 1365, 1334, 1303, 1272, 1241, 1210, 1179, +1148, 1117, 1086, 1055, 2017, 1986, 1955, 1924, 1893, 1862, 1831, 1800, 1769, +1738, 1707, 1676, 1645, 1614, 1583, 1552, 1521, 1490, 1459, 1428, 1397, 1366, +1335, 1304, 1273, 1242, 1211, 1180, 1149, 1118, 1087, 2018, 1987, 1956, 1925, +1894, 1863, 1832, 1801, 1770, 1739, 1708, 1677, 1646, 1615, 1584, 1553, 1522, +1491, 1460, 1429, 1398, 1367, 1336, 1305, 1274, 1243, 1212, 1181, 1150, 1119, +2019, 1988, 1957, 1926, 1895, 1864, 1833, 1802, 1771, 1740, 1709, 1678, 1647, +1616, 1585, 1554, 1523, 1492, 1461, 1430, 1399, 1368, 1337, 1306, 1275, 1244, +1213, 1182, 1151, 2020, 1989, 1958, 1927, 1896, 1865, 1834, 1803, 1772, 1741, +1710, 1679, 1648, 1617, 1586, 1555, 1524, 1493, 1462, 1431, 1400, 1369, 1338, +1307, 1276, 1245, 1214, 1183, 2021, 1990, 1959, 1928, 1897, 1866, 1835, 1804, +1773, 1742, 1711, 1680, 1649, 1618, 1587, 1556, 1525, 1494, 1463, 1432, 1401, +1370, 1339, 1308, 1277, 1246, 1215, 2022, 1991, 1960, 1929, 1898, 1867, 1836, +1805, 1774, 1743, 1712, 1681, 1650, 1619, 1588, 1557, 1526, 1495, 1464, 1433, +1402, 1371, 1340, 1309, 1278, 1247, 2023, 1992, 1961, 1930, 1899, 1868, 1837, +1806, 1775, 1744, 1713, 1682, 1651, 1620, 1589, 1558, 1527, 1496, 1465, 1434, +1403, 1372, 1341, 1310, 1279, 2024, 1993, 1962, 1931, 1900, 1869, 1838, 1807, +1776, 1745, 1714, 1683, 1652, 1621, 1590, 1559, 1528, 1497, 1466, 1435, 1404, +1373, 1342, 1311, 2025, 1994, 1963, 1932, 1901, 1870, 1839, 1808, 1777, 1746, +1715, 1684, 1653, 1622, 1591, 1560, 1529, 1498, 1467, 1436, 1405, 1374, 1343, +2026, 1995, 1964, 1933, 1902, 1871, 1840, 1809, 1778, 1747, 1716, 1685, 1654, +1623, 1592, 1561, 1530, 1499, 1468, 1437, 1406, 1375, 2027, 1996, 1965, 1934, +1903, 1872, 1841, 1810, 1779, 1748, 1717, 1686, 1655, 1624, 1593, 1562, 1531, +1500, 1469, 1438, 1407, 2028, 1997, 1966, 1935, 1904, 1873, 1842, 1811, 1780, +1749, 1718, 1687, 1656, 1625, 1594, 1563, 1532, 1501, 1470, 1439, 2029, 1998, +1967, 1936, 1905, 1874, 1843, 1812, 1781, 1750, 1719, 1688, 1657, 1626, 1595, +1564, 1533, 1502, 1471, 2030, 1999, 1968, 1937, 1906, 1875, 1844, 1813, 1782, +1751, 1720, 1689, 1658, 1627, 1596, 1565, 1534, 1503, 2031, 2000, 1969, 1938, +1907, 1876, 1845, 1814, 1783, 1752, 1721, 1690, 1659, 1628, 1597, 1566, 1535, +2032, 2001, 1970, 1939, 1908, 1877, 1846, 1815, 1784, 1753, 1722, 1691, 1660, +1629, 1598, 1567, 2033, 2002, 1971, 1940, 1909, 1878, 1847, 1816, 1785, 1754, +1723, 1692, 1661, 1630, 1599, 2034, 2003, 1972, 1941, 1910, 1879, 1848, 1817, +1786, 1755, 1724, 1693, 1662, 1631, 2035, 2004, 1973, 1942, 1911, 1880, 1849, +1818, 1787, 1756, 1725, 1694, 1663, 2036, 2005, 1974, 1943, 1912, 1881, 1850, +1819, 1788, 1757, 1726, 1695, 2037, 2006, 1975, 1944, 1913, 1882, 1851, 1820, +1789, 1758, 1727, 2038, 2007, 1976, 1945, 1914, 1883, 1852, 1821, 1790, 1759, +2039, 2008, 1977, 1946, 1915, 1884, 1853, 1822, 1791, 2040, 2009, 1978, 1947, +1916, 1885, 1854, 1823, 2041, 2010, 1979, 1948, 1917, 1886, 1855, 2042, 2011, +1980, 1949, 1918, 1887, 2043, 2012, 1981, 1950, 1919, 2044, 2013, 1982, 1951, +2045, 2014, 1983, 2046, 2015, 2047, 0, 1, 2, 3, 4, 5, 6, // 64xN + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 0, 64, 1, 65, 2, 66, 3, 67, 
// 64x2 + 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, + 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, + 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, + 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, + 30, 94, 31, 95, 32, 96, 33, 97, 34, 98, 35, 99, 36, + 100, 37, 101, 38, 102, 39, 103, 40, 104, 41, 105, 42, 106, + 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, 48, 112, 49, + 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, + 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, + 126, 63, 127, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, // 64x4 + 193, 130, 67, 4, 194, 131, 68, 5, 195, 132, 69, 6, 196, + 133, 70, 7, 197, 134, 71, 8, 198, 135, 72, 9, 199, 136, + 73, 10, 200, 137, 74, 11, 201, 138, 75, 12, 202, 139, 76, + 13, 203, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, 16, + 206, 143, 80, 17, 207, 144, 81, 18, 208, 145, 82, 19, 209, + 146, 83, 20, 210, 147, 84, 21, 211, 148, 85, 22, 212, 149, + 86, 23, 213, 150, 87, 24, 214, 151, 88, 25, 215, 152, 89, + 26, 216, 153, 90, 27, 217, 154, 91, 28, 218, 155, 92, 29, + 219, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 32, 222, + 159, 96, 33, 223, 160, 97, 34, 224, 161, 98, 35, 225, 162, + 99, 36, 226, 163, 100, 37, 227, 164, 101, 38, 228, 165, 102, + 39, 229, 166, 103, 40, 230, 167, 104, 41, 231, 168, 105, 42, + 232, 169, 106, 43, 233, 170, 107, 44, 234, 171, 108, 45, 235, + 172, 109, 46, 236, 173, 110, 47, 237, 174, 111, 48, 238, 175, + 112, 49, 239, 176, 113, 50, 240, 177, 114, 51, 241, 178, 115, + 52, 242, 179, 116, 53, 243, 180, 117, 54, 244, 181, 118, 55, + 245, 182, 119, 56, 246, 183, 120, 57, 247, 184, 121, 58, 248, + 185, 122, 59, 249, 186, 123, 60, 250, 187, 124, 61, 251, 188, + 125, 62, 252, 189, 126, 63, 253, 190, 127, 254, 191, 255, 0, // 64x8 + 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, 130, 67, + 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, 132, 69, + 6, 448, 385, 322, 259, 196, 133, 70, 7, 449, 386, 323, 260, + 197, 134, 71, 8, 450, 387, 324, 261, 198, 135, 72, 9, 451, + 388, 325, 262, 199, 136, 73, 10, 452, 389, 326, 263, 200, 137, + 74, 11, 453, 390, 327, 264, 201, 138, 75, 12, 454, 391, 328, + 265, 202, 139, 76, 13, 455, 392, 329, 266, 203, 140, 77, 14, + 456, 393, 330, 267, 204, 141, 78, 15, 457, 394, 331, 268, 205, + 142, 79, 16, 458, 395, 332, 269, 206, 143, 80, 17, 459, 396, + 333, 270, 207, 144, 81, 18, 460, 397, 334, 271, 208, 145, 82, + 19, 461, 398, 335, 272, 209, 146, 83, 20, 462, 399, 336, 273, + 210, 147, 84, 21, 463, 400, 337, 274, 211, 148, 85, 22, 464, + 401, 338, 275, 212, 149, 86, 23, 465, 402, 339, 276, 213, 150, + 87, 24, 466, 403, 340, 277, 214, 151, 88, 25, 467, 404, 341, + 278, 215, 152, 89, 26, 468, 405, 342, 279, 216, 153, 90, 27, + 469, 406, 343, 280, 217, 154, 91, 28, 470, 407, 344, 281, 218, + 155, 92, 29, 471, 408, 345, 282, 219, 156, 93, 30, 472, 409, + 346, 283, 220, 157, 94, 31, 473, 410, 347, 284, 221, 158, 95, + 32, 474, 411, 348, 285, 222, 159, 96, 33, 475, 412, 349, 286, + 223, 160, 97, 34, 476, 413, 350, 287, 224, 161, 98, 35, 477, + 414, 351, 288, 225, 162, 99, 36, 478, 415, 352, 289, 226, 163, + 100, 37, 479, 416, 353, 290, 227, 164, 101, 38, 480, 417, 354, + 291, 228, 165, 102, 39, 481, 418, 355, 292, 229, 166, 103, 40, + 482, 419, 356, 293, 230, 167, 104, 41, 483, 420, 357, 294, 231, + 168, 105, 42, 484, 421, 358, 295, 232, 169, 106, 43, 485, 422, + 359, 296, 233, 170, 107, 44, 486, 423, 360, 297, 234, 171, 108, + 45, 487, 424, 361, 298, 235, 172, 109, 46, 488, 425, 362, 299, + 236, 173, 110, 47, 489, 426, 363, 300, 237, 174, 111, 48, 490, + 427, 364, 
301, 238, 175, 112, 49, 491, 428, 365, 302, 239, 176, + 113, 50, 492, 429, 366, 303, 240, 177, 114, 51, 493, 430, 367, + 304, 241, 178, 115, 52, 494, 431, 368, 305, 242, 179, 116, 53, + 495, 432, 369, 306, 243, 180, 117, 54, 496, 433, 370, 307, 244, + 181, 118, 55, 497, 434, 371, 308, 245, 182, 119, 56, 498, 435, + 372, 309, 246, 183, 120, 57, 499, 436, 373, 310, 247, 184, 121, + 58, 500, 437, 374, 311, 248, 185, 122, 59, 501, 438, 375, 312, + 249, 186, 123, 60, 502, 439, 376, 313, 250, 187, 124, 61, 503, + 440, 377, 314, 251, 188, 125, 62, 504, 441, 378, 315, 252, 189, + 126, 63, 505, 442, 379, 316, 253, 190, 127, 506, 443, 380, 317, + 254, 191, 507, 444, 381, 318, 255, 508, 445, 382, 319, 509, 446, + 383, 510, 447, 511, 0, 64, 1, 128, 65, 2, 192, 129, 66, // 64x16 + 3, 256, 193, 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, + 321, 258, 195, 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, + 7, 512, 449, 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, + 387, 324, 261, 198, 135, 72, 9, 640, 577, 514, 451, 388, 325, + 262, 199, 136, 73, 10, 704, 641, 578, 515, 452, 389, 326, 263, + 200, 137, 74, 11, 768, 705, 642, 579, 516, 453, 390, 327, 264, + 201, 138, 75, 12, 832, 769, 706, 643, 580, 517, 454, 391, 328, + 265, 202, 139, 76, 13, 896, 833, 770, 707, 644, 581, 518, 455, + 392, 329, 266, 203, 140, 77, 14, 960, 897, 834, 771, 708, 645, + 582, 519, 456, 393, 330, 267, 204, 141, 78, 15, 961, 898, 835, + 772, 709, 646, 583, 520, 457, 394, 331, 268, 205, 142, 79, 16, + 962, 899, 836, 773, 710, 647, 584, 521, 458, 395, 332, 269, 206, + 143, 80, 17, 963, 900, 837, 774, 711, 648, 585, 522, 459, 396, + 333, 270, 207, 144, 81, 18, 964, 901, 838, 775, 712, 649, 586, + 523, 460, 397, 334, 271, 208, 145, 82, 19, 965, 902, 839, 776, + 713, 650, 587, 524, 461, 398, 335, 272, 209, 146, 83, 20, 966, + 903, 840, 777, 714, 651, 588, 525, 462, 399, 336, 273, 210, 147, + 84, 21, 967, 904, 841, 778, 715, 652, 589, 526, 463, 400, 337, + 274, 211, 148, 85, 22, 968, 905, 842, 779, 716, 653, 590, 527, + 464, 401, 338, 275, 212, 149, 86, 23, 969, 906, 843, 780, 717, + 654, 591, 528, 465, 402, 339, 276, 213, 150, 87, 24, 970, 907, + 844, 781, 718, 655, 592, 529, 466, 403, 340, 277, 214, 151, 88, + 25, 971, 908, 845, 782, 719, 656, 593, 530, 467, 404, 341, 278, + 215, 152, 89, 26, 972, 909, 846, 783, 720, 657, 594, 531, 468, + 405, 342, 279, 216, 153, 90, 27, 973, 910, 847, 784, 721, 658, + 595, 532, 469, 406, 343, 280, 217, 154, 91, 28, 974, 911, 848, + 785, 722, 659, 596, 533, 470, 407, 344, 281, 218, 155, 92, 29, + 975, 912, 849, 786, 723, 660, 597, 534, 471, 408, 345, 282, 219, + 156, 93, 30, 976, 913, 850, 787, 724, 661, 598, 535, 472, 409, + 346, 283, 220, 157, 94, 31, 977, 914, 851, 788, 725, 662, 599, + 536, 473, 410, 347, 284, 221, 158, 95, 32, 978, 915, 852, 789, + 726, 663, 600, 537, 474, 411, 348, 285, 222, 159, 96, 33, 979, + 916, 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, + 97, 34, 980, 917, 854, 791, 728, 665, 602, 539, 476, 413, 350, + 287, 224, 161, 98, 35, 981, 918, 855, 792, 729, 666, 603, 540, + 477, 414, 351, 288, 225, 162, 99, 36, 982, 919, 856, 793, 730, + 667, 604, 541, 478, 415, 352, 289, 226, 163, 100, 37, 983, 920, + 857, 794, 731, 668, 605, 542, 479, 416, 353, 290, 227, 164, 101, + 38, 984, 921, 858, 795, 732, 669, 606, 543, 480, 417, 354, 291, + 228, 165, 102, 39, 985, 922, 859, 796, 733, 670, 607, 544, 481, + 418, 355, 292, 229, 166, 103, 40, 986, 923, 860, 797, 734, 671, + 608, 545, 482, 419, 356, 293, 230, 167, 104, 41, 987, 924, 861, + 798, 735, 672, 609, 546, 483, 420, 357, 
294, 231, 168, 105, 42, + 988, 925, 862, 799, 736, 673, 610, 547, 484, 421, 358, 295, 232, + 169, 106, 43, 989, 926, 863, 800, 737, 674, 611, 548, 485, 422, + 359, 296, 233, 170, 107, 44, 990, 927, 864, 801, 738, 675, 612, + 549, 486, 423, 360, 297, 234, 171, 108, 45, 991, 928, 865, 802, + 739, 676, 613, 550, 487, 424, 361, 298, 235, 172, 109, 46, 992, + 929, 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, + 110, 47, 993, 930, 867, 804, 741, 678, 615, 552, 489, 426, 363, + 300, 237, 174, 111, 48, 994, 931, 868, 805, 742, 679, 616, 553, + 490, 427, 364, 301, 238, 175, 112, 49, 995, 932, 869, 806, 743, + 680, 617, 554, 491, 428, 365, 302, 239, 176, 113, 50, 996, 933, + 870, 807, 744, 681, 618, 555, 492, 429, 366, 303, 240, 177, 114, + 51, 997, 934, 871, 808, 745, 682, 619, 556, 493, 430, 367, 304, + 241, 178, 115, 52, 998, 935, 872, 809, 746, 683, 620, 557, 494, + 431, 368, 305, 242, 179, 116, 53, 999, 936, 873, 810, 747, 684, + 621, 558, 495, 432, 369, 306, 243, 180, 117, 54, 1000, 937, 874, + 811, 748, 685, 622, 559, 496, 433, 370, 307, 244, 181, 118, 55, +1001, 938, 875, 812, 749, 686, 623, 560, 497, 434, 371, 308, 245, + 182, 119, 56, 1002, 939, 876, 813, 750, 687, 624, 561, 498, 435, + 372, 309, 246, 183, 120, 57, 1003, 940, 877, 814, 751, 688, 625, + 562, 499, 436, 373, 310, 247, 184, 121, 58, 1004, 941, 878, 815, + 752, 689, 626, 563, 500, 437, 374, 311, 248, 185, 122, 59, 1005, + 942, 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, + 123, 60, 1006, 943, 880, 817, 754, 691, 628, 565, 502, 439, 376, + 313, 250, 187, 124, 61, 1007, 944, 881, 818, 755, 692, 629, 566, + 503, 440, 377, 314, 251, 188, 125, 62, 1008, 945, 882, 819, 756, + 693, 630, 567, 504, 441, 378, 315, 252, 189, 126, 63, 1009, 946, + 883, 820, 757, 694, 631, 568, 505, 442, 379, 316, 253, 190, 127, +1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, 317, 254, + 191, 1011, 948, 885, 822, 759, 696, 633, 570, 507, 444, 381, 318, + 255, 1012, 949, 886, 823, 760, 697, 634, 571, 508, 445, 382, 319, +1013, 950, 887, 824, 761, 698, 635, 572, 509, 446, 383, 1014, 951, + 888, 825, 762, 699, 636, 573, 510, 447, 1015, 952, 889, 826, 763, + 700, 637, 574, 511, 1016, 953, 890, 827, 764, 701, 638, 575, 1017, + 954, 891, 828, 765, 702, 639, 1018, 955, 892, 829, 766, 703, 1019, + 956, 893, 830, 767, 1020, 957, 894, 831, 1021, 958, 895, 1022, 959, +1023, 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 256, 193, // 64x32 + 130, 67, 4, 320, 257, 194, 131, 68, 5, 384, 321, 258, 195, + 132, 69, 6, 448, 385, 322, 259, 196, 133, 70, 7, 512, 449, + 386, 323, 260, 197, 134, 71, 8, 576, 513, 450, 387, 324, 261, + 198, 135, 72, 9, 640, 577, 514, 451, 388, 325, 262, 199, 136, + 73, 10, 704, 641, 578, 515, 452, 389, 326, 263, 200, 137, 74, + 11, 768, 705, 642, 579, 516, 453, 390, 327, 264, 201, 138, 75, + 12, 832, 769, 706, 643, 580, 517, 454, 391, 328, 265, 202, 139, + 76, 13, 896, 833, 770, 707, 644, 581, 518, 455, 392, 329, 266, + 203, 140, 77, 14, 960, 897, 834, 771, 708, 645, 582, 519, 456, + 393, 330, 267, 204, 141, 78, 15, 1024, 961, 898, 835, 772, 709, + 646, 583, 520, 457, 394, 331, 268, 205, 142, 79, 16, 1088, 1025, + 962, 899, 836, 773, 710, 647, 584, 521, 458, 395, 332, 269, 206, + 143, 80, 17, 1152, 1089, 1026, 963, 900, 837, 774, 711, 648, 585, + 522, 459, 396, 333, 270, 207, 144, 81, 18, 1216, 1153, 1090, 1027, + 964, 901, 838, 775, 712, 649, 586, 523, 460, 397, 334, 271, 208, + 145, 82, 19, 1280, 1217, 1154, 1091, 1028, 965, 902, 839, 776, 713, + 650, 587, 524, 461, 398, 335, 272, 209, 146, 83, 20, 1344, 1281, 
+1218, 1155, 1092, 1029, 966, 903, 840, 777, 714, 651, 588, 525, 462, + 399, 336, 273, 210, 147, 84, 21, 1408, 1345, 1282, 1219, 1156, 1093, +1030, 967, 904, 841, 778, 715, 652, 589, 526, 463, 400, 337, 274, + 211, 148, 85, 22, 1472, 1409, 1346, 1283, 1220, 1157, 1094, 1031, 968, + 905, 842, 779, 716, 653, 590, 527, 464, 401, 338, 275, 212, 149, + 86, 23, 1536, 1473, 1410, 1347, 1284, 1221, 1158, 1095, 1032, 969, 906, + 843, 780, 717, 654, 591, 528, 465, 402, 339, 276, 213, 150, 87, + 24, 1600, 1537, 1474, 1411, 1348, 1285, 1222, 1159, 1096, 1033, 970, 907, + 844, 781, 718, 655, 592, 529, 466, 403, 340, 277, 214, 151, 88, + 25, 1664, 1601, 1538, 1475, 1412, 1349, 1286, 1223, 1160, 1097, 1034, 971, + 908, 845, 782, 719, 656, 593, 530, 467, 404, 341, 278, 215, 152, + 89, 26, 1728, 1665, 1602, 1539, 1476, 1413, 1350, 1287, 1224, 1161, 1098, +1035, 972, 909, 846, 783, 720, 657, 594, 531, 468, 405, 342, 279, + 216, 153, 90, 27, 1792, 1729, 1666, 1603, 1540, 1477, 1414, 1351, 1288, +1225, 1162, 1099, 1036, 973, 910, 847, 784, 721, 658, 595, 532, 469, + 406, 343, 280, 217, 154, 91, 28, 1856, 1793, 1730, 1667, 1604, 1541, +1478, 1415, 1352, 1289, 1226, 1163, 1100, 1037, 974, 911, 848, 785, 722, + 659, 596, 533, 470, 407, 344, 281, 218, 155, 92, 29, 1920, 1857, +1794, 1731, 1668, 1605, 1542, 1479, 1416, 1353, 1290, 1227, 1164, 1101, 1038, + 975, 912, 849, 786, 723, 660, 597, 534, 471, 408, 345, 282, 219, + 156, 93, 30, 1984, 1921, 1858, 1795, 1732, 1669, 1606, 1543, 1480, 1417, +1354, 1291, 1228, 1165, 1102, 1039, 976, 913, 850, 787, 724, 661, 598, + 535, 472, 409, 346, 283, 220, 157, 94, 31, 1985, 1922, 1859, 1796, +1733, 1670, 1607, 1544, 1481, 1418, 1355, 1292, 1229, 1166, 1103, 1040, 977, + 914, 851, 788, 725, 662, 599, 536, 473, 410, 347, 284, 221, 158, + 95, 32, 1986, 1923, 1860, 1797, 1734, 1671, 1608, 1545, 1482, 1419, 1356, +1293, 1230, 1167, 1104, 1041, 978, 915, 852, 789, 726, 663, 600, 537, + 474, 411, 348, 285, 222, 159, 96, 33, 1987, 1924, 1861, 1798, 1735, +1672, 1609, 1546, 1483, 1420, 1357, 1294, 1231, 1168, 1105, 1042, 979, 916, + 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, 97, + 34, 1988, 1925, 1862, 1799, 1736, 1673, 1610, 1547, 1484, 1421, 1358, 1295, +1232, 1169, 1106, 1043, 980, 917, 854, 791, 728, 665, 602, 539, 476, + 413, 350, 287, 224, 161, 98, 35, 1989, 1926, 1863, 1800, 1737, 1674, +1611, 1548, 1485, 1422, 1359, 1296, 1233, 1170, 1107, 1044, 981, 918, 855, + 792, 729, 666, 603, 540, 477, 414, 351, 288, 225, 162, 99, 36, +1990, 1927, 1864, 1801, 1738, 1675, 1612, 1549, 1486, 1423, 1360, 1297, 1234, +1171, 1108, 1045, 982, 919, 856, 793, 730, 667, 604, 541, 478, 415, + 352, 289, 226, 163, 100, 37, 1991, 1928, 1865, 1802, 1739, 1676, 1613, +1550, 1487, 1424, 1361, 1298, 1235, 1172, 1109, 1046, 983, 920, 857, 794, + 731, 668, 605, 542, 479, 416, 353, 290, 227, 164, 101, 38, 1992, +1929, 1866, 1803, 1740, 1677, 1614, 1551, 1488, 1425, 1362, 1299, 1236, 1173, +1110, 1047, 984, 921, 858, 795, 732, 669, 606, 543, 480, 417, 354, + 291, 228, 165, 102, 39, 1993, 1930, 1867, 1804, 1741, 1678, 1615, 1552, +1489, 1426, 1363, 1300, 1237, 1174, 1111, 1048, 985, 922, 859, 796, 733, + 670, 607, 544, 481, 418, 355, 292, 229, 166, 103, 40, 1994, 1931, +1868, 1805, 1742, 1679, 1616, 1553, 1490, 1427, 1364, 1301, 1238, 1175, 1112, +1049, 986, 923, 860, 797, 734, 671, 608, 545, 482, 419, 356, 293, + 230, 167, 104, 41, 1995, 1932, 1869, 1806, 1743, 1680, 1617, 1554, 1491, +1428, 1365, 1302, 1239, 1176, 1113, 1050, 987, 924, 861, 798, 735, 672, + 609, 546, 483, 420, 357, 294, 
231, 168, 105, 42, 1996, 1933, 1870, +1807, 1744, 1681, 1618, 1555, 1492, 1429, 1366, 1303, 1240, 1177, 1114, 1051, + 988, 925, 862, 799, 736, 673, 610, 547, 484, 421, 358, 295, 232, + 169, 106, 43, 1997, 1934, 1871, 1808, 1745, 1682, 1619, 1556, 1493, 1430, +1367, 1304, 1241, 1178, 1115, 1052, 989, 926, 863, 800, 737, 674, 611, + 548, 485, 422, 359, 296, 233, 170, 107, 44, 1998, 1935, 1872, 1809, +1746, 1683, 1620, 1557, 1494, 1431, 1368, 1305, 1242, 1179, 1116, 1053, 990, + 927, 864, 801, 738, 675, 612, 549, 486, 423, 360, 297, 234, 171, + 108, 45, 1999, 1936, 1873, 1810, 1747, 1684, 1621, 1558, 1495, 1432, 1369, +1306, 1243, 1180, 1117, 1054, 991, 928, 865, 802, 739, 676, 613, 550, + 487, 424, 361, 298, 235, 172, 109, 46, 2000, 1937, 1874, 1811, 1748, +1685, 1622, 1559, 1496, 1433, 1370, 1307, 1244, 1181, 1118, 1055, 992, 929, + 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, 110, + 47, 2001, 1938, 1875, 1812, 1749, 1686, 1623, 1560, 1497, 1434, 1371, 1308, +1245, 1182, 1119, 1056, 993, 930, 867, 804, 741, 678, 615, 552, 489, + 426, 363, 300, 237, 174, 111, 48, 2002, 1939, 1876, 1813, 1750, 1687, +1624, 1561, 1498, 1435, 1372, 1309, 1246, 1183, 1120, 1057, 994, 931, 868, + 805, 742, 679, 616, 553, 490, 427, 364, 301, 238, 175, 112, 49, +2003, 1940, 1877, 1814, 1751, 1688, 1625, 1562, 1499, 1436, 1373, 1310, 1247, +1184, 1121, 1058, 995, 932, 869, 806, 743, 680, 617, 554, 491, 428, + 365, 302, 239, 176, 113, 50, 2004, 1941, 1878, 1815, 1752, 1689, 1626, +1563, 1500, 1437, 1374, 1311, 1248, 1185, 1122, 1059, 996, 933, 870, 807, + 744, 681, 618, 555, 492, 429, 366, 303, 240, 177, 114, 51, 2005, +1942, 1879, 1816, 1753, 1690, 1627, 1564, 1501, 1438, 1375, 1312, 1249, 1186, +1123, 1060, 997, 934, 871, 808, 745, 682, 619, 556, 493, 430, 367, + 304, 241, 178, 115, 52, 2006, 1943, 1880, 1817, 1754, 1691, 1628, 1565, +1502, 1439, 1376, 1313, 1250, 1187, 1124, 1061, 998, 935, 872, 809, 746, + 683, 620, 557, 494, 431, 368, 305, 242, 179, 116, 53, 2007, 1944, +1881, 1818, 1755, 1692, 1629, 1566, 1503, 1440, 1377, 1314, 1251, 1188, 1125, +1062, 999, 936, 873, 810, 747, 684, 621, 558, 495, 432, 369, 306, + 243, 180, 117, 54, 2008, 1945, 1882, 1819, 1756, 1693, 1630, 1567, 1504, +1441, 1378, 1315, 1252, 1189, 1126, 1063, 1000, 937, 874, 811, 748, 685, + 622, 559, 496, 433, 370, 307, 244, 181, 118, 55, 2009, 1946, 1883, +1820, 1757, 1694, 1631, 1568, 1505, 1442, 1379, 1316, 1253, 1190, 1127, 1064, +1001, 938, 875, 812, 749, 686, 623, 560, 497, 434, 371, 308, 245, + 182, 119, 56, 2010, 1947, 1884, 1821, 1758, 1695, 1632, 1569, 1506, 1443, +1380, 1317, 1254, 1191, 1128, 1065, 1002, 939, 876, 813, 750, 687, 624, + 561, 498, 435, 372, 309, 246, 183, 120, 57, 2011, 1948, 1885, 1822, +1759, 1696, 1633, 1570, 1507, 1444, 1381, 1318, 1255, 1192, 1129, 1066, 1003, + 940, 877, 814, 751, 688, 625, 562, 499, 436, 373, 310, 247, 184, + 121, 58, 2012, 1949, 1886, 1823, 1760, 1697, 1634, 1571, 1508, 1445, 1382, +1319, 1256, 1193, 1130, 1067, 1004, 941, 878, 815, 752, 689, 626, 563, + 500, 437, 374, 311, 248, 185, 122, 59, 2013, 1950, 1887, 1824, 1761, +1698, 1635, 1572, 1509, 1446, 1383, 1320, 1257, 1194, 1131, 1068, 1005, 942, + 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, 123, + 60, 2014, 1951, 1888, 1825, 1762, 1699, 1636, 1573, 1510, 1447, 1384, 1321, +1258, 1195, 1132, 1069, 1006, 943, 880, 817, 754, 691, 628, 565, 502, + 439, 376, 313, 250, 187, 124, 61, 2015, 1952, 1889, 1826, 1763, 1700, +1637, 1574, 1511, 1448, 1385, 1322, 1259, 1196, 1133, 1070, 1007, 944, 881, + 818, 755, 692, 
629, 566, 503, 440, 377, 314, 251, 188, 125, 62, +2016, 1953, 1890, 1827, 1764, 1701, 1638, 1575, 1512, 1449, 1386, 1323, 1260, +1197, 1134, 1071, 1008, 945, 882, 819, 756, 693, 630, 567, 504, 441, + 378, 315, 252, 189, 126, 63, 2017, 1954, 1891, 1828, 1765, 1702, 1639, +1576, 1513, 1450, 1387, 1324, 1261, 1198, 1135, 1072, 1009, 946, 883, 820, + 757, 694, 631, 568, 505, 442, 379, 316, 253, 190, 127, 2018, 1955, +1892, 1829, 1766, 1703, 1640, 1577, 1514, 1451, 1388, 1325, 1262, 1199, 1136, +1073, 1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, 317, + 254, 191, 2019, 1956, 1893, 1830, 1767, 1704, 1641, 1578, 1515, 1452, 1389, +1326, 1263, 1200, 1137, 1074, 1011, 948, 885, 822, 759, 696, 633, 570, + 507, 444, 381, 318, 255, 2020, 1957, 1894, 1831, 1768, 1705, 1642, 1579, +1516, 1453, 1390, 1327, 1264, 1201, 1138, 1075, 1012, 949, 886, 823, 760, + 697, 634, 571, 508, 445, 382, 319, 2021, 1958, 1895, 1832, 1769, 1706, +1643, 1580, 1517, 1454, 1391, 1328, 1265, 1202, 1139, 1076, 1013, 950, 887, + 824, 761, 698, 635, 572, 509, 446, 383, 2022, 1959, 1896, 1833, 1770, +1707, 1644, 1581, 1518, 1455, 1392, 1329, 1266, 1203, 1140, 1077, 1014, 951, + 888, 825, 762, 699, 636, 573, 510, 447, 2023, 1960, 1897, 1834, 1771, +1708, 1645, 1582, 1519, 1456, 1393, 1330, 1267, 1204, 1141, 1078, 1015, 952, + 889, 826, 763, 700, 637, 574, 511, 2024, 1961, 1898, 1835, 1772, 1709, +1646, 1583, 1520, 1457, 1394, 1331, 1268, 1205, 1142, 1079, 1016, 953, 890, + 827, 764, 701, 638, 575, 2025, 1962, 1899, 1836, 1773, 1710, 1647, 1584, +1521, 1458, 1395, 1332, 1269, 1206, 1143, 1080, 1017, 954, 891, 828, 765, + 702, 639, 2026, 1963, 1900, 1837, 1774, 1711, 1648, 1585, 1522, 1459, 1396, +1333, 1270, 1207, 1144, 1081, 1018, 955, 892, 829, 766, 703, 2027, 1964, +1901, 1838, 1775, 1712, 1649, 1586, 1523, 1460, 1397, 1334, 1271, 1208, 1145, +1082, 1019, 956, 893, 830, 767, 2028, 1965, 1902, 1839, 1776, 1713, 1650, +1587, 1524, 1461, 1398, 1335, 1272, 1209, 1146, 1083, 1020, 957, 894, 831, +2029, 1966, 1903, 1840, 1777, 1714, 1651, 1588, 1525, 1462, 1399, 1336, 1273, +1210, 1147, 1084, 1021, 958, 895, 2030, 1967, 1904, 1841, 1778, 1715, 1652, +1589, 1526, 1463, 1400, 1337, 1274, 1211, 1148, 1085, 1022, 959, 2031, 1968, +1905, 1842, 1779, 1716, 1653, 1590, 1527, 1464, 1401, 1338, 1275, 1212, 1149, +1086, 1023, 2032, 1969, 1906, 1843, 1780, 1717, 1654, 1591, 1528, 1465, 1402, +1339, 1276, 1213, 1150, 1087, 2033, 1970, 1907, 1844, 1781, 1718, 1655, 1592, +1529, 1466, 1403, 1340, 1277, 1214, 1151, 2034, 1971, 1908, 1845, 1782, 1719, +1656, 1593, 1530, 1467, 1404, 1341, 1278, 1215, 2035, 1972, 1909, 1846, 1783, +1720, 1657, 1594, 1531, 1468, 1405, 1342, 1279, 2036, 1973, 1910, 1847, 1784, +1721, 1658, 1595, 1532, 1469, 1406, 1343, 2037, 1974, 1911, 1848, 1785, 1722, +1659, 1596, 1533, 1470, 1407, 2038, 1975, 1912, 1849, 1786, 1723, 1660, 1597, +1534, 1471, 2039, 1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 2040, 1977, +1914, 1851, 1788, 1725, 1662, 1599, 2041, 1978, 1915, 1852, 1789, 1726, 1663, +2042, 1979, 1916, 1853, 1790, 1727, 2043, 1980, 1917, 1854, 1791, 2044, 1981, +1918, 1855, 2045, 1982, 1919, 2046, 1983, 2047, 0, 64, 1, 128, 65, // 64x64 + 2, 192, 129, 66, 3, 256, 193, 130, 67, 4, 320, 257, 194, + 131, 68, 5, 384, 321, 258, 195, 132, 69, 6, 448, 385, 322, + 259, 196, 133, 70, 7, 512, 449, 386, 323, 260, 197, 134, 71, + 8, 576, 513, 450, 387, 324, 261, 198, 135, 72, 9, 640, 577, + 514, 451, 388, 325, 262, 199, 136, 73, 10, 704, 641, 578, 515, + 452, 389, 326, 263, 200, 137, 74, 11, 768, 705, 642, 579, 516, + 
453, 390, 327, 264, 201, 138, 75, 12, 832, 769, 706, 643, 580, + 517, 454, 391, 328, 265, 202, 139, 76, 13, 896, 833, 770, 707, + 644, 581, 518, 455, 392, 329, 266, 203, 140, 77, 14, 960, 897, + 834, 771, 708, 645, 582, 519, 456, 393, 330, 267, 204, 141, 78, + 15, 1024, 961, 898, 835, 772, 709, 646, 583, 520, 457, 394, 331, + 268, 205, 142, 79, 16, 1088, 1025, 962, 899, 836, 773, 710, 647, + 584, 521, 458, 395, 332, 269, 206, 143, 80, 17, 1152, 1089, 1026, + 963, 900, 837, 774, 711, 648, 585, 522, 459, 396, 333, 270, 207, + 144, 81, 18, 1216, 1153, 1090, 1027, 964, 901, 838, 775, 712, 649, + 586, 523, 460, 397, 334, 271, 208, 145, 82, 19, 1280, 1217, 1154, +1091, 1028, 965, 902, 839, 776, 713, 650, 587, 524, 461, 398, 335, + 272, 209, 146, 83, 20, 1344, 1281, 1218, 1155, 1092, 1029, 966, 903, + 840, 777, 714, 651, 588, 525, 462, 399, 336, 273, 210, 147, 84, + 21, 1408, 1345, 1282, 1219, 1156, 1093, 1030, 967, 904, 841, 778, 715, + 652, 589, 526, 463, 400, 337, 274, 211, 148, 85, 22, 1472, 1409, +1346, 1283, 1220, 1157, 1094, 1031, 968, 905, 842, 779, 716, 653, 590, + 527, 464, 401, 338, 275, 212, 149, 86, 23, 1536, 1473, 1410, 1347, +1284, 1221, 1158, 1095, 1032, 969, 906, 843, 780, 717, 654, 591, 528, + 465, 402, 339, 276, 213, 150, 87, 24, 1600, 1537, 1474, 1411, 1348, +1285, 1222, 1159, 1096, 1033, 970, 907, 844, 781, 718, 655, 592, 529, + 466, 403, 340, 277, 214, 151, 88, 25, 1664, 1601, 1538, 1475, 1412, +1349, 1286, 1223, 1160, 1097, 1034, 971, 908, 845, 782, 719, 656, 593, + 530, 467, 404, 341, 278, 215, 152, 89, 26, 1728, 1665, 1602, 1539, +1476, 1413, 1350, 1287, 1224, 1161, 1098, 1035, 972, 909, 846, 783, 720, + 657, 594, 531, 468, 405, 342, 279, 216, 153, 90, 27, 1792, 1729, +1666, 1603, 1540, 1477, 1414, 1351, 1288, 1225, 1162, 1099, 1036, 973, 910, + 847, 784, 721, 658, 595, 532, 469, 406, 343, 280, 217, 154, 91, + 28, 1856, 1793, 1730, 1667, 1604, 1541, 1478, 1415, 1352, 1289, 1226, 1163, +1100, 1037, 974, 911, 848, 785, 722, 659, 596, 533, 470, 407, 344, + 281, 218, 155, 92, 29, 1920, 1857, 1794, 1731, 1668, 1605, 1542, 1479, +1416, 1353, 1290, 1227, 1164, 1101, 1038, 975, 912, 849, 786, 723, 660, + 597, 534, 471, 408, 345, 282, 219, 156, 93, 30, 1984, 1921, 1858, +1795, 1732, 1669, 1606, 1543, 1480, 1417, 1354, 1291, 1228, 1165, 1102, 1039, + 976, 913, 850, 787, 724, 661, 598, 535, 472, 409, 346, 283, 220, + 157, 94, 31, 2048, 1985, 1922, 1859, 1796, 1733, 1670, 1607, 1544, 1481, +1418, 1355, 1292, 1229, 1166, 1103, 1040, 977, 914, 851, 788, 725, 662, + 599, 536, 473, 410, 347, 284, 221, 158, 95, 32, 2112, 2049, 1986, +1923, 1860, 1797, 1734, 1671, 1608, 1545, 1482, 1419, 1356, 1293, 1230, 1167, +1104, 1041, 978, 915, 852, 789, 726, 663, 600, 537, 474, 411, 348, + 285, 222, 159, 96, 33, 2176, 2113, 2050, 1987, 1924, 1861, 1798, 1735, +1672, 1609, 1546, 1483, 1420, 1357, 1294, 1231, 1168, 1105, 1042, 979, 916, + 853, 790, 727, 664, 601, 538, 475, 412, 349, 286, 223, 160, 97, + 34, 2240, 2177, 2114, 2051, 1988, 1925, 1862, 1799, 1736, 1673, 1610, 1547, +1484, 1421, 1358, 1295, 1232, 1169, 1106, 1043, 980, 917, 854, 791, 728, + 665, 602, 539, 476, 413, 350, 287, 224, 161, 98, 35, 2304, 2241, +2178, 2115, 2052, 1989, 1926, 1863, 1800, 1737, 1674, 1611, 1548, 1485, 1422, +1359, 1296, 1233, 1170, 1107, 1044, 981, 918, 855, 792, 729, 666, 603, + 540, 477, 414, 351, 288, 225, 162, 99, 36, 2368, 2305, 2242, 2179, +2116, 2053, 1990, 1927, 1864, 1801, 1738, 1675, 1612, 1549, 1486, 1423, 1360, +1297, 1234, 1171, 1108, 1045, 982, 919, 856, 793, 730, 667, 604, 541, + 478, 415, 352, 289, 
226, 163, 100, 37, 2432, 2369, 2306, 2243, 2180, +2117, 2054, 1991, 1928, 1865, 1802, 1739, 1676, 1613, 1550, 1487, 1424, 1361, +1298, 1235, 1172, 1109, 1046, 983, 920, 857, 794, 731, 668, 605, 542, + 479, 416, 353, 290, 227, 164, 101, 38, 2496, 2433, 2370, 2307, 2244, +2181, 2118, 2055, 1992, 1929, 1866, 1803, 1740, 1677, 1614, 1551, 1488, 1425, +1362, 1299, 1236, 1173, 1110, 1047, 984, 921, 858, 795, 732, 669, 606, + 543, 480, 417, 354, 291, 228, 165, 102, 39, 2560, 2497, 2434, 2371, +2308, 2245, 2182, 2119, 2056, 1993, 1930, 1867, 1804, 1741, 1678, 1615, 1552, +1489, 1426, 1363, 1300, 1237, 1174, 1111, 1048, 985, 922, 859, 796, 733, + 670, 607, 544, 481, 418, 355, 292, 229, 166, 103, 40, 2624, 2561, +2498, 2435, 2372, 2309, 2246, 2183, 2120, 2057, 1994, 1931, 1868, 1805, 1742, +1679, 1616, 1553, 1490, 1427, 1364, 1301, 1238, 1175, 1112, 1049, 986, 923, + 860, 797, 734, 671, 608, 545, 482, 419, 356, 293, 230, 167, 104, + 41, 2688, 2625, 2562, 2499, 2436, 2373, 2310, 2247, 2184, 2121, 2058, 1995, +1932, 1869, 1806, 1743, 1680, 1617, 1554, 1491, 1428, 1365, 1302, 1239, 1176, +1113, 1050, 987, 924, 861, 798, 735, 672, 609, 546, 483, 420, 357, + 294, 231, 168, 105, 42, 2752, 2689, 2626, 2563, 2500, 2437, 2374, 2311, +2248, 2185, 2122, 2059, 1996, 1933, 1870, 1807, 1744, 1681, 1618, 1555, 1492, +1429, 1366, 1303, 1240, 1177, 1114, 1051, 988, 925, 862, 799, 736, 673, + 610, 547, 484, 421, 358, 295, 232, 169, 106, 43, 2816, 2753, 2690, +2627, 2564, 2501, 2438, 2375, 2312, 2249, 2186, 2123, 2060, 1997, 1934, 1871, +1808, 1745, 1682, 1619, 1556, 1493, 1430, 1367, 1304, 1241, 1178, 1115, 1052, + 989, 926, 863, 800, 737, 674, 611, 548, 485, 422, 359, 296, 233, + 170, 107, 44, 2880, 2817, 2754, 2691, 2628, 2565, 2502, 2439, 2376, 2313, +2250, 2187, 2124, 2061, 1998, 1935, 1872, 1809, 1746, 1683, 1620, 1557, 1494, +1431, 1368, 1305, 1242, 1179, 1116, 1053, 990, 927, 864, 801, 738, 675, + 612, 549, 486, 423, 360, 297, 234, 171, 108, 45, 2944, 2881, 2818, +2755, 2692, 2629, 2566, 2503, 2440, 2377, 2314, 2251, 2188, 2125, 2062, 1999, +1936, 1873, 1810, 1747, 1684, 1621, 1558, 1495, 1432, 1369, 1306, 1243, 1180, +1117, 1054, 991, 928, 865, 802, 739, 676, 613, 550, 487, 424, 361, + 298, 235, 172, 109, 46, 3008, 2945, 2882, 2819, 2756, 2693, 2630, 2567, +2504, 2441, 2378, 2315, 2252, 2189, 2126, 2063, 2000, 1937, 1874, 1811, 1748, +1685, 1622, 1559, 1496, 1433, 1370, 1307, 1244, 1181, 1118, 1055, 992, 929, + 866, 803, 740, 677, 614, 551, 488, 425, 362, 299, 236, 173, 110, + 47, 3072, 3009, 2946, 2883, 2820, 2757, 2694, 2631, 2568, 2505, 2442, 2379, +2316, 2253, 2190, 2127, 2064, 2001, 1938, 1875, 1812, 1749, 1686, 1623, 1560, +1497, 1434, 1371, 1308, 1245, 1182, 1119, 1056, 993, 930, 867, 804, 741, + 678, 615, 552, 489, 426, 363, 300, 237, 174, 111, 48, 3136, 3073, +3010, 2947, 2884, 2821, 2758, 2695, 2632, 2569, 2506, 2443, 2380, 2317, 2254, +2191, 2128, 2065, 2002, 1939, 1876, 1813, 1750, 1687, 1624, 1561, 1498, 1435, +1372, 1309, 1246, 1183, 1120, 1057, 994, 931, 868, 805, 742, 679, 616, + 553, 490, 427, 364, 301, 238, 175, 112, 49, 3200, 3137, 3074, 3011, +2948, 2885, 2822, 2759, 2696, 2633, 2570, 2507, 2444, 2381, 2318, 2255, 2192, +2129, 2066, 2003, 1940, 1877, 1814, 1751, 1688, 1625, 1562, 1499, 1436, 1373, +1310, 1247, 1184, 1121, 1058, 995, 932, 869, 806, 743, 680, 617, 554, + 491, 428, 365, 302, 239, 176, 113, 50, 3264, 3201, 3138, 3075, 3012, +2949, 2886, 2823, 2760, 2697, 2634, 2571, 2508, 2445, 2382, 2319, 2256, 2193, +2130, 2067, 2004, 1941, 1878, 1815, 1752, 1689, 1626, 1563, 1500, 1437, 
1374, +1311, 1248, 1185, 1122, 1059, 996, 933, 870, 807, 744, 681, 618, 555, + 492, 429, 366, 303, 240, 177, 114, 51, 3328, 3265, 3202, 3139, 3076, +3013, 2950, 2887, 2824, 2761, 2698, 2635, 2572, 2509, 2446, 2383, 2320, 2257, +2194, 2131, 2068, 2005, 1942, 1879, 1816, 1753, 1690, 1627, 1564, 1501, 1438, +1375, 1312, 1249, 1186, 1123, 1060, 997, 934, 871, 808, 745, 682, 619, + 556, 493, 430, 367, 304, 241, 178, 115, 52, 3392, 3329, 3266, 3203, +3140, 3077, 3014, 2951, 2888, 2825, 2762, 2699, 2636, 2573, 2510, 2447, 2384, +2321, 2258, 2195, 2132, 2069, 2006, 1943, 1880, 1817, 1754, 1691, 1628, 1565, +1502, 1439, 1376, 1313, 1250, 1187, 1124, 1061, 998, 935, 872, 809, 746, + 683, 620, 557, 494, 431, 368, 305, 242, 179, 116, 53, 3456, 3393, +3330, 3267, 3204, 3141, 3078, 3015, 2952, 2889, 2826, 2763, 2700, 2637, 2574, +2511, 2448, 2385, 2322, 2259, 2196, 2133, 2070, 2007, 1944, 1881, 1818, 1755, +1692, 1629, 1566, 1503, 1440, 1377, 1314, 1251, 1188, 1125, 1062, 999, 936, + 873, 810, 747, 684, 621, 558, 495, 432, 369, 306, 243, 180, 117, + 54, 3520, 3457, 3394, 3331, 3268, 3205, 3142, 3079, 3016, 2953, 2890, 2827, +2764, 2701, 2638, 2575, 2512, 2449, 2386, 2323, 2260, 2197, 2134, 2071, 2008, +1945, 1882, 1819, 1756, 1693, 1630, 1567, 1504, 1441, 1378, 1315, 1252, 1189, +1126, 1063, 1000, 937, 874, 811, 748, 685, 622, 559, 496, 433, 370, + 307, 244, 181, 118, 55, 3584, 3521, 3458, 3395, 3332, 3269, 3206, 3143, +3080, 3017, 2954, 2891, 2828, 2765, 2702, 2639, 2576, 2513, 2450, 2387, 2324, +2261, 2198, 2135, 2072, 2009, 1946, 1883, 1820, 1757, 1694, 1631, 1568, 1505, +1442, 1379, 1316, 1253, 1190, 1127, 1064, 1001, 938, 875, 812, 749, 686, + 623, 560, 497, 434, 371, 308, 245, 182, 119, 56, 3648, 3585, 3522, +3459, 3396, 3333, 3270, 3207, 3144, 3081, 3018, 2955, 2892, 2829, 2766, 2703, +2640, 2577, 2514, 2451, 2388, 2325, 2262, 2199, 2136, 2073, 2010, 1947, 1884, +1821, 1758, 1695, 1632, 1569, 1506, 1443, 1380, 1317, 1254, 1191, 1128, 1065, +1002, 939, 876, 813, 750, 687, 624, 561, 498, 435, 372, 309, 246, + 183, 120, 57, 3712, 3649, 3586, 3523, 3460, 3397, 3334, 3271, 3208, 3145, +3082, 3019, 2956, 2893, 2830, 2767, 2704, 2641, 2578, 2515, 2452, 2389, 2326, +2263, 2200, 2137, 2074, 2011, 1948, 1885, 1822, 1759, 1696, 1633, 1570, 1507, +1444, 1381, 1318, 1255, 1192, 1129, 1066, 1003, 940, 877, 814, 751, 688, + 625, 562, 499, 436, 373, 310, 247, 184, 121, 58, 3776, 3713, 3650, +3587, 3524, 3461, 3398, 3335, 3272, 3209, 3146, 3083, 3020, 2957, 2894, 2831, +2768, 2705, 2642, 2579, 2516, 2453, 2390, 2327, 2264, 2201, 2138, 2075, 2012, +1949, 1886, 1823, 1760, 1697, 1634, 1571, 1508, 1445, 1382, 1319, 1256, 1193, +1130, 1067, 1004, 941, 878, 815, 752, 689, 626, 563, 500, 437, 374, + 311, 248, 185, 122, 59, 3840, 3777, 3714, 3651, 3588, 3525, 3462, 3399, +3336, 3273, 3210, 3147, 3084, 3021, 2958, 2895, 2832, 2769, 2706, 2643, 2580, +2517, 2454, 2391, 2328, 2265, 2202, 2139, 2076, 2013, 1950, 1887, 1824, 1761, +1698, 1635, 1572, 1509, 1446, 1383, 1320, 1257, 1194, 1131, 1068, 1005, 942, + 879, 816, 753, 690, 627, 564, 501, 438, 375, 312, 249, 186, 123, + 60, 3904, 3841, 3778, 3715, 3652, 3589, 3526, 3463, 3400, 3337, 3274, 3211, +3148, 3085, 3022, 2959, 2896, 2833, 2770, 2707, 2644, 2581, 2518, 2455, 2392, +2329, 2266, 2203, 2140, 2077, 2014, 1951, 1888, 1825, 1762, 1699, 1636, 1573, +1510, 1447, 1384, 1321, 1258, 1195, 1132, 1069, 1006, 943, 880, 817, 754, + 691, 628, 565, 502, 439, 376, 313, 250, 187, 124, 61, 3968, 3905, +3842, 3779, 3716, 3653, 3590, 3527, 3464, 3401, 3338, 3275, 3212, 3149, 3086, 
+3023, 2960, 2897, 2834, 2771, 2708, 2645, 2582, 2519, 2456, 2393, 2330, 2267, +2204, 2141, 2078, 2015, 1952, 1889, 1826, 1763, 1700, 1637, 1574, 1511, 1448, +1385, 1322, 1259, 1196, 1133, 1070, 1007, 944, 881, 818, 755, 692, 629, + 566, 503, 440, 377, 314, 251, 188, 125, 62, 4032, 3969, 3906, 3843, +3780, 3717, 3654, 3591, 3528, 3465, 3402, 3339, 3276, 3213, 3150, 3087, 3024, +2961, 2898, 2835, 2772, 2709, 2646, 2583, 2520, 2457, 2394, 2331, 2268, 2205, +2142, 2079, 2016, 1953, 1890, 1827, 1764, 1701, 1638, 1575, 1512, 1449, 1386, +1323, 1260, 1197, 1134, 1071, 1008, 945, 882, 819, 756, 693, 630, 567, + 504, 441, 378, 315, 252, 189, 126, 63, 4033, 3970, 3907, 3844, 3781, +3718, 3655, 3592, 3529, 3466, 3403, 3340, 3277, 3214, 3151, 3088, 3025, 2962, +2899, 2836, 2773, 2710, 2647, 2584, 2521, 2458, 2395, 2332, 2269, 2206, 2143, +2080, 2017, 1954, 1891, 1828, 1765, 1702, 1639, 1576, 1513, 1450, 1387, 1324, +1261, 1198, 1135, 1072, 1009, 946, 883, 820, 757, 694, 631, 568, 505, + 442, 379, 316, 253, 190, 127, 4034, 3971, 3908, 3845, 3782, 3719, 3656, +3593, 3530, 3467, 3404, 3341, 3278, 3215, 3152, 3089, 3026, 2963, 2900, 2837, +2774, 2711, 2648, 2585, 2522, 2459, 2396, 2333, 2270, 2207, 2144, 2081, 2018, +1955, 1892, 1829, 1766, 1703, 1640, 1577, 1514, 1451, 1388, 1325, 1262, 1199, +1136, 1073, 1010, 947, 884, 821, 758, 695, 632, 569, 506, 443, 380, + 317, 254, 191, 4035, 3972, 3909, 3846, 3783, 3720, 3657, 3594, 3531, 3468, +3405, 3342, 3279, 3216, 3153, 3090, 3027, 2964, 2901, 2838, 2775, 2712, 2649, +2586, 2523, 2460, 2397, 2334, 2271, 2208, 2145, 2082, 2019, 1956, 1893, 1830, +1767, 1704, 1641, 1578, 1515, 1452, 1389, 1326, 1263, 1200, 1137, 1074, 1011, + 948, 885, 822, 759, 696, 633, 570, 507, 444, 381, 318, 255, 4036, +3973, 3910, 3847, 3784, 3721, 3658, 3595, 3532, 3469, 3406, 3343, 3280, 3217, +3154, 3091, 3028, 2965, 2902, 2839, 2776, 2713, 2650, 2587, 2524, 2461, 2398, +2335, 2272, 2209, 2146, 2083, 2020, 1957, 1894, 1831, 1768, 1705, 1642, 1579, +1516, 1453, 1390, 1327, 1264, 1201, 1138, 1075, 1012, 949, 886, 823, 760, + 697, 634, 571, 508, 445, 382, 319, 4037, 3974, 3911, 3848, 3785, 3722, +3659, 3596, 3533, 3470, 3407, 3344, 3281, 3218, 3155, 3092, 3029, 2966, 2903, +2840, 2777, 2714, 2651, 2588, 2525, 2462, 2399, 2336, 2273, 2210, 2147, 2084, +2021, 1958, 1895, 1832, 1769, 1706, 1643, 1580, 1517, 1454, 1391, 1328, 1265, +1202, 1139, 1076, 1013, 950, 887, 824, 761, 698, 635, 572, 509, 446, + 383, 4038, 3975, 3912, 3849, 3786, 3723, 3660, 3597, 3534, 3471, 3408, 3345, +3282, 3219, 3156, 3093, 3030, 2967, 2904, 2841, 2778, 2715, 2652, 2589, 2526, +2463, 2400, 2337, 2274, 2211, 2148, 2085, 2022, 1959, 1896, 1833, 1770, 1707, +1644, 1581, 1518, 1455, 1392, 1329, 1266, 1203, 1140, 1077, 1014, 951, 888, + 825, 762, 699, 636, 573, 510, 447, 4039, 3976, 3913, 3850, 3787, 3724, +3661, 3598, 3535, 3472, 3409, 3346, 3283, 3220, 3157, 3094, 3031, 2968, 2905, +2842, 2779, 2716, 2653, 2590, 2527, 2464, 2401, 2338, 2275, 2212, 2149, 2086, +2023, 1960, 1897, 1834, 1771, 1708, 1645, 1582, 1519, 1456, 1393, 1330, 1267, +1204, 1141, 1078, 1015, 952, 889, 826, 763, 700, 637, 574, 511, 4040, +3977, 3914, 3851, 3788, 3725, 3662, 3599, 3536, 3473, 3410, 3347, 3284, 3221, +3158, 3095, 3032, 2969, 2906, 2843, 2780, 2717, 2654, 2591, 2528, 2465, 2402, +2339, 2276, 2213, 2150, 2087, 2024, 1961, 1898, 1835, 1772, 1709, 1646, 1583, +1520, 1457, 1394, 1331, 1268, 1205, 1142, 1079, 1016, 953, 890, 827, 764, + 701, 638, 575, 4041, 3978, 3915, 3852, 3789, 3726, 3663, 3600, 3537, 3474, +3411, 3348, 3285, 3222, 
3159, 3096, 3033, 2970, 2907, 2844, 2781, 2718, 2655, +2592, 2529, 2466, 2403, 2340, 2277, 2214, 2151, 2088, 2025, 1962, 1899, 1836, +1773, 1710, 1647, 1584, 1521, 1458, 1395, 1332, 1269, 1206, 1143, 1080, 1017, + 954, 891, 828, 765, 702, 639, 4042, 3979, 3916, 3853, 3790, 3727, 3664, +3601, 3538, 3475, 3412, 3349, 3286, 3223, 3160, 3097, 3034, 2971, 2908, 2845, +2782, 2719, 2656, 2593, 2530, 2467, 2404, 2341, 2278, 2215, 2152, 2089, 2026, +1963, 1900, 1837, 1774, 1711, 1648, 1585, 1522, 1459, 1396, 1333, 1270, 1207, +1144, 1081, 1018, 955, 892, 829, 766, 703, 4043, 3980, 3917, 3854, 3791, +3728, 3665, 3602, 3539, 3476, 3413, 3350, 3287, 3224, 3161, 3098, 3035, 2972, +2909, 2846, 2783, 2720, 2657, 2594, 2531, 2468, 2405, 2342, 2279, 2216, 2153, +2090, 2027, 1964, 1901, 1838, 1775, 1712, 1649, 1586, 1523, 1460, 1397, 1334, +1271, 1208, 1145, 1082, 1019, 956, 893, 830, 767, 4044, 3981, 3918, 3855, +3792, 3729, 3666, 3603, 3540, 3477, 3414, 3351, 3288, 3225, 3162, 3099, 3036, +2973, 2910, 2847, 2784, 2721, 2658, 2595, 2532, 2469, 2406, 2343, 2280, 2217, +2154, 2091, 2028, 1965, 1902, 1839, 1776, 1713, 1650, 1587, 1524, 1461, 1398, +1335, 1272, 1209, 1146, 1083, 1020, 957, 894, 831, 4045, 3982, 3919, 3856, +3793, 3730, 3667, 3604, 3541, 3478, 3415, 3352, 3289, 3226, 3163, 3100, 3037, +2974, 2911, 2848, 2785, 2722, 2659, 2596, 2533, 2470, 2407, 2344, 2281, 2218, +2155, 2092, 2029, 1966, 1903, 1840, 1777, 1714, 1651, 1588, 1525, 1462, 1399, +1336, 1273, 1210, 1147, 1084, 1021, 958, 895, 4046, 3983, 3920, 3857, 3794, +3731, 3668, 3605, 3542, 3479, 3416, 3353, 3290, 3227, 3164, 3101, 3038, 2975, +2912, 2849, 2786, 2723, 2660, 2597, 2534, 2471, 2408, 2345, 2282, 2219, 2156, +2093, 2030, 1967, 1904, 1841, 1778, 1715, 1652, 1589, 1526, 1463, 1400, 1337, +1274, 1211, 1148, 1085, 1022, 959, 4047, 3984, 3921, 3858, 3795, 3732, 3669, +3606, 3543, 3480, 3417, 3354, 3291, 3228, 3165, 3102, 3039, 2976, 2913, 2850, +2787, 2724, 2661, 2598, 2535, 2472, 2409, 2346, 2283, 2220, 2157, 2094, 2031, +1968, 1905, 1842, 1779, 1716, 1653, 1590, 1527, 1464, 1401, 1338, 1275, 1212, +1149, 1086, 1023, 4048, 3985, 3922, 3859, 3796, 3733, 3670, 3607, 3544, 3481, +3418, 3355, 3292, 3229, 3166, 3103, 3040, 2977, 2914, 2851, 2788, 2725, 2662, +2599, 2536, 2473, 2410, 2347, 2284, 2221, 2158, 2095, 2032, 1969, 1906, 1843, +1780, 1717, 1654, 1591, 1528, 1465, 1402, 1339, 1276, 1213, 1150, 1087, 4049, +3986, 3923, 3860, 3797, 3734, 3671, 3608, 3545, 3482, 3419, 3356, 3293, 3230, +3167, 3104, 3041, 2978, 2915, 2852, 2789, 2726, 2663, 2600, 2537, 2474, 2411, +2348, 2285, 2222, 2159, 2096, 2033, 1970, 1907, 1844, 1781, 1718, 1655, 1592, +1529, 1466, 1403, 1340, 1277, 1214, 1151, 4050, 3987, 3924, 3861, 3798, 3735, +3672, 3609, 3546, 3483, 3420, 3357, 3294, 3231, 3168, 3105, 3042, 2979, 2916, +2853, 2790, 2727, 2664, 2601, 2538, 2475, 2412, 2349, 2286, 2223, 2160, 2097, +2034, 1971, 1908, 1845, 1782, 1719, 1656, 1593, 1530, 1467, 1404, 1341, 1278, +1215, 4051, 3988, 3925, 3862, 3799, 3736, 3673, 3610, 3547, 3484, 3421, 3358, +3295, 3232, 3169, 3106, 3043, 2980, 2917, 2854, 2791, 2728, 2665, 2602, 2539, +2476, 2413, 2350, 2287, 2224, 2161, 2098, 2035, 1972, 1909, 1846, 1783, 1720, +1657, 1594, 1531, 1468, 1405, 1342, 1279, 4052, 3989, 3926, 3863, 3800, 3737, +3674, 3611, 3548, 3485, 3422, 3359, 3296, 3233, 3170, 3107, 3044, 2981, 2918, +2855, 2792, 2729, 2666, 2603, 2540, 2477, 2414, 2351, 2288, 2225, 2162, 2099, +2036, 1973, 1910, 1847, 1784, 1721, 1658, 1595, 1532, 1469, 1406, 1343, 4053, +3990, 3927, 3864, 3801, 3738, 3675, 3612, 
3549, 3486, 3423, 3360, 3297, 3234, +3171, 3108, 3045, 2982, 2919, 2856, 2793, 2730, 2667, 2604, 2541, 2478, 2415, +2352, 2289, 2226, 2163, 2100, 2037, 1974, 1911, 1848, 1785, 1722, 1659, 1596, +1533, 1470, 1407, 4054, 3991, 3928, 3865, 3802, 3739, 3676, 3613, 3550, 3487, +3424, 3361, 3298, 3235, 3172, 3109, 3046, 2983, 2920, 2857, 2794, 2731, 2668, +2605, 2542, 2479, 2416, 2353, 2290, 2227, 2164, 2101, 2038, 1975, 1912, 1849, +1786, 1723, 1660, 1597, 1534, 1471, 4055, 3992, 3929, 3866, 3803, 3740, 3677, +3614, 3551, 3488, 3425, 3362, 3299, 3236, 3173, 3110, 3047, 2984, 2921, 2858, +2795, 2732, 2669, 2606, 2543, 2480, 2417, 2354, 2291, 2228, 2165, 2102, 2039, +1976, 1913, 1850, 1787, 1724, 1661, 1598, 1535, 4056, 3993, 3930, 3867, 3804, +3741, 3678, 3615, 3552, 3489, 3426, 3363, 3300, 3237, 3174, 3111, 3048, 2985, +2922, 2859, 2796, 2733, 2670, 2607, 2544, 2481, 2418, 2355, 2292, 2229, 2166, +2103, 2040, 1977, 1914, 1851, 1788, 1725, 1662, 1599, 4057, 3994, 3931, 3868, +3805, 3742, 3679, 3616, 3553, 3490, 3427, 3364, 3301, 3238, 3175, 3112, 3049, +2986, 2923, 2860, 2797, 2734, 2671, 2608, 2545, 2482, 2419, 2356, 2293, 2230, +2167, 2104, 2041, 1978, 1915, 1852, 1789, 1726, 1663, 4058, 3995, 3932, 3869, +3806, 3743, 3680, 3617, 3554, 3491, 3428, 3365, 3302, 3239, 3176, 3113, 3050, +2987, 2924, 2861, 2798, 2735, 2672, 2609, 2546, 2483, 2420, 2357, 2294, 2231, +2168, 2105, 2042, 1979, 1916, 1853, 1790, 1727, 4059, 3996, 3933, 3870, 3807, +3744, 3681, 3618, 3555, 3492, 3429, 3366, 3303, 3240, 3177, 3114, 3051, 2988, +2925, 2862, 2799, 2736, 2673, 2610, 2547, 2484, 2421, 2358, 2295, 2232, 2169, +2106, 2043, 1980, 1917, 1854, 1791, 4060, 3997, 3934, 3871, 3808, 3745, 3682, +3619, 3556, 3493, 3430, 3367, 3304, 3241, 3178, 3115, 3052, 2989, 2926, 2863, +2800, 2737, 2674, 2611, 2548, 2485, 2422, 2359, 2296, 2233, 2170, 2107, 2044, +1981, 1918, 1855, 4061, 3998, 3935, 3872, 3809, 3746, 3683, 3620, 3557, 3494, +3431, 3368, 3305, 3242, 3179, 3116, 3053, 2990, 2927, 2864, 2801, 2738, 2675, +2612, 2549, 2486, 2423, 2360, 2297, 2234, 2171, 2108, 2045, 1982, 1919, 4062, +3999, 3936, 3873, 3810, 3747, 3684, 3621, 3558, 3495, 3432, 3369, 3306, 3243, +3180, 3117, 3054, 2991, 2928, 2865, 2802, 2739, 2676, 2613, 2550, 2487, 2424, +2361, 2298, 2235, 2172, 2109, 2046, 1983, 4063, 4000, 3937, 3874, 3811, 3748, +3685, 3622, 3559, 3496, 3433, 3370, 3307, 3244, 3181, 3118, 3055, 2992, 2929, +2866, 2803, 2740, 2677, 2614, 2551, 2488, 2425, 2362, 2299, 2236, 2173, 2110, +2047, 4064, 4001, 3938, 3875, 3812, 3749, 3686, 3623, 3560, 3497, 3434, 3371, +3308, 3245, 3182, 3119, 3056, 2993, 2930, 2867, 2804, 2741, 2678, 2615, 2552, +2489, 2426, 2363, 2300, 2237, 2174, 2111, 4065, 4002, 3939, 3876, 3813, 3750, +3687, 3624, 3561, 3498, 3435, 3372, 3309, 3246, 3183, 3120, 3057, 2994, 2931, +2868, 2805, 2742, 2679, 2616, 2553, 2490, 2427, 2364, 2301, 2238, 2175, 4066, +4003, 3940, 3877, 3814, 3751, 3688, 3625, 3562, 3499, 3436, 3373, 3310, 3247, +3184, 3121, 3058, 2995, 2932, 2869, 2806, 2743, 2680, 2617, 2554, 2491, 2428, +2365, 2302, 2239, 4067, 4004, 3941, 3878, 3815, 3752, 3689, 3626, 3563, 3500, +3437, 3374, 3311, 3248, 3185, 3122, 3059, 2996, 2933, 2870, 2807, 2744, 2681, +2618, 2555, 2492, 2429, 2366, 2303, 4068, 4005, 3942, 3879, 3816, 3753, 3690, +3627, 3564, 3501, 3438, 3375, 3312, 3249, 3186, 3123, 3060, 2997, 2934, 2871, +2808, 2745, 2682, 2619, 2556, 2493, 2430, 2367, 4069, 4006, 3943, 3880, 3817, +3754, 3691, 3628, 3565, 3502, 3439, 3376, 3313, 3250, 3187, 3124, 3061, 2998, +2935, 2872, 2809, 2746, 2683, 2620, 2557, 
2494, 2431, 4070, 4007, 3944, 3881, +3818, 3755, 3692, 3629, 3566, 3503, 3440, 3377, 3314, 3251, 3188, 3125, 3062, +2999, 2936, 2873, 2810, 2747, 2684, 2621, 2558, 2495, 4071, 4008, 3945, 3882, +3819, 3756, 3693, 3630, 3567, 3504, 3441, 3378, 3315, 3252, 3189, 3126, 3063, +3000, 2937, 2874, 2811, 2748, 2685, 2622, 2559, 4072, 4009, 3946, 3883, 3820, +3757, 3694, 3631, 3568, 3505, 3442, 3379, 3316, 3253, 3190, 3127, 3064, 3001, +2938, 2875, 2812, 2749, 2686, 2623, 4073, 4010, 3947, 3884, 3821, 3758, 3695, +3632, 3569, 3506, 3443, 3380, 3317, 3254, 3191, 3128, 3065, 3002, 2939, 2876, +2813, 2750, 2687, 4074, 4011, 3948, 3885, 3822, 3759, 3696, 3633, 3570, 3507, +3444, 3381, 3318, 3255, 3192, 3129, 3066, 3003, 2940, 2877, 2814, 2751, 4075, +4012, 3949, 3886, 3823, 3760, 3697, 3634, 3571, 3508, 3445, 3382, 3319, 3256, +3193, 3130, 3067, 3004, 2941, 2878, 2815, 4076, 4013, 3950, 3887, 3824, 3761, +3698, 3635, 3572, 3509, 3446, 3383, 3320, 3257, 3194, 3131, 3068, 3005, 2942, +2879, 4077, 4014, 3951, 3888, 3825, 3762, 3699, 3636, 3573, 3510, 3447, 3384, +3321, 3258, 3195, 3132, 3069, 3006, 2943, 4078, 4015, 3952, 3889, 3826, 3763, +3700, 3637, 3574, 3511, 3448, 3385, 3322, 3259, 3196, 3133, 3070, 3007, 4079, +4016, 3953, 3890, 3827, 3764, 3701, 3638, 3575, 3512, 3449, 3386, 3323, 3260, +3197, 3134, 3071, 4080, 4017, 3954, 3891, 3828, 3765, 3702, 3639, 3576, 3513, +3450, 3387, 3324, 3261, 3198, 3135, 4081, 4018, 3955, 3892, 3829, 3766, 3703, +3640, 3577, 3514, 3451, 3388, 3325, 3262, 3199, 4082, 4019, 3956, 3893, 3830, +3767, 3704, 3641, 3578, 3515, 3452, 3389, 3326, 3263, 4083, 4020, 3957, 3894, +3831, 3768, 3705, 3642, 3579, 3516, 3453, 3390, 3327, 4084, 4021, 3958, 3895, +3832, 3769, 3706, 3643, 3580, 3517, 3454, 3391, 4085, 4022, 3959, 3896, 3833, +3770, 3707, 3644, 3581, 3518, 3455, 4086, 4023, 3960, 3897, 3834, 3771, 3708, +3645, 3582, 3519, 4087, 4024, 3961, 3898, 3835, 3772, 3709, 3646, 3583, 4088, +4025, 3962, 3899, 3836, 3773, 3710, 3647, 4089, 4026, 3963, 3900, 3837, 3774, +3711, 4090, 4027, 3964, 3901, 3838, 3775, 4091, 4028, 3965, 3902, 3839, 4092, +4029, 3966, 3903, 4093, 4030, 3967, 4094, 4031, 4095, 0, 0, 1, 0, // 4x4 GROUPED 1xN, 1x2, 1x4 + 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, // 1x8, 1x16 + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 1x32 + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, // 1x64 + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 0, 1, 0, 2, 1, 3, 0, // 2xN, 2x2, 2x4 + 2, 1, 3, 4, 6, 5, 7, 0, 2, 1, 4, 3, 6, // 2x8 + 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, 0, 2, 1, // 2x16 + 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 15, + 16, 18, 17, 20, 19, 22, 21, 24, 23, 26, 25, 28, 27, + 30, 29, 31, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, // 2x32 + 9, 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, + 24, 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, + 35, 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, + 50, 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, + 61, 63, 0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, // 2x64 + 12, 11, 14, 13, 15, 16, 18, 17, 20, 19, 22, 21, 24, + 23, 26, 25, 28, 27, 30, 29, 31, 32, 34, 33, 36, 35, + 38, 37, 40, 39, 42, 41, 44, 43, 46, 45, 47, 48, 50, + 49, 52, 51, 54, 53, 56, 55, 58, 57, 60, 59, 62, 61, + 63, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 
127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 0, 1, 2, 3, 0, 4, 1, 5, 2, 6, 3, 7, 0, // 4xN, 4x2, 4x4 + 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, + 11, 15, 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, // 4x8 + 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, 18, 28, 25, + 22, 19, 29, 26, 23, 30, 27, 31, 0, 4, 1, 8, 5, // 4x16 + 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, + 17, 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, + 31, 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, + 39, 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, + 51, 61, 58, 55, 62, 59, 63, 0, 4, 1, 8, 5, 2, // 4x32 + 12, 9, 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, + 24, 21, 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, + 32, 36, 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, + 46, 43, 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, 51, + 61, 58, 55, 62, 59, 63, 64, 68, 65, 72, 69, 66, 76, + 73, 70, 67, 77, 74, 71, 78, 75, 79, 80, 84, 81, 88, + 85, 82, 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, + 100, 97, 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, + 107, 111, 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, + 122, 119, 126, 123, 127, 0, 4, 1, 8, 5, 2, 12, 9, // 4x64 + 6, 3, 13, 10, 7, 14, 11, 15, 16, 20, 17, 24, 21, + 18, 28, 25, 22, 19, 29, 26, 23, 30, 27, 31, 32, 36, + 33, 40, 37, 34, 44, 41, 38, 35, 45, 42, 39, 46, 43, + 47, 48, 52, 49, 56, 53, 50, 60, 57, 54, 51, 61, 58, + 55, 62, 59, 63, 64, 68, 65, 72, 69, 66, 76, 73, 70, + 67, 77, 74, 71, 78, 75, 79, 80, 84, 81, 88, 85, 82, + 92, 89, 86, 83, 93, 90, 87, 94, 91, 95, 96, 100, 97, + 104, 101, 98, 108, 105, 102, 99, 109, 106, 103, 110, 107, 111, + 112, 116, 113, 120, 117, 114, 124, 121, 118, 115, 125, 122, 119, + 126, 123, 127, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 0, 8, 1, 9, // 8xN, 8x2 + 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, // 8x4 + 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, + 19, 27, 4, 12, 5, 20, 13, 6, 28, 21, 14, 7, 29, + 22, 15, 30, 23, 31, 0, 8, 1, 16, 9, 2, 24, 17, // 8x8 + 10, 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, + 34, 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, + 5, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, + 31, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, + 47, 62, 55, 63, 0, 8, 1, 16, 9, 2, 24, 17, 10, // 8x16 + 3, 25, 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, + 56, 49, 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, + 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, + 64, 72, 65, 80, 73, 66, 88, 81, 74, 67, 89, 82, 75, + 90, 83, 91, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, + 61, 54, 47, 62, 55, 63, 96, 104, 97, 112, 105, 98, 120, + 113, 106, 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, + 77, 70, 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 100, + 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, 
126, + 119, 127, 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, // 8x32 + 18, 11, 26, 19, 27, 32, 40, 33, 48, 41, 34, 56, 49, + 42, 35, 57, 50, 43, 58, 51, 59, 4, 12, 5, 20, 13, + 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, 64, 72, + 65, 80, 73, 66, 88, 81, 74, 67, 89, 82, 75, 90, 83, + 91, 36, 44, 37, 52, 45, 38, 60, 53, 46, 39, 61, 54, + 47, 62, 55, 63, 96, 104, 97, 112, 105, 98, 120, 113, 106, + 99, 121, 114, 107, 122, 115, 123, 68, 76, 69, 84, 77, 70, + 92, 85, 78, 71, 93, 86, 79, 94, 87, 95, 128, 136, 129, + 144, 137, 130, 152, 145, 138, 131, 153, 146, 139, 154, 147, 155, + 100, 108, 101, 116, 109, 102, 124, 117, 110, 103, 125, 118, 111, + 126, 119, 127, 160, 168, 161, 176, 169, 162, 184, 177, 170, 163, + 185, 178, 171, 186, 179, 187, 132, 140, 133, 148, 141, 134, 156, + 149, 142, 135, 157, 150, 143, 158, 151, 159, 192, 200, 193, 208, + 201, 194, 216, 209, 202, 195, 217, 210, 203, 218, 211, 219, 164, + 172, 165, 180, 173, 166, 188, 181, 174, 167, 189, 182, 175, 190, + 183, 191, 224, 232, 225, 240, 233, 226, 248, 241, 234, 227, 249, + 242, 235, 250, 243, 251, 196, 204, 197, 212, 205, 198, 220, 213, + 206, 199, 221, 214, 207, 222, 215, 223, 228, 236, 229, 244, 237, + 230, 252, 245, 238, 231, 253, 246, 239, 254, 247, 255, 0, 8, // 8x64 + 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 26, 19, + 27, 32, 40, 33, 48, 41, 34, 56, 49, 42, 35, 57, 50, + 43, 58, 51, 59, 4, 12, 5, 20, 13, 6, 28, 21, 14, + 7, 29, 22, 15, 30, 23, 31, 64, 72, 65, 80, 73, 66, + 88, 81, 74, 67, 89, 82, 75, 90, 83, 91, 36, 44, 37, + 52, 45, 38, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63, + 96, 104, 97, 112, 105, 98, 120, 113, 106, 99, 121, 114, 107, + 122, 115, 123, 68, 76, 69, 84, 77, 70, 92, 85, 78, 71, + 93, 86, 79, 94, 87, 95, 128, 136, 129, 144, 137, 130, 152, + 145, 138, 131, 153, 146, 139, 154, 147, 155, 100, 108, 101, 116, + 109, 102, 124, 117, 110, 103, 125, 118, 111, 126, 119, 127, 160, + 168, 161, 176, 169, 162, 184, 177, 170, 163, 185, 178, 171, 186, + 179, 187, 132, 140, 133, 148, 141, 134, 156, 149, 142, 135, 157, + 150, 143, 158, 151, 159, 192, 200, 193, 208, 201, 194, 216, 209, + 202, 195, 217, 210, 203, 218, 211, 219, 164, 172, 165, 180, 173, + 166, 188, 181, 174, 167, 189, 182, 175, 190, 183, 191, 224, 232, + 225, 240, 233, 226, 248, 241, 234, 227, 249, 242, 235, 250, 243, + 251, 196, 204, 197, 212, 205, 198, 220, 213, 206, 199, 221, 214, + 207, 222, 215, 223, 228, 236, 229, 244, 237, 230, 252, 245, 238, + 231, 253, 246, 239, 254, 247, 255, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 
511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 16xN + 10, 11, 12, 13, 14, 15, 0, 16, 1, 17, 2, 18, 3, // 16x2 + 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, + 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, 0, // 16x4 + 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, + 35, 51, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, + 38, 23, 54, 39, 55, 8, 24, 9, 40, 25, 10, 56, 41, + 26, 11, 57, 42, 27, 58, 43, 59, 12, 28, 13, 44, 29, + 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, 0, 16, // 16x8 + 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, + 51, 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, + 83, 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, + 7, 53, 38, 23, 54, 39, 55, 68, 84, 69, 100, 85, 70, + 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, 24, 9, + 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, + 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, 121, 106, 91, + 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, 45, 30, 15, + 61, 46, 31, 62, 47, 63, 76, 92, 77, 108, 93, 78, 124, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 0, 16, 1, 32, // 16x16 + 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, 64, + 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, 114, + 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, 53, + 38, 23, 54, 39, 55, 128, 144, 129, 160, 145, 130, 176, 161, + 146, 131, 177, 162, 147, 178, 163, 179, 68, 84, 69, 100, 85, + 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, 24, + 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, 43, + 59, 192, 208, 193, 224, 209, 194, 240, 225, 210, 195, 241, 226, + 211, 242, 227, 243, 132, 148, 133, 164, 149, 134, 180, 165, 150, + 135, 181, 166, 151, 182, 167, 183, 72, 88, 73, 104, 89, 74, + 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, 13, + 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, + 196, 212, 197, 228, 213, 198, 244, 229, 214, 199, 245, 230, 215, + 246, 231, 247, 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, + 185, 170, 155, 186, 171, 187, 76, 92, 77, 108, 93, 78, 124, + 109, 94, 79, 125, 110, 95, 126, 111, 127, 200, 216, 201, 232, + 217, 202, 248, 233, 218, 203, 249, 234, 219, 250, 235, 251, 140, + 156, 141, 172, 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, + 175, 191, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, 253, + 238, 223, 254, 239, 255, 0, 16, 1, 32, 17, 2, 48, 33, // 16x32 + 18, 3, 49, 34, 19, 50, 35, 51, 64, 80, 65, 96, 81, + 66, 112, 97, 82, 67, 113, 98, 83, 114, 99, 115, 4, 20, + 5, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 54, 39, + 55, 128, 144, 129, 160, 145, 130, 176, 161, 146, 131, 177, 162, + 147, 178, 163, 179, 68, 84, 69, 100, 85, 70, 116, 101, 86, + 71, 117, 102, 87, 118, 103, 119, 8, 24, 9, 40, 25, 10, + 56, 41, 26, 11, 57, 42, 27, 58, 43, 59, 192, 208, 193, + 224, 209, 194, 240, 225, 210, 195, 241, 226, 211, 242, 227, 243, + 132, 148, 133, 164, 149, 134, 180, 165, 150, 135, 181, 166, 151, + 182, 167, 183, 72, 88, 73, 104, 89, 74, 120, 105, 90, 75, + 121, 106, 91, 122, 107, 123, 12, 28, 13, 44, 29, 14, 60, + 45, 30, 15, 61, 46, 31, 62, 47, 63, 256, 272, 257, 288, + 273, 258, 304, 289, 274, 259, 305, 290, 275, 306, 291, 307, 196, + 212, 197, 228, 213, 198, 244, 229, 214, 199, 245, 230, 215, 246, + 231, 247, 136, 152, 137, 168, 153, 138, 184, 169, 154, 139, 185, + 170, 155, 186, 171, 187, 76, 92, 77, 108, 93, 78, 124, 109, + 94, 79, 125, 110, 95, 126, 111, 127, 320, 336, 321, 352, 337, + 322, 368, 353, 338, 
323, 369, 354, 339, 370, 355, 371, 260, 276, + 261, 292, 277, 262, 308, 293, 278, 263, 309, 294, 279, 310, 295, + 311, 200, 216, 201, 232, 217, 202, 248, 233, 218, 203, 249, 234, + 219, 250, 235, 251, 140, 156, 141, 172, 157, 142, 188, 173, 158, + 143, 189, 174, 159, 190, 175, 191, 384, 400, 385, 416, 401, 386, + 432, 417, 402, 387, 433, 418, 403, 434, 419, 435, 324, 340, 325, + 356, 341, 326, 372, 357, 342, 327, 373, 358, 343, 374, 359, 375, + 264, 280, 265, 296, 281, 266, 312, 297, 282, 267, 313, 298, 283, + 314, 299, 315, 204, 220, 205, 236, 221, 206, 252, 237, 222, 207, + 253, 238, 223, 254, 239, 255, 448, 464, 449, 480, 465, 450, 496, + 481, 466, 451, 497, 482, 467, 498, 483, 499, 388, 404, 389, 420, + 405, 390, 436, 421, 406, 391, 437, 422, 407, 438, 423, 439, 328, + 344, 329, 360, 345, 330, 376, 361, 346, 331, 377, 362, 347, 378, + 363, 379, 268, 284, 269, 300, 285, 270, 316, 301, 286, 271, 317, + 302, 287, 318, 303, 319, 452, 468, 453, 484, 469, 454, 500, 485, + 470, 455, 501, 486, 471, 502, 487, 503, 392, 408, 393, 424, 409, + 394, 440, 425, 410, 395, 441, 426, 411, 442, 427, 443, 332, 348, + 333, 364, 349, 334, 380, 365, 350, 335, 381, 366, 351, 382, 367, + 383, 456, 472, 457, 488, 473, 458, 504, 489, 474, 459, 505, 490, + 475, 506, 491, 507, 396, 412, 397, 428, 413, 398, 444, 429, 414, + 399, 445, 430, 415, 446, 431, 447, 460, 476, 461, 492, 477, 462, + 508, 493, 478, 463, 509, 494, 479, 510, 495, 511, 0, 16, 1, // 16x64 + 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 50, 35, 51, + 64, 80, 65, 96, 81, 66, 112, 97, 82, 67, 113, 98, 83, + 114, 99, 115, 4, 20, 5, 36, 21, 6, 52, 37, 22, 7, + 53, 38, 23, 54, 39, 55, 128, 144, 129, 160, 145, 130, 176, + 161, 146, 131, 177, 162, 147, 178, 163, 179, 68, 84, 69, 100, + 85, 70, 116, 101, 86, 71, 117, 102, 87, 118, 103, 119, 8, + 24, 9, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 58, + 43, 59, 192, 208, 193, 224, 209, 194, 240, 225, 210, 195, 241, + 226, 211, 242, 227, 243, 132, 148, 133, 164, 149, 134, 180, 165, + 150, 135, 181, 166, 151, 182, 167, 183, 72, 88, 73, 104, 89, + 74, 120, 105, 90, 75, 121, 106, 91, 122, 107, 123, 12, 28, + 13, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, + 63, 256, 272, 257, 288, 273, 258, 304, 289, 274, 259, 305, 290, + 275, 306, 291, 307, 196, 212, 197, 228, 213, 198, 244, 229, 214, + 199, 245, 230, 215, 246, 231, 247, 136, 152, 137, 168, 153, 138, + 184, 169, 154, 139, 185, 170, 155, 186, 171, 187, 76, 92, 77, + 108, 93, 78, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, + 320, 336, 321, 352, 337, 322, 368, 353, 338, 323, 369, 354, 339, + 370, 355, 371, 260, 276, 261, 292, 277, 262, 308, 293, 278, 263, + 309, 294, 279, 310, 295, 311, 200, 216, 201, 232, 217, 202, 248, + 233, 218, 203, 249, 234, 219, 250, 235, 251, 140, 156, 141, 172, + 157, 142, 188, 173, 158, 143, 189, 174, 159, 190, 175, 191, 384, + 400, 385, 416, 401, 386, 432, 417, 402, 387, 433, 418, 403, 434, + 419, 435, 324, 340, 325, 356, 341, 326, 372, 357, 342, 327, 373, + 358, 343, 374, 359, 375, 264, 280, 265, 296, 281, 266, 312, 297, + 282, 267, 313, 298, 283, 314, 299, 315, 204, 220, 205, 236, 221, + 206, 252, 237, 222, 207, 253, 238, 223, 254, 239, 255, 448, 464, + 449, 480, 465, 450, 496, 481, 466, 451, 497, 482, 467, 498, 483, + 499, 388, 404, 389, 420, 405, 390, 436, 421, 406, 391, 437, 422, + 407, 438, 423, 439, 328, 344, 329, 360, 345, 330, 376, 361, 346, + 331, 377, 362, 347, 378, 363, 379, 268, 284, 269, 300, 285, 270, + 316, 301, 286, 271, 317, 302, 287, 318, 303, 319, 452, 468, 453, + 484, 469, 454, 500, 485, 470, 455, 501, 486, 471, 502, 
487, 503, + 392, 408, 393, 424, 409, 394, 440, 425, 410, 395, 441, 426, 411, + 442, 427, 443, 332, 348, 333, 364, 349, 334, 380, 365, 350, 335, + 381, 366, 351, 382, 367, 383, 456, 472, 457, 488, 473, 458, 504, + 489, 474, 459, 505, 490, 475, 506, 491, 507, 396, 412, 397, 428, + 413, 398, 444, 429, 414, 399, 445, 430, 415, 446, 431, 447, 460, + 476, 461, 492, 477, 462, 508, 493, 478, 463, 509, 494, 479, 510, + 495, 511, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 1, 2, 3, 4, 5, 
// 32xN + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, // 32x2 + 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, + 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, + 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, + 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 0, // 32x4 + 32, 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, + 67, 99, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, + 70, 39, 102, 71, 103, 8, 40, 9, 72, 41, 10, 104, 73, + 42, 11, 105, 74, 43, 106, 75, 107, 12, 44, 13, 76, 45, + 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 16, 48, + 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, + 115, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, + 55, 118, 87, 119, 24, 56, 25, 88, 57, 26, 120, 89, 58, + 27, 121, 90, 59, 122, 91, 123, 28, 60, 29, 92, 61, 30, + 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 0, 32, 1, // 32x8 + 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, + 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, + 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, + 101, 70, 39, 102, 71, 103, 132, 164, 133, 196, 165, 134, 228, + 197, 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, 72, + 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, 136, + 168, 137, 200, 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, + 203, 235, 12, 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, + 78, 47, 110, 79, 111, 140, 172, 141, 204, 173, 142, 236, 205, + 174, 143, 237, 206, 175, 238, 207, 239, 16, 48, 17, 80, 49, + 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, 115, 144, 176, + 145, 208, 177, 146, 240, 209, 178, 147, 241, 210, 179, 242, 211, + 243, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, 117, 86, + 55, 118, 87, 119, 148, 180, 149, 212, 181, 150, 244, 213, 182, + 151, 245, 214, 183, 246, 215, 247, 24, 56, 25, 88, 57, 26, + 120, 89, 58, 27, 121, 90, 59, 122, 91, 123, 152, 184, 153, + 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, + 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, + 126, 95, 127, 156, 188, 157, 220, 189, 158, 252, 221, 190, 159, + 253, 222, 191, 254, 223, 255, 0, 32, 1, 64, 33, 2, 96, // 32x16 + 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, 129, 192, + 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, 227, 4, + 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, 39, 102, + 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, 259, 353, + 322, 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, 228, 197, + 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, 72, 41, + 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, 384, 416, + 385, 448, 417, 386, 480, 449, 418, 387, 481, 450, 419, 482, 451, + 483, 260, 292, 261, 324, 293, 262, 356, 325, 294, 263, 357, 326, + 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, 201, 170, + 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, 45, 14, + 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 388, 420, 389, + 452, 421, 390, 484, 453, 422, 391, 485, 454, 423, 486, 455, 487, + 264, 296, 265, 328, 297, 266, 360, 329, 298, 267, 361, 330, 299, + 362, 331, 363, 140, 172, 141, 204, 173, 142, 236, 205, 174, 143, + 237, 206, 175, 238, 207, 239, 16, 48, 17, 80, 49, 18, 112, + 81, 50, 19, 113, 82, 51, 114, 83, 115, 392, 424, 393, 456, + 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, + 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, + 335, 367, 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, 241, + 210, 179, 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, 85, + 54, 23, 117, 86, 55, 118, 87, 
119, 396, 428, 397, 460, 429, + 398, 492, 461, 430, 399, 493, 462, 431, 494, 463, 495, 272, 304, + 273, 336, 305, 274, 368, 337, 306, 275, 369, 338, 307, 370, 339, + 371, 148, 180, 149, 212, 181, 150, 244, 213, 182, 151, 245, 214, + 183, 246, 215, 247, 24, 56, 25, 88, 57, 26, 120, 89, 58, + 27, 121, 90, 59, 122, 91, 123, 400, 432, 401, 464, 433, 402, + 496, 465, 434, 403, 497, 466, 435, 498, 467, 499, 276, 308, 277, + 340, 309, 278, 372, 341, 310, 279, 373, 342, 311, 374, 343, 375, + 152, 184, 153, 216, 185, 154, 248, 217, 186, 155, 249, 218, 187, + 250, 219, 251, 28, 60, 29, 92, 61, 30, 124, 93, 62, 31, + 125, 94, 63, 126, 95, 127, 404, 436, 405, 468, 437, 406, 500, + 469, 438, 407, 501, 470, 439, 502, 471, 503, 280, 312, 281, 344, + 313, 282, 376, 345, 314, 283, 377, 346, 315, 378, 347, 379, 156, + 188, 157, 220, 189, 158, 252, 221, 190, 159, 253, 222, 191, 254, + 223, 255, 408, 440, 409, 472, 441, 410, 504, 473, 442, 411, 505, + 474, 443, 506, 475, 507, 284, 316, 285, 348, 317, 286, 380, 349, + 318, 287, 381, 350, 319, 382, 351, 383, 412, 444, 413, 476, 445, + 414, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, 0, 32, // 32x32 + 1, 64, 33, 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, + 99, 128, 160, 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, + 163, 226, 195, 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, + 7, 101, 70, 39, 102, 71, 103, 256, 288, 257, 320, 289, 258, + 352, 321, 290, 259, 353, 322, 291, 354, 323, 355, 132, 164, 133, + 196, 165, 134, 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, + 8, 40, 9, 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, + 106, 75, 107, 384, 416, 385, 448, 417, 386, 480, 449, 418, 387, + 481, 450, 419, 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, + 325, 294, 263, 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, + 169, 138, 232, 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, + 44, 13, 76, 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, + 79, 111, 512, 544, 513, 576, 545, 514, 608, 577, 546, 515, 609, + 578, 547, 610, 579, 611, 388, 420, 389, 452, 421, 390, 484, 453, + 422, 391, 485, 454, 423, 486, 455, 487, 264, 296, 265, 328, 297, + 266, 360, 329, 298, 267, 361, 330, 299, 362, 331, 363, 140, 172, + 141, 204, 173, 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, + 239, 16, 48, 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, + 51, 114, 83, 115, 640, 672, 641, 704, 673, 642, 736, 705, 674, + 643, 737, 706, 675, 738, 707, 739, 516, 548, 517, 580, 549, 518, + 612, 581, 550, 519, 613, 582, 551, 614, 583, 615, 392, 424, 393, + 456, 425, 394, 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, + 268, 300, 269, 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, + 366, 335, 367, 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, + 241, 210, 179, 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, + 85, 54, 23, 117, 86, 55, 118, 87, 119, 768, 800, 769, 832, + 801, 770, 864, 833, 802, 771, 865, 834, 803, 866, 835, 867, 644, + 676, 645, 708, 677, 646, 740, 709, 678, 647, 741, 710, 679, 742, + 711, 743, 520, 552, 521, 584, 553, 522, 616, 585, 554, 523, 617, + 586, 555, 618, 587, 619, 396, 428, 397, 460, 429, 398, 492, 461, + 430, 399, 493, 462, 431, 494, 463, 495, 272, 304, 273, 336, 305, + 274, 368, 337, 306, 275, 369, 338, 307, 370, 339, 371, 148, 180, + 149, 212, 181, 150, 244, 213, 182, 151, 245, 214, 183, 246, 215, + 247, 24, 56, 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, + 59, 122, 91, 123, 896, 928, 897, 960, 929, 898, 992, 961, 930, + 899, 993, 962, 931, 994, 963, 995, 772, 804, 773, 836, 805, 774, + 868, 837, 806, 775, 869, 838, 807, 870, 839, 871, 648, 680, 649, + 712, 681, 650, 
744, 713, 682, 651, 745, 714, 683, 746, 715, 747, + 524, 556, 525, 588, 557, 526, 620, 589, 558, 527, 621, 590, 559, + 622, 591, 623, 400, 432, 401, 464, 433, 402, 496, 465, 434, 403, + 497, 466, 435, 498, 467, 499, 276, 308, 277, 340, 309, 278, 372, + 341, 310, 279, 373, 342, 311, 374, 343, 375, 152, 184, 153, 216, + 185, 154, 248, 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, + 60, 29, 92, 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, + 95, 127, 900, 932, 901, 964, 933, 902, 996, 965, 934, 903, 997, + 966, 935, 998, 967, 999, 776, 808, 777, 840, 809, 778, 872, 841, + 810, 779, 873, 842, 811, 874, 843, 875, 652, 684, 653, 716, 685, + 654, 748, 717, 686, 655, 749, 718, 687, 750, 719, 751, 528, 560, + 529, 592, 561, 530, 624, 593, 562, 531, 625, 594, 563, 626, 595, + 627, 404, 436, 405, 468, 437, 406, 500, 469, 438, 407, 501, 470, + 439, 502, 471, 503, 280, 312, 281, 344, 313, 282, 376, 345, 314, + 283, 377, 346, 315, 378, 347, 379, 156, 188, 157, 220, 189, 158, + 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, 904, 936, 905, + 968, 937, 906, 1000, 969, 938, 907, 1001, 970, 939, 1002, 971, 1003, + 780, 812, 781, 844, 813, 782, 876, 845, 814, 783, 877, 846, 815, + 878, 847, 879, 656, 688, 657, 720, 689, 658, 752, 721, 690, 659, + 753, 722, 691, 754, 723, 755, 532, 564, 533, 596, 565, 534, 628, + 597, 566, 535, 629, 598, 567, 630, 599, 631, 408, 440, 409, 472, + 441, 410, 504, 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, + 316, 285, 348, 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, + 351, 383, 908, 940, 909, 972, 941, 910, 1004, 973, 942, 911, 1005, + 974, 943, 1006, 975, 1007, 784, 816, 785, 848, 817, 786, 880, 849, + 818, 787, 881, 850, 819, 882, 851, 883, 660, 692, 661, 724, 693, + 662, 756, 725, 694, 663, 757, 726, 695, 758, 727, 759, 536, 568, + 537, 600, 569, 538, 632, 601, 570, 539, 633, 602, 571, 634, 603, + 635, 412, 444, 413, 476, 445, 414, 508, 477, 446, 415, 509, 478, + 447, 510, 479, 511, 912, 944, 913, 976, 945, 914, 1008, 977, 946, + 915, 1009, 978, 947, 1010, 979, 1011, 788, 820, 789, 852, 821, 790, + 884, 853, 822, 791, 885, 854, 823, 886, 855, 887, 664, 696, 665, + 728, 697, 666, 760, 729, 698, 667, 761, 730, 699, 762, 731, 763, + 540, 572, 541, 604, 573, 542, 636, 605, 574, 543, 637, 606, 575, + 638, 607, 639, 916, 948, 917, 980, 949, 918, 1012, 981, 950, 919, +1013, 982, 951, 1014, 983, 1015, 792, 824, 793, 856, 825, 794, 888, + 857, 826, 795, 889, 858, 827, 890, 859, 891, 668, 700, 669, 732, + 701, 670, 764, 733, 702, 671, 765, 734, 703, 766, 735, 767, 920, + 952, 921, 984, 953, 922, 1016, 985, 954, 923, 1017, 986, 955, 1018, + 987, 1019, 796, 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, + 862, 831, 894, 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, + 958, 927, 1021, 990, 959, 1022, 991, 1023, 0, 32, 1, 64, 33, // 32x64 + 2, 96, 65, 34, 3, 97, 66, 35, 98, 67, 99, 128, 160, + 129, 192, 161, 130, 224, 193, 162, 131, 225, 194, 163, 226, 195, + 227, 4, 36, 5, 68, 37, 6, 100, 69, 38, 7, 101, 70, + 39, 102, 71, 103, 256, 288, 257, 320, 289, 258, 352, 321, 290, + 259, 353, 322, 291, 354, 323, 355, 132, 164, 133, 196, 165, 134, + 228, 197, 166, 135, 229, 198, 167, 230, 199, 231, 8, 40, 9, + 72, 41, 10, 104, 73, 42, 11, 105, 74, 43, 106, 75, 107, + 384, 416, 385, 448, 417, 386, 480, 449, 418, 387, 481, 450, 419, + 482, 451, 483, 260, 292, 261, 324, 293, 262, 356, 325, 294, 263, + 357, 326, 295, 358, 327, 359, 136, 168, 137, 200, 169, 138, 232, + 201, 170, 139, 233, 202, 171, 234, 203, 235, 12, 44, 13, 76, + 45, 14, 108, 77, 46, 15, 109, 78, 47, 110, 79, 111, 
512, + 544, 513, 576, 545, 514, 608, 577, 546, 515, 609, 578, 547, 610, + 579, 611, 388, 420, 389, 452, 421, 390, 484, 453, 422, 391, 485, + 454, 423, 486, 455, 487, 264, 296, 265, 328, 297, 266, 360, 329, + 298, 267, 361, 330, 299, 362, 331, 363, 140, 172, 141, 204, 173, + 142, 236, 205, 174, 143, 237, 206, 175, 238, 207, 239, 16, 48, + 17, 80, 49, 18, 112, 81, 50, 19, 113, 82, 51, 114, 83, + 115, 640, 672, 641, 704, 673, 642, 736, 705, 674, 643, 737, 706, + 675, 738, 707, 739, 516, 548, 517, 580, 549, 518, 612, 581, 550, + 519, 613, 582, 551, 614, 583, 615, 392, 424, 393, 456, 425, 394, + 488, 457, 426, 395, 489, 458, 427, 490, 459, 491, 268, 300, 269, + 332, 301, 270, 364, 333, 302, 271, 365, 334, 303, 366, 335, 367, + 144, 176, 145, 208, 177, 146, 240, 209, 178, 147, 241, 210, 179, + 242, 211, 243, 20, 52, 21, 84, 53, 22, 116, 85, 54, 23, + 117, 86, 55, 118, 87, 119, 768, 800, 769, 832, 801, 770, 864, + 833, 802, 771, 865, 834, 803, 866, 835, 867, 644, 676, 645, 708, + 677, 646, 740, 709, 678, 647, 741, 710, 679, 742, 711, 743, 520, + 552, 521, 584, 553, 522, 616, 585, 554, 523, 617, 586, 555, 618, + 587, 619, 396, 428, 397, 460, 429, 398, 492, 461, 430, 399, 493, + 462, 431, 494, 463, 495, 272, 304, 273, 336, 305, 274, 368, 337, + 306, 275, 369, 338, 307, 370, 339, 371, 148, 180, 149, 212, 181, + 150, 244, 213, 182, 151, 245, 214, 183, 246, 215, 247, 24, 56, + 25, 88, 57, 26, 120, 89, 58, 27, 121, 90, 59, 122, 91, + 123, 896, 928, 897, 960, 929, 898, 992, 961, 930, 899, 993, 962, + 931, 994, 963, 995, 772, 804, 773, 836, 805, 774, 868, 837, 806, + 775, 869, 838, 807, 870, 839, 871, 648, 680, 649, 712, 681, 650, + 744, 713, 682, 651, 745, 714, 683, 746, 715, 747, 524, 556, 525, + 588, 557, 526, 620, 589, 558, 527, 621, 590, 559, 622, 591, 623, + 400, 432, 401, 464, 433, 402, 496, 465, 434, 403, 497, 466, 435, + 498, 467, 499, 276, 308, 277, 340, 309, 278, 372, 341, 310, 279, + 373, 342, 311, 374, 343, 375, 152, 184, 153, 216, 185, 154, 248, + 217, 186, 155, 249, 218, 187, 250, 219, 251, 28, 60, 29, 92, + 61, 30, 124, 93, 62, 31, 125, 94, 63, 126, 95, 127, 900, + 932, 901, 964, 933, 902, 996, 965, 934, 903, 997, 966, 935, 998, + 967, 999, 776, 808, 777, 840, 809, 778, 872, 841, 810, 779, 873, + 842, 811, 874, 843, 875, 652, 684, 653, 716, 685, 654, 748, 717, + 686, 655, 749, 718, 687, 750, 719, 751, 528, 560, 529, 592, 561, + 530, 624, 593, 562, 531, 625, 594, 563, 626, 595, 627, 404, 436, + 405, 468, 437, 406, 500, 469, 438, 407, 501, 470, 439, 502, 471, + 503, 280, 312, 281, 344, 313, 282, 376, 345, 314, 283, 377, 346, + 315, 378, 347, 379, 156, 188, 157, 220, 189, 158, 252, 221, 190, + 159, 253, 222, 191, 254, 223, 255, 904, 936, 905, 968, 937, 906, +1000, 969, 938, 907, 1001, 970, 939, 1002, 971, 1003, 780, 812, 781, + 844, 813, 782, 876, 845, 814, 783, 877, 846, 815, 878, 847, 879, + 656, 688, 657, 720, 689, 658, 752, 721, 690, 659, 753, 722, 691, + 754, 723, 755, 532, 564, 533, 596, 565, 534, 628, 597, 566, 535, + 629, 598, 567, 630, 599, 631, 408, 440, 409, 472, 441, 410, 504, + 473, 442, 411, 505, 474, 443, 506, 475, 507, 284, 316, 285, 348, + 317, 286, 380, 349, 318, 287, 381, 350, 319, 382, 351, 383, 908, + 940, 909, 972, 941, 910, 1004, 973, 942, 911, 1005, 974, 943, 1006, + 975, 1007, 784, 816, 785, 848, 817, 786, 880, 849, 818, 787, 881, + 850, 819, 882, 851, 883, 660, 692, 661, 724, 693, 662, 756, 725, + 694, 663, 757, 726, 695, 758, 727, 759, 536, 568, 537, 600, 569, + 538, 632, 601, 570, 539, 633, 602, 571, 634, 603, 635, 412, 444, + 413, 476, 445, 414, 508, 477, 446, 
415, 509, 478, 447, 510, 479, + 511, 912, 944, 913, 976, 945, 914, 1008, 977, 946, 915, 1009, 978, + 947, 1010, 979, 1011, 788, 820, 789, 852, 821, 790, 884, 853, 822, + 791, 885, 854, 823, 886, 855, 887, 664, 696, 665, 728, 697, 666, + 760, 729, 698, 667, 761, 730, 699, 762, 731, 763, 540, 572, 541, + 604, 573, 542, 636, 605, 574, 543, 637, 606, 575, 638, 607, 639, + 916, 948, 917, 980, 949, 918, 1012, 981, 950, 919, 1013, 982, 951, +1014, 983, 1015, 792, 824, 793, 856, 825, 794, 888, 857, 826, 795, + 889, 858, 827, 890, 859, 891, 668, 700, 669, 732, 701, 670, 764, + 733, 702, 671, 765, 734, 703, 766, 735, 767, 920, 952, 921, 984, + 953, 922, 1016, 985, 954, 923, 1017, 986, 955, 1018, 987, 1019, 796, + 828, 797, 860, 829, 798, 892, 861, 830, 799, 893, 862, 831, 894, + 863, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, + 990, 959, 1022, 991, 1023, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 0, 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, // 64xN + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, + 63, 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, // 64x2 + 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, + 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, + 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, + 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 0, // 64x4 + 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, + 131, 195, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, + 134, 71, 198, 135, 199, 8, 72, 9, 136, 73, 10, 200, 137, + 74, 11, 201, 138, 75, 202, 139, 203, 12, 76, 13, 140, 77, + 14, 204, 141, 78, 15, 205, 142, 79, 206, 143, 207, 16, 80, + 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, 210, 147, + 211, 20, 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, + 87, 214, 151, 215, 24, 88, 25, 152, 89, 26, 216, 153, 90, + 27, 217, 154, 91, 218, 155, 219, 28, 92, 29, 156, 93, 30, + 220, 157, 94, 31, 221, 158, 95, 222, 159, 223, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 0, 64, 1, 128, 65, // 64x8 + 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, 256, 320, + 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, 450, 387, + 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, 197, 134, + 71, 198, 135, 199, 260, 324, 261, 388, 325, 262, 452, 389, 326, + 263, 453, 390, 327, 454, 391, 455, 8, 72, 9, 136, 73, 10, + 200, 137, 74, 11, 201, 138, 75, 202, 139, 203, 264, 328, 265, + 392, 329, 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, 459, + 12, 76, 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, + 206, 143, 207, 268, 332, 269, 396, 333, 270, 460, 397, 334, 271, + 461, 398, 335, 462, 399, 463, 16, 80, 17, 144, 81, 18, 208, + 145, 82, 19, 209, 146, 83, 210, 147, 211, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 276, 340, 277, 404, 341, 278, 468, 405, 342, 279, 469, + 406, 343, 470, 407, 471, 24, 88, 25, 152, 89, 26, 216, 153, + 90, 27, 217, 154, 91, 218, 155, 219, 280, 344, 281, 408, 345, + 282, 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, + 29, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, + 223, 284, 348, 285, 412, 349, 286, 476, 413, 350, 287, 477, 414, + 351, 478, 415, 479, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 
511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, 511, + 0, 64, 1, 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, // 64x16 + 194, 131, 195, 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, + 449, 386, 323, 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, + 133, 70, 7, 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, + 577, 514, 704, 641, 578, 515, 705, 642, 579, 706, 643, 707, 260, + 324, 261, 388, 325, 262, 452, 389, 326, 263, 453, 390, 327, 454, + 391, 455, 8, 72, 9, 136, 73, 10, 200, 137, 74, 11, 201, + 138, 75, 202, 139, 203, 768, 832, 769, 896, 833, 770, 960, 897, + 834, 771, 961, 898, 835, 962, 899, 963, 516, 580, 517, 644, 581, + 518, 708, 645, 582, 519, 709, 646, 583, 710, 647, 711, 264, 328, + 265, 392, 329, 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, + 459, 12, 76, 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, + 79, 206, 143, 207, 772, 836, 773, 900, 837, 774, 964, 901, 838, + 775, 965, 902, 839, 966, 903, 967, 520, 584, 521, 648, 585, 522, + 712, 649, 586, 523, 713, 650, 587, 714, 651, 715, 268, 332, 269, + 396, 333, 270, 460, 397, 334, 271, 461, 398, 335, 462, 399, 463, + 16, 80, 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, + 210, 147, 211, 776, 840, 777, 904, 841, 778, 968, 905, 842, 779, + 969, 906, 843, 970, 907, 971, 524, 588, 525, 652, 589, 526, 716, + 653, 590, 527, 717, 654, 591, 718, 655, 719, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 780, 844, 781, 908, 845, 782, 972, 909, 846, 783, 973, + 910, 847, 974, 911, 975, 528, 592, 529, 656, 593, 530, 720, 657, + 594, 531, 721, 658, 595, 722, 659, 723, 276, 340, 277, 404, 341, + 278, 468, 405, 342, 279, 469, 406, 343, 470, 407, 471, 24, 88, + 25, 152, 89, 26, 216, 153, 90, 27, 217, 154, 91, 218, 155, + 219, 784, 848, 785, 912, 849, 786, 976, 913, 850, 787, 977, 914, + 851, 978, 915, 979, 532, 596, 533, 660, 597, 534, 724, 661, 598, + 535, 725, 662, 599, 726, 663, 727, 280, 344, 281, 408, 345, 282, + 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, 29, + 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, 223, + 788, 852, 789, 916, 853, 790, 980, 917, 854, 791, 981, 918, 855, + 982, 919, 983, 536, 600, 537, 664, 601, 538, 728, 665, 602, 539, + 729, 666, 603, 730, 667, 731, 284, 348, 285, 412, 349, 286, 476, + 413, 350, 287, 477, 414, 351, 478, 415, 479, 792, 856, 793, 920, + 857, 794, 984, 921, 858, 795, 985, 922, 859, 986, 923, 987, 540, + 604, 541, 668, 
605, 542, 732, 669, 606, 543, 733, 670, 607, 734, + 671, 735, 796, 860, 797, 924, 861, 798, 988, 925, 862, 799, 989, + 926, 863, 990, 927, 991, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, +1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 0, 64, 1, // 64x32 + 128, 65, 2, 192, 129, 66, 3, 193, 130, 67, 194, 131, 195, + 256, 320, 257, 384, 321, 258, 448, 385, 322, 259, 449, 386, 323, + 450, 387, 451, 4, 68, 5, 132, 69, 6, 196, 133, 70, 7, + 197, 134, 71, 198, 135, 199, 512, 576, 513, 640, 577, 514, 704, + 641, 578, 515, 705, 642, 579, 
706, 643, 707, 260, 324, 261, 388, + 325, 262, 452, 389, 326, 263, 453, 390, 327, 454, 391, 455, 8, + 72, 9, 136, 73, 10, 200, 137, 74, 11, 201, 138, 75, 202, + 139, 203, 768, 832, 769, 896, 833, 770, 960, 897, 834, 771, 961, + 898, 835, 962, 899, 963, 516, 580, 517, 644, 581, 518, 708, 645, + 582, 519, 709, 646, 583, 710, 647, 711, 264, 328, 265, 392, 329, + 266, 456, 393, 330, 267, 457, 394, 331, 458, 395, 459, 12, 76, + 13, 140, 77, 14, 204, 141, 78, 15, 205, 142, 79, 206, 143, + 207, 1024, 1088, 1025, 1152, 1089, 1026, 1216, 1153, 1090, 1027, 1217, 1154, +1091, 1218, 1155, 1219, 772, 836, 773, 900, 837, 774, 964, 901, 838, + 775, 965, 902, 839, 966, 903, 967, 520, 584, 521, 648, 585, 522, + 712, 649, 586, 523, 713, 650, 587, 714, 651, 715, 268, 332, 269, + 396, 333, 270, 460, 397, 334, 271, 461, 398, 335, 462, 399, 463, + 16, 80, 17, 144, 81, 18, 208, 145, 82, 19, 209, 146, 83, + 210, 147, 211, 1280, 1344, 1281, 1408, 1345, 1282, 1472, 1409, 1346, 1283, +1473, 1410, 1347, 1474, 1411, 1475, 1028, 1092, 1029, 1156, 1093, 1030, 1220, +1157, 1094, 1031, 1221, 1158, 1095, 1222, 1159, 1223, 776, 840, 777, 904, + 841, 778, 968, 905, 842, 779, 969, 906, 843, 970, 907, 971, 524, + 588, 525, 652, 589, 526, 716, 653, 590, 527, 717, 654, 591, 718, + 655, 719, 272, 336, 273, 400, 337, 274, 464, 401, 338, 275, 465, + 402, 339, 466, 403, 467, 20, 84, 21, 148, 85, 22, 212, 149, + 86, 23, 213, 150, 87, 214, 151, 215, 1536, 1600, 1537, 1664, 1601, +1538, 1728, 1665, 1602, 1539, 1729, 1666, 1603, 1730, 1667, 1731, 1284, 1348, +1285, 1412, 1349, 1286, 1476, 1413, 1350, 1287, 1477, 1414, 1351, 1478, 1415, +1479, 1032, 1096, 1033, 1160, 1097, 1034, 1224, 1161, 1098, 1035, 1225, 1162, +1099, 1226, 1163, 1227, 780, 844, 781, 908, 845, 782, 972, 909, 846, + 783, 973, 910, 847, 974, 911, 975, 528, 592, 529, 656, 593, 530, + 720, 657, 594, 531, 721, 658, 595, 722, 659, 723, 276, 340, 277, + 404, 341, 278, 468, 405, 342, 279, 469, 406, 343, 470, 407, 471, + 24, 88, 25, 152, 89, 26, 216, 153, 90, 27, 217, 154, 91, + 218, 155, 219, 1792, 1856, 1793, 1920, 1857, 1794, 1984, 1921, 1858, 1795, +1985, 1922, 1859, 1986, 1923, 1987, 1540, 1604, 1541, 1668, 1605, 1542, 1732, +1669, 1606, 1543, 1733, 1670, 1607, 1734, 1671, 1735, 1288, 1352, 1289, 1416, +1353, 1290, 1480, 1417, 1354, 1291, 1481, 1418, 1355, 1482, 1419, 1483, 1036, +1100, 1037, 1164, 1101, 1038, 1228, 1165, 1102, 1039, 1229, 1166, 1103, 1230, +1167, 1231, 784, 848, 785, 912, 849, 786, 976, 913, 850, 787, 977, + 914, 851, 978, 915, 979, 532, 596, 533, 660, 597, 534, 724, 661, + 598, 535, 725, 662, 599, 726, 663, 727, 280, 344, 281, 408, 345, + 282, 472, 409, 346, 283, 473, 410, 347, 474, 411, 475, 28, 92, + 29, 156, 93, 30, 220, 157, 94, 31, 221, 158, 95, 222, 159, + 223, 1796, 1860, 1797, 1924, 1861, 1798, 1988, 1925, 1862, 1799, 1989, 1926, +1863, 1990, 1927, 1991, 1544, 1608, 1545, 1672, 1609, 1546, 1736, 1673, 1610, +1547, 1737, 1674, 1611, 1738, 1675, 1739, 1292, 1356, 1293, 1420, 1357, 1294, +1484, 1421, 1358, 1295, 1485, 1422, 1359, 1486, 1423, 1487, 1040, 1104, 1041, +1168, 1105, 1042, 1232, 1169, 1106, 1043, 1233, 1170, 1107, 1234, 1171, 1235, + 788, 852, 789, 916, 853, 790, 980, 917, 854, 791, 981, 918, 855, + 982, 919, 983, 536, 600, 537, 664, 601, 538, 728, 665, 602, 539, + 729, 666, 603, 730, 667, 731, 284, 348, 285, 412, 349, 286, 476, + 413, 350, 287, 477, 414, 351, 478, 415, 479, 1800, 1864, 1801, 1928, +1865, 1802, 1992, 1929, 1866, 1803, 1993, 1930, 1867, 1994, 1931, 1995, 1548, +1612, 1549, 1676, 1613, 1550, 1740, 1677, 1614, 1551, 1741, 
1678, 1615, 1742, +1679, 1743, 1296, 1360, 1297, 1424, 1361, 1298, 1488, 1425, 1362, 1299, 1489, +1426, 1363, 1490, 1427, 1491, 1044, 1108, 1045, 1172, 1109, 1046, 1236, 1173, +1110, 1047, 1237, 1174, 1111, 1238, 1175, 1239, 792, 856, 793, 920, 857, + 794, 984, 921, 858, 795, 985, 922, 859, 986, 923, 987, 540, 604, + 541, 668, 605, 542, 732, 669, 606, 543, 733, 670, 607, 734, 671, + 735, 1804, 1868, 1805, 1932, 1869, 1806, 1996, 1933, 1870, 1807, 1997, 1934, +1871, 1998, 1935, 1999, 1552, 1616, 1553, 1680, 1617, 1554, 1744, 1681, 1618, +1555, 1745, 1682, 1619, 1746, 1683, 1747, 1300, 1364, 1301, 1428, 1365, 1302, +1492, 1429, 1366, 1303, 1493, 1430, 1367, 1494, 1431, 1495, 1048, 1112, 1049, +1176, 1113, 1050, 1240, 1177, 1114, 1051, 1241, 1178, 1115, 1242, 1179, 1243, + 796, 860, 797, 924, 861, 798, 988, 925, 862, 799, 989, 926, 863, + 990, 927, 991, 1808, 1872, 1809, 1936, 1873, 1810, 2000, 1937, 1874, 1811, +2001, 1938, 1875, 2002, 1939, 2003, 1556, 1620, 1557, 1684, 1621, 1558, 1748, +1685, 1622, 1559, 1749, 1686, 1623, 1750, 1687, 1751, 1304, 1368, 1305, 1432, +1369, 1306, 1496, 1433, 1370, 1307, 1497, 1434, 1371, 1498, 1435, 1499, 1052, +1116, 1053, 1180, 1117, 1054, 1244, 1181, 1118, 1055, 1245, 1182, 1119, 1246, +1183, 1247, 1812, 1876, 1813, 1940, 1877, 1814, 2004, 1941, 1878, 1815, 2005, +1942, 1879, 2006, 1943, 2007, 1560, 1624, 1561, 1688, 1625, 1562, 1752, 1689, +1626, 1563, 1753, 1690, 1627, 1754, 1691, 1755, 1308, 1372, 1309, 1436, 1373, +1310, 1500, 1437, 1374, 1311, 1501, 1438, 1375, 1502, 1439, 1503, 1816, 1880, +1817, 1944, 1881, 1818, 2008, 1945, 1882, 1819, 2009, 1946, 1883, 2010, 1947, +2011, 1564, 1628, 1565, 1692, 1629, 1566, 1756, 1693, 1630, 1567, 1757, 1694, +1631, 1758, 1695, 1759, 1820, 1884, 1821, 1948, 1885, 1822, 2012, 1949, 1886, +1823, 2013, 1950, 1887, 2014, 1951, 2015, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 
2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047, +2047, 2047, 2047, 2047, 0, 64, 1, 128, 65, 2, 192, 129, 66, // 64x64 + 3, 193, 130, 67, 194, 131, 195, 256, 320, 257, 384, 321, 258, + 448, 385, 322, 259, 449, 386, 323, 450, 387, 451, 4, 68, 5, + 132, 69, 6, 196, 133, 70, 7, 197, 134, 71, 198, 135, 199, + 512, 576, 513, 640, 577, 514, 704, 641, 578, 515, 705, 642, 579, + 706, 643, 707, 260, 324, 261, 388, 325, 262, 452, 389, 326, 263, + 453, 390, 327, 454, 391, 455, 8, 72, 9, 136, 73, 10, 200, + 137, 74, 11, 201, 138, 75, 202, 139, 203, 768, 832, 769, 896, + 833, 770, 960, 897, 834, 771, 961, 898, 835, 962, 899, 963, 516, + 580, 517, 644, 581, 518, 708, 645, 582, 519, 709, 646, 583, 710, + 647, 711, 264, 328, 265, 392, 329, 266, 456, 393, 330, 267, 457, + 394, 331, 458, 395, 459, 12, 76, 13, 140, 77, 14, 204, 141, + 78, 15, 205, 142, 79, 206, 143, 207, 1024, 1088, 1025, 1152, 1089, +1026, 1216, 1153, 1090, 1027, 1217, 1154, 1091, 1218, 1155, 1219, 772, 836, + 773, 900, 837, 774, 964, 901, 838, 775, 965, 902, 839, 966, 903, + 967, 520, 584, 521, 648, 585, 522, 712, 649, 586, 523, 713, 650, + 587, 714, 651, 715, 268, 332, 269, 396, 333, 270, 460, 397, 334, + 271, 461, 398, 335, 462, 399, 463, 16, 80, 17, 144, 81, 18, + 208, 145, 82, 19, 209, 146, 83, 210, 147, 211, 1280, 1344, 1281, +1408, 1345, 1282, 1472, 1409, 1346, 1283, 1473, 1410, 1347, 1474, 1411, 1475, +1028, 1092, 1029, 1156, 1093, 1030, 1220, 1157, 1094, 1031, 1221, 1158, 1095, +1222, 1159, 1223, 776, 840, 777, 904, 841, 778, 968, 905, 842, 779, + 969, 906, 843, 970, 907, 971, 524, 588, 525, 652, 589, 526, 716, + 653, 590, 527, 717, 654, 591, 718, 655, 719, 272, 336, 273, 400, + 337, 274, 464, 401, 338, 275, 465, 402, 339, 466, 403, 467, 20, + 84, 21, 148, 85, 22, 212, 149, 86, 23, 213, 150, 87, 214, + 151, 215, 1536, 1600, 1537, 1664, 1601, 1538, 1728, 1665, 1602, 1539, 1729, +1666, 1603, 1730, 1667, 1731, 1284, 1348, 1285, 1412, 1349, 1286, 1476, 1413, +1350, 1287, 1477, 1414, 1351, 1478, 1415, 1479, 1032, 1096, 1033, 1160, 1097, +1034, 1224, 1161, 1098, 1035, 1225, 1162, 1099, 1226, 1163, 1227, 780, 844, + 781, 908, 845, 782, 972, 909, 846, 783, 973, 910, 847, 974, 911, + 975, 528, 592, 529, 656, 593, 530, 720, 657, 594, 531, 721, 658, + 595, 722, 659, 723, 276, 340, 277, 404, 341, 278, 468, 405, 342, + 279, 469, 406, 343, 470, 407, 471, 24, 88, 25, 152, 89, 26, + 216, 153, 90, 27, 217, 154, 91, 218, 155, 219, 1792, 1856, 1793, +1920, 1857, 1794, 1984, 1921, 1858, 1795, 1985, 1922, 1859, 1986, 1923, 1987, +1540, 1604, 1541, 1668, 1605, 1542, 1732, 1669, 1606, 1543, 1733, 1670, 1607, +1734, 1671, 1735, 1288, 1352, 1289, 1416, 1353, 1290, 1480, 1417, 1354, 1291, +1481, 
1418, 1355, 1482, 1419, 1483, 1036, 1100, 1037, 1164, 1101, 1038, 1228, +1165, 1102, 1039, 1229, 1166, 1103, 1230, 1167, 1231, 784, 848, 785, 912, + 849, 786, 976, 913, 850, 787, 977, 914, 851, 978, 915, 979, 532, + 596, 533, 660, 597, 534, 724, 661, 598, 535, 725, 662, 599, 726, + 663, 727, 280, 344, 281, 408, 345, 282, 472, 409, 346, 283, 473, + 410, 347, 474, 411, 475, 28, 92, 29, 156, 93, 30, 220, 157, + 94, 31, 221, 158, 95, 222, 159, 223, 1796, 1860, 1797, 1924, 1861, +1798, 1988, 1925, 1862, 1799, 1989, 1926, 1863, 1990, 1927, 1991, 1544, 1608, +1545, 1672, 1609, 1546, 1736, 1673, 1610, 1547, 1737, 1674, 1611, 1738, 1675, +1739, 1292, 1356, 1293, 1420, 1357, 1294, 1484, 1421, 1358, 1295, 1485, 1422, +1359, 1486, 1423, 1487, 1040, 1104, 1041, 1168, 1105, 1042, 1232, 1169, 1106, +1043, 1233, 1170, 1107, 1234, 1171, 1235, 788, 852, 789, 916, 853, 790, + 980, 917, 854, 791, 981, 918, 855, 982, 919, 983, 536, 600, 537, + 664, 601, 538, 728, 665, 602, 539, 729, 666, 603, 730, 667, 731, + 284, 348, 285, 412, 349, 286, 476, 413, 350, 287, 477, 414, 351, + 478, 415, 479, 1800, 1864, 1801, 1928, 1865, 1802, 1992, 1929, 1866, 1803, +1993, 1930, 1867, 1994, 1931, 1995, 1548, 1612, 1549, 1676, 1613, 1550, 1740, +1677, 1614, 1551, 1741, 1678, 1615, 1742, 1679, 1743, 1296, 1360, 1297, 1424, +1361, 1298, 1488, 1425, 1362, 1299, 1489, 1426, 1363, 1490, 1427, 1491, 1044, +1108, 1045, 1172, 1109, 1046, 1236, 1173, 1110, 1047, 1237, 1174, 1111, 1238, +1175, 1239, 792, 856, 793, 920, 857, 794, 984, 921, 858, 795, 985, + 922, 859, 986, 923, 987, 540, 604, 541, 668, 605, 542, 732, 669, + 606, 543, 733, 670, 607, 734, 671, 735, 1804, 1868, 1805, 1932, 1869, +1806, 1996, 1933, 1870, 1807, 1997, 1934, 1871, 1998, 1935, 1999, 1552, 1616, +1553, 1680, 1617, 1554, 1744, 1681, 1618, 1555, 1745, 1682, 1619, 1746, 1683, +1747, 1300, 1364, 1301, 1428, 1365, 1302, 1492, 1429, 1366, 1303, 1493, 1430, +1367, 1494, 1431, 1495, 1048, 1112, 1049, 1176, 1113, 1050, 1240, 1177, 1114, +1051, 1241, 1178, 1115, 1242, 1179, 1243, 796, 860, 797, 924, 861, 798, + 988, 925, 862, 799, 989, 926, 863, 990, 927, 991, 1808, 1872, 1809, +1936, 1873, 1810, 2000, 1937, 1874, 1811, 2001, 1938, 1875, 2002, 1939, 2003, +1556, 1620, 1557, 1684, 1621, 1558, 1748, 1685, 1622, 1559, 1749, 1686, 1623, +1750, 1687, 1751, 1304, 1368, 1305, 1432, 1369, 1306, 1496, 1433, 1370, 1307, +1497, 1434, 1371, 1498, 1435, 1499, 1052, 1116, 1053, 1180, 1117, 1054, 1244, +1181, 1118, 1055, 1245, 1182, 1119, 1246, 1183, 1247, 1812, 1876, 1813, 1940, +1877, 1814, 2004, 1941, 1878, 1815, 2005, 1942, 1879, 2006, 1943, 2007, 1560, +1624, 1561, 1688, 1625, 1562, 1752, 1689, 1626, 1563, 1753, 1690, 1627, 1754, +1691, 1755, 1308, 1372, 1309, 1436, 1373, 1310, 1500, 1437, 1374, 1311, 1501, +1438, 1375, 1502, 1439, 1503, 1816, 1880, 1817, 1944, 1881, 1818, 2008, 1945, +1882, 1819, 2009, 1946, 1883, 2010, 1947, 2011, 1564, 1628, 1565, 1692, 1629, +1566, 1756, 1693, 1630, 1567, 1757, 1694, 1631, 1758, 1695, 1759, 1820, 1884, +1821, 1948, 1885, 1822, 2012, 1949, 1886, 1823, 2013, 1950, 1887, 2014, 1951, +2015, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 
+4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, 4095, +4095, 4095, 4095, 4095, 4095, +}; + +// Get scan order table based on scan group type (grouped or ungrouped) +// and log2 block width and height index +static const uint32_t* const g_scan_order[SCAN_GROUP_TYPES][MAX_LOG2_INDEX][MAX_LOG2_INDEX] = +{ + { + { g_scan_order_buffer + 0, g_scan_order_buffer + 1, g_scan_order_buffer + 3, g_scan_order_buffer + 7, g_scan_order_buffer + 15, g_scan_order_buffer + 31, g_scan_order_buffer + 63, }, + { g_scan_order_buffer + 127, g_scan_order_buffer + 129, g_scan_order_buffer + 133, g_scan_order_buffer + 141, g_scan_order_buffer + 157, g_scan_order_buffer + 189, g_scan_order_buffer + 253, }, + { g_scan_order_buffer + 381, g_scan_order_buffer + 385, g_scan_order_buffer + 393, g_scan_order_buffer + 409, g_scan_order_buffer + 441, g_scan_order_buffer + 505, g_scan_order_buffer + 633, }, + { g_scan_order_buffer + 889, g_scan_order_buffer + 897, g_scan_order_buffer + 913, g_scan_order_buffer + 945, g_scan_order_buffer + 1009, g_scan_order_buffer + 1137, g_scan_order_buffer + 1393, }, + { g_scan_order_buffer + 1905, g_scan_order_buffer + 1921, g_scan_order_buffer + 1953, g_scan_order_buffer + 2017, g_scan_order_buffer + 2145, g_scan_order_buffer + 2401, g_scan_order_buffer + 2913, }, + { g_scan_order_buffer + 3937, g_scan_order_buffer + 3969, g_scan_order_buffer + 4033, g_scan_order_buffer + 4161, g_scan_order_buffer + 4417, g_scan_order_buffer + 4929, g_scan_order_buffer + 5953, }, + { g_scan_order_buffer + 8001, g_scan_order_buffer + 8065, g_scan_order_buffer + 8193, g_scan_order_buffer + 8449, g_scan_order_buffer + 8961, g_scan_order_buffer + 9985, g_scan_order_buffer + 12033, }, + }, + { + { g_scan_order_buffer + 16129, g_scan_order_buffer + 16130, g_scan_order_buffer + 16132, g_scan_order_buffer + 16136, g_scan_order_buffer + 16144, g_scan_order_buffer + 16160, g_scan_order_buffer + 16192, }, + { g_scan_order_buffer + 16256, g_scan_order_buffer + 16258, g_scan_order_buffer + 16262, g_scan_order_buffer + 16270, g_scan_order_buffer + 16286, g_scan_order_buffer + 16318, g_scan_order_buffer + 16382, }, + { g_scan_order_buffer + 16510, g_scan_order_buffer + 16514, g_scan_order_buffer + 16522, g_scan_order_buffer + 16538, g_scan_order_buffer + 16570, g_scan_order_buffer + 16634, g_scan_order_buffer + 16762, }, + { g_scan_order_buffer + 17018, g_scan_order_buffer + 17026, g_scan_order_buffer + 17042, g_scan_order_buffer + 17074, g_scan_order_buffer + 17138, g_scan_order_buffer + 17266, g_scan_order_buffer + 17522, }, + { g_scan_order_buffer + 18034, g_scan_order_buffer + 18050, g_scan_order_buffer + 18082, g_scan_order_buffer + 18146, g_scan_order_buffer + 18274, g_scan_order_buffer + 18530, g_scan_order_buffer + 19042, }, + { g_scan_order_buffer + 20066, g_scan_order_buffer + 20098, g_scan_order_buffer + 20162, g_scan_order_buffer + 20290, g_scan_order_buffer + 20546, g_scan_order_buffer + 21058, g_scan_order_buffer + 22082, }, + { g_scan_order_buffer + 24130, g_scan_order_buffer + 24194, g_scan_order_buffer + 24322, g_scan_order_buffer + 24578, g_scan_order_buffer + 25090, g_scan_order_buffer + 26114, g_scan_order_buffer + 28162, }, + } +}; + + +/** + * \brief Return array 
of scan order indices. + * + * \param scan_group Scan group type, SCAN_GROUP_UNGROUPED or SCAN_GROUP_4X4. + * \param scan_type Scan type, diagonal, horizontal or vertical. + * \param log2_w Log2 of block width. + * \param log2_h Log2 of block height. + * + * \return Pointer to the scan order table for the given dimensions. + */ +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h) +{ + // TODO: horizontal and vertical scan types + assert(scan_type == SCAN_DIAG && "Horizontal and vertical scan not implemented."); + + if (scan_group == SCAN_GROUP_4X4) { + return g_scan_order[scan_group][log2_w][log2_h]; + } + else { + if (log2_w <= 1 || log2_h <= 1) { + // Just return the 1x16 table, which contains indices 0 to 15 in order + return g_scan_order[scan_group][0][4]; + } + else { + return g_scan_order[scan_group][log2_w - 2][log2_h - 2]; + } + } +} diff --git a/src/tables.h b/src/tables.h index 1ab81cfb..44621251 100644 --- a/src/tables.h +++ b/src/tables.h @@ -134,6 +134,15 @@ typedef enum */ extern const uint32_t* const uvg_g_sig_last_scan[3][5]; extern const int8_t uvg_g_convert_to_bit[LCU_WIDTH + 1]; +extern const int8_t uvg_g_convert_to_log2[LCU_WIDTH + 1]; extern const uint32_t uvg_g_log2_sbb_size[7 + 1][7 + 1][2]; +#define SCAN_GROUP_TYPES 2 +#define MAX_LOG2_INDEX 7 + +#define SCAN_GROUP_UNGROUPED 0 +#define SCAN_GROUP_4X4 1 + +const uint32_t* const uvg_get_scan_order_table(int scan_group, int scan_type, int log2_w, int log2_h); + #endif //TABLES_H_ diff --git a/src/transform.c b/src/transform.c index c0adc121..98728da0 100644 --- a/src/transform.c +++ b/src/transform.c @@ -34,8 +34,10 @@ #include "encode_coding_tree.h" #include "image.h" +#include "intra.h" #include "uvg266.h" #include "lfnst_tables.h" +#include "rate_control.h" #include "rdo.h" #include "strategies/strategies-dct.h" #include "strategies/strategies-quant.h" @@ -77,6 +79,7 @@ const uint8_t uvg_g_chroma_scale[58]= * Parameters pred_in and rec_out may be aliased. * * \param width Transform width. + * \param height Transform height. * \param in_stride Stride for ref_in and pred_in * \param out_stride Stride for rec_out. * \param ref_in Reference pixels. @@ -87,6 +90,7 @@ const uint8_t uvg_g_chroma_scale[58]= * \returns Whether coeff_out contains any non-zero coefficients. */ static bool bypass_transquant(const int width, + const int height, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, @@ -96,7 +100,7 @@ static bool bypass_transquant(const int width, { bool nonzero_coeffs = false; - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int32_t in_idx = x + y * in_stride; int32_t out_idx = x + y * out_stride; @@ -123,6 +127,7 @@ static bool bypass_transquant(const int width, * \param coeff coefficients (residual) to filter */ static void rdpcm(const int width, + const int height, const rdpcm_dir dir, coeff_t *coeff) { @@ -130,7 +135,7 @@ static void rdpcm(const int width, const int min_x = (dir == RDPCM_HOR) ? 1 : 0; const int min_y = (dir == RDPCM_HOR) ?
0 : 1; - for (int y = width - 1; y >= min_y; y--) { + for (int y = height - 1; y >= min_y; y--) { for (int x = width - 1; x >= min_x; x--) { const int index = x + y * width; coeff[index] -= coeff[index - offset]; @@ -171,19 +176,26 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con */ void uvg_derive_lfnst_constraints( cu_info_t* const pred_cu, - const int depth, bool* constraints, const coeff_t* coeff, const int width, - const int height) + const int height, + const vector2d_t * const lcu_px, + color_t color) { - coeff_scan_order_t scan_idx = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); + coeff_scan_order_t scan_idx = SCAN_DIAG; // ToDo: large block support in VVC? - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - const uint32_t* scan = uvg_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t log2_tr_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_tr_height = uvg_g_convert_to_log2[height]; + const uint32_t* scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_idx, log2_tr_width, log2_tr_height); signed scan_pos_last = -1; + coeff_t temp[TR_MAX_WIDTH * TR_MAX_WIDTH]; + if(lcu_px != NULL) { + uvg_get_sub_coeff(temp, coeff, lcu_px->x, lcu_px->y, width, height, color == COLOR_Y? LCU_WIDTH : LCU_WIDTH_C); + coeff = temp; + } for (int i = 0; i < width * height; i++) { if (coeff[scan[i]]) { @@ -203,17 +215,18 @@ void uvg_derive_lfnst_constraints( /** * \brief NxN inverse transform (2D) - * \param coeff input data (transform coefficients) - * \param block output data (residual) - * \param block_size input data (width of transform) + * \param coeff input data (transform coefficients) + * \param block output data (residual) + * \param width transform width + * \param height transform height */ -void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height) { - int32_t j,k; - for (j = 0; j < block_size; j++) { - for(k = 0; k < block_size; k ++) { + int32_t j, k; + for (j = 0; j < height; j++) { + for(k = 0; k < width; k ++) { // Casting back and forth to make UBSan not trigger due to left-shifting negatives - coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k])); + coeff[j * width + k] = (int16_t)((uint16_t)(block[j * width + k])); } } } @@ -224,12 +237,12 @@ void uvg_transformskip(const encoder_control_t * const encoder, int16_t *block,i * \param block output data (residual) * \param block_size width of transform */ -void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_size) +void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block,int16_t *coeff, int8_t block_width, int8_t block_height) { int32_t j,k; - for ( j = 0; j < block_size; j++ ) { - for(k = 0; k < block_size; k ++) { - block[j * block_size + k] = coeff[j * block_size + k]; + for ( j = 0; j < block_height; j++ ) { + for(k = 0; k < block_width; k ++) { + block[j * block_width + k] = coeff[j * block_width + k]; } } } @@ -243,17 +256,18 @@ void uvg_itransformskip(const encoder_control_t * const encoder, int16_t *block, void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx) + 
if (encoder->cfg.mts || tu->lfnst_idx || tu->cr_lfnst_idx || block_width != block_height) { - uvg_mts_dct(encoder->bitdepth, color, tu, block_size, block, coeff, encoder->cfg.mts); + uvg_mts_dct(encoder->bitdepth, color, tu, block_width, block_height, block, coeff, encoder->cfg.mts); } else { - dct_func *dct_func = uvg_get_dct_func(block_size, color, tu->type); + dct_func *dct_func = uvg_get_dct_func(block_width, block_height, color, tu->type); dct_func(encoder->bitdepth, block, coeff); } } @@ -261,17 +275,18 @@ void uvg_transform2d(const encoder_control_t * const encoder, void uvg_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu) { - if (encoder->cfg.mts) + if (encoder->cfg.mts || block_width != block_height) { - uvg_mts_idct(encoder->bitdepth, color, tu, block_size, coeff, block, encoder->cfg.mts); + uvg_mts_idct(encoder->bitdepth, color, tu, block_width, block_height, coeff, block, encoder->cfg.mts); } else { - dct_func *idct_func = uvg_get_idct_func(block_size, color, tu->type); + dct_func *idct_func = uvg_get_idct_func(block_width, block_height, color, tu->type); idct_func(encoder->bitdepth, coeff, block); } } @@ -348,7 +363,7 @@ static void generate_jccr_transforms( } } } - costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1; + costs[jccr] = jccr == 0 ? MIN(d1, d2) : d1; } int64_t min_dist1 = costs[0]; int64_t min_dist2 = INT64_MAX; @@ -373,6 +388,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask1 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -386,6 +402,7 @@ static void generate_jccr_transforms( &temp_resi[(cbf_mask2 - 1) * trans_offset], &u_coeff[*num_transforms * trans_offset], width, + height, COLOR_U, pred_cu ); @@ -401,27 +418,96 @@ static void generate_jccr_transforms( static void quantize_chroma( encoder_state_t* const state, - int depth, - int8_t width, - int8_t height, + cu_info_t * const cur_tu, + const cu_loc_t* const cu_loc, coeff_t u_coeff[5120], coeff_t v_coeff[2048], - enum uvg_chroma_transforms transforms[5], - const int trans_offset, - int i, + enum uvg_chroma_transforms transform, coeff_t u_quant_coeff[1024], coeff_t v_quant_coeff[1024], const coeff_scan_order_t scan_order, bool* u_has_coeffs, bool* v_has_coeffs, - uint8_t lfnst_idx) + uint8_t lfnst_idx, + enum uvg_tree_type tree_type, + double* u_coeff_cost, + double* v_coeff_cost) { + int8_t width = cu_loc->chroma_width; + int8_t height = cu_loc->chroma_height; + if(state->encoder_control->cfg.dep_quant && transform != CHROMA_TS) { + int abs_sum = 0; + state->quant_blocks[2].needs_init |= state->encoder_control->cfg.jccr; + uvg_dep_quant( + state, + cur_tu, + width, + height, + u_coeff, + u_quant_coeff, + COLOR_U, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list + ); + + cbf_clear(&cur_tu->cbf, COLOR_U); + if (abs_sum > 0) { + *u_has_coeffs = 1; + cbf_set(&cur_tu->cbf, COLOR_U); + } + + *u_coeff_cost = uvg_get_coeff_cost( + state, + u_quant_coeff, + cur_tu, + cu_loc, + COLOR_U, + SCAN_DIAG, + false, + COEFF_ORDER_LINEAR); + + if (transform == DCT7_CHROMA) { + abs_sum = 0; + state->rate_estimator[2].needs_init = true; + uvg_dep_quant( + state, + cur_tu, + width, + height, + v_coeff, + v_quant_coeff, + COLOR_V, + tree_type, + &abs_sum, + state->encoder_control->cfg.scaling_list + ); + + cbf_clear(&cur_tu->cbf, COLOR_V); + if (abs_sum > 0) { + *v_has_coeffs = 1; + cbf_set(&cur_tu->cbf, COLOR_V); + } + 
+ *v_coeff_cost = uvg_get_coeff_cost( + state, + v_quant_coeff, + cur_tu, + cu_loc, + COLOR_V, + SCAN_DIAG, + false, + COEFF_ORDER_LINEAR); + cbf_clear(&cur_tu->cbf, COLOR_U); + cbf_clear(&cur_tu->cbf, COLOR_V); + } + return; + } if (state->encoder_control->cfg.rdoq_enable && - (transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) + (transform != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip)) { - uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, - scan_order, CU_INTRA, depth, 0, - lfnst_idx); + uvg_rdoq(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? COLOR_U : COLOR_V, + scan_order, CU_INTRA, 0, lfnst_idx, 0); int j; for (j = 0; j < width * height; ++j) { @@ -431,26 +517,25 @@ static void quantize_chroma( } } - if (transforms[i] == DCT7_CHROMA) { + if (transform == DCT7_CHROMA) { uint16_t temp_cbf = 0; - if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U); - uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, depth, temp_cbf, - lfnst_idx); + if (*u_has_coeffs)cbf_set(&temp_cbf, COLOR_U); + uvg_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, + scan_order, CU_INTRA, temp_cbf, lfnst_idx, 0); } } - else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) { - uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U, scan_order); - uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, scan_order); + else if (state->encoder_control->cfg.rdoq_enable && transform == CHROMA_TS) { + uvg_ts_rdoq(state, u_coeff, u_quant_coeff, width, height, COLOR_U, scan_order); + uvg_ts_rdoq(state, v_coeff, v_quant_coeff, width, height, COLOR_V, scan_order); } else { - uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, - scan_order, CU_INTRA, transforms[i] == CHROMA_TS, lfnst_idx); + uvg_quant(state, u_coeff, u_quant_coeff, width, height, transform != JCCR_1 ? 
COLOR_U : COLOR_V, + scan_order, CU_INTRA, transform == CHROMA_TS, lfnst_idx); - if (!IS_JCCR_MODE(transforms[i])) { - uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V, - scan_order, CU_INTRA, transforms[i] == CHROMA_TS, lfnst_idx); + if (!IS_JCCR_MODE(transform)) { + uvg_quant(state, v_coeff, v_quant_coeff, width, height, COLOR_V, + scan_order, CU_INTRA, transform == CHROMA_TS, lfnst_idx); } } @@ -460,7 +545,7 @@ static void quantize_chroma( break; } } - if (!IS_JCCR_MODE(transforms[i])) { + if (!IS_JCCR_MODE(transform)) { for (int j = 0; j < width * height; ++j) { if (v_quant_coeff[j]) { *v_has_coeffs = 1; @@ -472,13 +557,10 @@ static void quantize_chroma( void uvg_chroma_transform_search( encoder_state_t* const state, - int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - int8_t width, - int8_t height, + const cu_loc_t* const cu_loc, const int offset, - const uint8_t mode, cu_info_t* pred_cu, uvg_pixel u_pred[1024], uvg_pixel v_pred[1024], @@ -489,13 +571,18 @@ void uvg_chroma_transform_search( { ALIGNED(64) coeff_t u_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 5]; ALIGNED(64) uint8_t u_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; - ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; + ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2]; // In case of JCCR the v channel does not have coefficients ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5]; + const int width = cu_loc->chroma_width; + const int height = cu_loc->chroma_height; + + const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + uvg_transform2d( - state->encoder_control, u_resi, u_coeff, width, COLOR_U, pred_cu + state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu ); uvg_transform2d( - state->encoder_control, v_resi, v_coeff, width, COLOR_V, pred_cu + state->encoder_control, v_resi, v_coeff, width, height, COLOR_V, pred_cu ); enum uvg_chroma_transforms transforms[5]; transforms[0] = DCT7_CHROMA; @@ -508,8 +595,8 @@ void uvg_chroma_transform_search( pred_cu->cr_lfnst_idx == 0 ; if (can_use_tr_skip) { - uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width); - uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width); + uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width, height); + uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width, height); transforms[num_transforms] = CHROMA_TS; num_transforms++; } @@ -526,6 +613,9 @@ void uvg_chroma_transform_search( trans_offset, &num_transforms); } + + double lambda = state->c_lambda; + chorma_ts_out->best_u_cost = MAX_DOUBLE; chorma_ts_out->best_v_cost = MAX_DOUBLE; chorma_ts_out->best_combined_cost = MAX_DOUBLE; @@ -537,58 +627,76 @@ void uvg_chroma_transform_search( coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C]; int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; int16_t v_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C]; - const coeff_scan_order_t scan_order = - uvg_get_scan_order(pred_cu->type, mode, depth); bool u_has_coeffs = false; bool v_has_coeffs = false; + bool is_jccr = IS_JCCR_MODE(transforms[i]); if(pred_cu->cr_lfnst_idx) { - uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); - if (!IS_JCCR_MODE(transforms[i])) { - uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); + uvg_fwd_lfnst(pred_cu, width, height, COLOR_U, 
pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); + if (!is_jccr) { + uvg_fwd_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } } + uint8_t old_jccr = pred_cu->joint_cb_cr; + pred_cu->joint_cb_cr = 0; + if(is_jccr) { + state->c_lambda = lambda * (transforms[i] == JCCR_3 ? 0.5 : 0.8); + pred_cu->joint_cb_cr = transforms[i]; + } + else if(state->encoder_control->cfg.dep_quant) { + state->search_cabac.update = 1; + } + + double u_coeff_cost = 0; + double v_coeff_cost = 0; + unsigned ssd_u = 0; + unsigned ssd_v = 0; + double u_bits = 0; + double v_bits = 0; + quantize_chroma( state, - depth, - width, - height, - u_coeff, - v_coeff, - transforms, - trans_offset, - i, + pred_cu, + cu_loc, + &u_coeff[i * trans_offset], + &v_coeff[i * trans_offset], + transforms[i], u_quant_coeff, v_quant_coeff, - scan_order, + SCAN_DIAG, &u_has_coeffs, - &v_has_coeffs, - pred_cu->cr_lfnst_idx); + &v_has_coeffs, tree_type == UVG_CHROMA_T ? pred_cu->cr_lfnst_idx : pred_cu->lfnst_idx, + tree_type, + &u_coeff_cost, + &v_coeff_cost); + pred_cu->joint_cb_cr = old_jccr; + if (pred_cu->cr_lfnst_idx != 0 && !u_has_coeffs && !v_has_coeffs) goto reset_cabac; - if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) { + if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && tree_type == UVG_CHROMA_T) { bool constraints[2] = { false, false }; - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, u_quant_coeff, width, height); - if(!IS_JCCR_MODE(transforms[i])) { - uvg_derive_lfnst_constraints(pred_cu, depth, constraints, v_quant_coeff, width, height); + uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U); + if(!is_jccr) { + uvg_derive_lfnst_constraints(pred_cu, constraints, v_quant_coeff, width, height, NULL, COLOR_V); } - if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) continue; + if (!constraints[1] && (u_has_coeffs || v_has_coeffs) && pred_cu->cr_lfnst_idx != 0) goto reset_cabac; } - if (IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue; + if (is_jccr && !u_has_coeffs) goto reset_cabac; if (u_has_coeffs) { - - uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, + uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); + if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { - uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type); + uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } - uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, + uvg_itransform2d(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? 
COLOR_U : COLOR_V, pred_cu); } else { - uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width); + uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height); } + if (transforms[i] != JCCR_1) { for (int j = 0; j < width * height; j++) { u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((uvg_pixel)(u_pred[j] + u_recon_resi[j])); @@ -603,24 +711,28 @@ void uvg_chroma_transform_search( else { uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width); } - if (v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) { - uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V, + + + if (v_has_coeffs && !is_jccr) { + uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, height, COLOR_V, pred_cu->type, transforms[i] == CHROMA_TS); + if (transforms[i] != CHROMA_TS) { if (pred_cu->cr_lfnst_idx) { - uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type); + uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type, state->collocated_luma_mode); } - uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, + uvg_itransform2d(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V, pred_cu); } else { - uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width); + uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height); } + for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]); } } - else if (u_has_coeffs && IS_JCCR_MODE(transforms[i])) { + else if (u_has_coeffs && is_jccr) { if (transforms[i] == JCCR_1) { for (int j = 0; j < width * height; j++) { v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + u_recon_resi[j]); @@ -641,19 +753,17 @@ void uvg_chroma_transform_search( uvg_pixels_blit(v_pred, &v_recon[trans_offset * i], width, height, width, width); } - unsigned ssd_u = 0; - unsigned ssd_v = 0; if (!state->encoder_control->cfg.lossless) { ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i], LCU_WIDTH_C, width, - width); + width, height); ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i], LCU_WIDTH_C, width, - width); + width, height); + ssd_u = (double)ssd_u * state->chroma_weights[1]; + ssd_v = (double)ssd_v * state->chroma_weights[2]; } - double u_bits = 0; - double v_bits = 0; state->search_cabac.update = 1; int cbf_u = transforms[i] & 2 || (u_has_coeffs && !(transforms[i] & 1)); @@ -677,33 +787,40 @@ void uvg_chroma_transform_search( transforms[i] == CHROMA_TS, u_bits, "tr_skip_u" ); } - double coeff_cost = uvg_get_coeff_cost( - state, - u_quant_coeff, - pred_cu, - width, - COLOR_U, - scan_order, - transforms[i] == CHROMA_TS); - u_bits += coeff_cost; + if(u_coeff_cost == 0) { + u_coeff_cost = uvg_get_coeff_cost( + state, + u_quant_coeff, + pred_cu, + cu_loc, + COLOR_U, + SCAN_DIAG, + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); + } } - if (cbf_v && !IS_JCCR_MODE(transforms[i])) { + if (cbf_v && !is_jccr) { if (can_use_tr_skip) { CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma, transforms[i] == CHROMA_TS, v_bits, "tr_skip_v" ); } - v_bits += uvg_get_coeff_cost( - state, - v_quant_coeff, - pred_cu, - width, - COLOR_V, - 
scan_order, - transforms[i] == CHROMA_TS); + if (v_coeff_cost == 0) { + v_coeff_cost = uvg_get_coeff_cost( + state, + v_quant_coeff, + pred_cu, + cu_loc, + COLOR_V, + SCAN_DIAG, + transforms[i] == CHROMA_TS, + COEFF_ORDER_LINEAR); + } } + u_bits += u_coeff_cost; + v_bits += v_coeff_cost; if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst && 0) { - if(uvg_is_lfnst_allowed(state, pred_cu, width, height, 0, 0 , UVG_CHROMA_T, COLOR_UV, lcu)) { + if(uvg_is_lfnst_allowed(state, pred_cu, UVG_CHROMA_T, COLOR_UV, cu_loc, lcu)) { const int lfnst_idx = pred_cu->cr_lfnst_idx; CABAC_FBITS_UPDATE( &state->search_cabac, @@ -723,25 +840,35 @@ void uvg_chroma_transform_search( pred_cu->lfnst_last_scan_pos = false; pred_cu->violates_lfnst_constrained_chroma = false; } - if (!IS_JCCR_MODE(transforms[i])) { - double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->frame->lambda; - double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->frame->lambda; + + if (!is_jccr) { + double u_cost = UVG_CHROMA_MULT * ssd_u + u_bits * state->lambda; + double v_cost = UVG_CHROMA_MULT * ssd_v + v_bits * state->lambda; if (u_cost < chorma_ts_out->best_u_cost) { chorma_ts_out->best_u_cost = u_cost; chorma_ts_out->best_u_index = u_has_coeffs ? transforms[i] : NO_RESIDUAL; + chorma_ts_out->u_bits = u_bits; + chorma_ts_out->u_distortion = ssd_u; } if (v_cost < chorma_ts_out->best_v_cost) { chorma_ts_out->best_v_cost = v_cost; chorma_ts_out->best_v_index = v_has_coeffs ? transforms[i] : NO_RESIDUAL; + chorma_ts_out->v_bits = v_bits; + chorma_ts_out->v_distortion = ssd_v; } } else { - double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->frame->lambda; - if (cost < chorma_ts_out->best_combined_cost) { + double cost = UVG_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->lambda; + if (cost < chorma_ts_out->best_combined_cost && cost < chorma_ts_out->best_u_cost + chorma_ts_out->best_v_cost) { chorma_ts_out->best_combined_cost = cost; chorma_ts_out->best_combined_index = transforms[i]; + chorma_ts_out->u_bits = u_bits; + chorma_ts_out->u_distortion = ssd_u; + chorma_ts_out->v_bits = v_bits; + chorma_ts_out->v_distortion = ssd_v; } } +reset_cabac: memcpy(&state->search_cabac, temp_cabac, sizeof(cabac_data_t)); } } @@ -786,12 +913,52 @@ void uvg_fwd_lfnst_NxN(coeff_t *src, coeff_t *dst, const int8_t mode, const int8 } } -static inline bool get_transpose_flag(const int8_t intra_mode) +static uint32_t get_lfnst_intra_mode(int mode) +{ + uint32_t intraMode; + + if (mode < 0) + { + intraMode = (uint32_t)(mode + (NUM_EXT_LUMA_MODE >> 1) + NUM_LUMA_MODE); + } + else if (mode >= NUM_LUMA_MODE) + { + intraMode = (uint32_t)(mode + (NUM_EXT_LUMA_MODE >> 1)); + } + else + { + intraMode = (uint32_t)mode; + } + + return intraMode; +} + +static bool get_transpose_flag(const int8_t intra_mode) { return ((intra_mode >= NUM_LUMA_MODE) && (intra_mode >= (NUM_LUMA_MODE + (NUM_EXT_LUMA_MODE >> 1)))) || ((intra_mode < NUM_LUMA_MODE) && (intra_mode > DIA_IDX)); } + +static inline bool block_is_mip(const cu_info_t * const cur_cu, const color_t color, const bool is_sep_tree) +{ + if (cur_cu->type == CU_INTRA) { + if (color == COLOR_Y) { + return cur_cu->intra.mip_flag; + } + else { + // MIP_TODO: currently, only chroma 420 is supported. Therefore this will always return false + + //bool derived_mode = cur_cu->intra.mode_chroma == (!cur_cu->intra.mip_flag ? 
cur_cu->intra.mode : 0); + //bool is_chroma_mip = !is_sep_tree /*&& chroma_format == CHROMA_444*/ && cur_cu->intra.mip_flag; + //return is_chroma_mip && derived_mode; + + return false; + } + } + return false; +} + void uvg_fwd_lfnst( const cu_info_t* const cur_cu, const int width, @@ -799,42 +966,47 @@ void uvg_fwd_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + int8_t luma_mode) { const uint16_t lfnst_index = lfnst_idx; + const uint32_t log2_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; - const int depth = cur_cu->depth; - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; + // This check is safe for 8x16 cus split with TT, since it is checking the dimensions of the + // last luma CU which will be 8x4, i.e., 3 + 2 < 6 + bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] - bool is_mip = cur_cu->type == CU_INTRA ? cur_cu->intra.mip_flag : false; - bool is_wide_angle = false; // TODO: get wide angle mode when implemented + bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); + + const int scan_order = SCAN_DIAG; - const int cu_type = cur_cu->type; - - const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); - - if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) + if (lfnst_index && !mts_skip && (color == COLOR_Y || is_separate_tree)) { - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; - assert(log2_block_size != -1 && "LFNST: invalid block width."); + assert(log2_width != -1 && "LFNST: invalid block width."); const bool whge3 = width >= 8 && height >= 8; - const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; + const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { - intra_mode = cur_cu->intra.mode; + intra_mode = luma_mode; } - if (is_mip) { + if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); - - if (is_wide_angle) { - // Transform wide angle mode to intra mode - intra_mode = intra_mode; // TODO: wide angle modes not implemented yet. Do nothing. - } + int32_t wide_adjusted_mode = uvg_wide_angle_correction( + intra_mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + true + ); + + // Transform wide angle mode to intra mode + intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); + bool transpose = get_transpose_flag(intra_mode); const int sb_size = whge3 ? 
8 : 4; @@ -933,44 +1105,45 @@ void uvg_inv_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type) + enum uvg_tree_type tree_type, + int8_t luma_mode) { // In VTM, max log2 dynamic range is something in range [15, 20] depending on whether extended precision processing is enabled // Such is not yet present in uvg266 so use 15 for now const int max_log2_dyn_range = 15; const uint32_t lfnst_index = lfnst_idx; + const uint32_t log2_width = uvg_g_convert_to_log2[width]; + const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; - const int depth = cur_cu->depth; - bool is_separate_tree = depth == 4 || tree_type != UVG_BOTH_T; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; + bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83] - bool is_mip = cur_cu->type == CU_INTRA && tree_type != UVG_CHROMA_T ? cur_cu->intra.mip_flag : false; - bool is_wide_angle = false; // TODO: get wide angle mode when implemented - - const int cu_type = cur_cu->type; - - const int scan_order = uvg_get_scan_order(cu_type, intra_mode, depth); + bool is_mip = block_is_mip(cur_cu, color, is_separate_tree); + const int scan_order = SCAN_DIAG; - if (lfnst_index && !mts_skip && (is_separate_tree || color == COLOR_Y)) { - const uint32_t log2_block_size = uvg_g_convert_to_bit[width] + 2; + if (lfnst_index && !mts_skip && (color == COLOR_Y || is_separate_tree)) { const bool whge3 = width >= 8 && height >= 8; - const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_block_size] : uvg_g_sig_last_scan[scan_order][log2_block_size - 1]; + const uint32_t* scan = whge3 ? uvg_coef_top_left_diag_scan_8x8[log2_width] : uvg_g_sig_last_scan[scan_order][log2_width - 1]; if (is_cclm_mode) { - intra_mode = cur_cu->intra.mip_flag ? 0 : cur_cu->intra.mode; + intra_mode = luma_mode; } - if (is_mip) { + if (is_mip && color == COLOR_Y) { intra_mode = 0; // Set to planar mode } assert(intra_mode < NUM_INTRA_MODE && "LFNST: Invalid intra mode."); assert(lfnst_index < 3 && "LFNST: Invalid LFNST index. Must be in [0, 2]"); + int32_t wide_adjusted_mode = uvg_wide_angle_correction( + intra_mode, + color == COLOR_Y ? cur_cu->log2_width : log2_width, + color == COLOR_Y ? cur_cu->log2_height : log2_height, + true + ); - if (is_wide_angle) { - // Transform wide angle mode to intra mode - intra_mode = intra_mode; // TODO: wide angle modes not implemented yet. Do nothing. - } - + + intra_mode = get_lfnst_intra_mode(wide_adjusted_mode); + bool transpose_flag = get_transpose_flag(intra_mode); const int sb_size = whge3 ? 
8 : 4; bool tu_4x4_flag = (width == 4 && height == 4); @@ -1053,7 +1226,7 @@ void uvg_inv_lfnst( */ int uvg_quantize_residual_trskip( encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, + const cu_info_t *const cur_cu, const int width, const int height, const color_t color, const coeff_scan_order_t scan_order, int8_t *trskip_out, const int in_stride, const int out_stride, const uvg_pixel *const ref_in, const uvg_pixel *const pred_in, @@ -1074,7 +1247,7 @@ int uvg_quantize_residual_trskip( //noskip.cost += uvg_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; skip.has_coeffs = uvg_quantize_residual( - state, cur_cu, width, color, scan_order, + state, cur_cu, width, height, color, scan_order, 1, in_stride, width, ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj, UVG_BOTH_T /* tree type doesn't matter for transformskip*/); @@ -1090,13 +1263,15 @@ if (best->has_coeffs || rec_out != pred_in) { // If there is no residual and reconstruction is already in rec_out, // we can skip this. - uvg_pixels_blit(best->rec, rec_out, width, width, width, out_stride); + uvg_pixels_blit(best->rec, rec_out, width, height, width, out_stride); } - copy_coeffs(best->coeff, coeff_out, width); + // TODO: copying coeffs here is very suspect + copy_coeffs(best->coeff, coeff_out, width, height, width); return best->has_coeffs; } + /** * Calculate the residual coefficients for a single TU. * @@ -1105,14 +1280,15 @@ static void quantize_tr_residual( encoder_state_t * const state, const color_t color, - const int32_t x, - const int32_t y, - const uint8_t depth, + const cu_loc_t *cu_loc, cu_info_t *cur_pu, lcu_t* lcu, bool early_skip, enum uvg_tree_type tree_type) { + const int x = cu_loc->x; + const int y = cu_loc->y; + const uvg_config *cfg = &state->encoder_control->cfg; const int32_t shift = color == COLOR_Y ? 0 : 1; const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift}; @@ -1120,9 +1296,10 @@ static void quantize_tr_residual( // If luma is 4x4, do chroma for the 8x8 luma area when handling the top // left PU because the coordinates are correct. bool handled_elsewhere = color != COLOR_Y && - depth == MAX_DEPTH && + cur_pu->log2_width + cur_pu->log2_height < 6 && (x % 4 != 0 || y % 4 != 0); if (handled_elsewhere) { + assert(0); return; } @@ -1130,44 +1307,44 @@ // This should ensure that the CBF data doesn't get corrupted if this function // is called more than once. - int32_t tr_width; - if (color == COLOR_Y) { - tr_width = LCU_WIDTH >> depth; - } else { - const int chroma_depth = (depth == MAX_PU_DEPTH ? depth - 1 : depth); - tr_width = LCU_WIDTH_C >> chroma_depth; - } + const int32_t tr_width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width; + const int32_t tr_height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height; + const int32_t lcu_width = LCU_WIDTH >> shift; const int8_t mode = (color == COLOR_Y) ? cur_pu->intra.mode : cur_pu->intra.mode_chroma; - const coeff_scan_order_t scan_idx = - uvg_get_scan_order(cur_pu->type, mode, depth); + + const coeff_scan_order_t scan_idx = SCAN_DIAG; const int offset = lcu_px.x + lcu_px.y * lcu_width; - const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); + //const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y); // Pointers to current location in arrays with prediction. The // reconstruction will be written to this array.
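A note on the layout change in this hunk: with xy_to_zorder retired, each TU's coefficients are staged in a local TR_MAX_WIDTH * TR_MAX_WIDTH buffer and then copied into the LCU-wide planes at raster offset lcu_px.x + lcu_px.y * lcu_width. A minimal self-contained sketch of that row-by-row copy follows; the helper name is hypothetical, and coeff_t is assumed to match uvg266's int16_t coefficient typedef:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef int16_t coeff_t; /* assumption: matches uvg266's coefficient type */

/* Copy a tr_width x tr_height TU into the LCU-wide raster-order plane, or
 * clear its footprint when the TU produced no coefficients. This restates
 * the memcpy/memset loops added further down in this hunk. */
static void stage_tu_coeffs(coeff_t *dst_coeff, const coeff_t *coeff,
                            int tr_width, int tr_height, int lcu_width,
                            bool has_coeffs)
{
  for (int j = 0; j < tr_height; ++j) {
    if (has_coeffs) {
      memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width],
             tr_width * sizeof(coeff_t));
    } else {
      memset(&dst_coeff[j * lcu_width], 0, tr_width * sizeof(coeff_t));
    }
  }
}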
uvg_pixel *pred = NULL; // Pointers to current location in arrays with reference. const uvg_pixel *ref = NULL; - // Pointers to current location in arrays with quantized coefficients. - coeff_t *coeff = NULL; + // Temp coeff array + coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + coeff_t *dst_coeff = NULL; switch (color) { case COLOR_Y: - pred = &lcu->rec.y[offset]; - ref = &lcu->ref.y[offset]; - coeff = &lcu->coeff.y[z_index]; + pred = &lcu->rec.y[offset]; + ref = &lcu->ref.y[offset]; + dst_coeff = &lcu->coeff.y[lcu_px.x + lcu_px.y * lcu_width]; break; case COLOR_U: - pred = &lcu->rec.u[offset]; - ref = &lcu->ref.u[offset]; - coeff = &lcu->coeff.u[z_index]; + pred = &lcu->rec.u[offset]; + ref = &lcu->ref.u[offset]; + dst_coeff = &lcu->coeff.u[lcu_px.x + lcu_px.y * lcu_width]; break; case COLOR_V: - pred = &lcu->rec.v[offset]; - ref = &lcu->ref.v[offset]; - coeff = &lcu->coeff.v[z_index]; + pred = &lcu->rec.v[offset]; + ref = &lcu->ref.v[offset]; + dst_coeff = &lcu->coeff.v[lcu_px.x + lcu_px.y * lcu_width]; + break; + case COLOR_UV: + dst_coeff = &lcu->coeff.joint_uv[lcu_px.x + lcu_px.y * lcu_width]; break; default: break; @@ -1187,6 +1364,7 @@ static void quantize_tr_residual( if (cfg->lossless) { has_coeffs = bypass_transquant(tr_width, + tr_height, lcu_width, // in stride lcu_width, // out stride ref, @@ -1196,9 +1374,9 @@ static void quantize_tr_residual( if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) { // implicit rdpcm for horizontal and vertical intra modes if (mode == 18) { - rdpcm(tr_width, RDPCM_HOR, coeff); + rdpcm(tr_width, tr_height, RDPCM_HOR, coeff); } else if (mode == 50) { - rdpcm(tr_width, RDPCM_VER, coeff); + rdpcm(tr_width, tr_height, RDPCM_VER, coeff); } } @@ -1209,6 +1387,7 @@ static void quantize_tr_residual( has_coeffs = uvg_quantize_residual_trskip(state, cur_pu, tr_width, + tr_height, color, scan_idx, &tr_skip, @@ -1225,24 +1404,37 @@ static void quantize_tr_residual( state, cur_pu, tr_width, + tr_height, scan_idx, lcu_width, lcu_width, &lcu->ref.u[offset], &lcu->ref.v[offset], &lcu->rec.u[offset], &lcu->rec.v[offset], &lcu->rec.u[offset], &lcu->rec.v[offset], - &lcu->coeff.joint_uv[z_index], + coeff, early_skip, lmcs_chroma_adj, tree_type ); cur_pu->joint_cb_cr = has_coeffs; + if (has_coeffs) { + for (int j = 0; j < tr_height; ++j) { + memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + } + cbf_set(&cur_pu->cbf, COLOR_U); + } + else { + for (int j = 0; j < tr_height; ++j) { + memset(&dst_coeff[j * lcu_width], 0, (sizeof(coeff_t) * tr_width)); + } + } return; } has_coeffs = uvg_quantize_residual(state, cur_pu, tr_width, + tr_height, color, scan_idx, false, // tr skip @@ -1258,11 +1450,18 @@ static void quantize_tr_residual( } - cbf_clear(&cur_pu->cbf, depth, color); + cbf_clear(&cur_pu->cbf, color); if (has_coeffs) { - cbf_set(&cur_pu->cbf, depth, color); + for (int j = 0; j < tr_height; ++j) { + memcpy(&dst_coeff[j * lcu_width], &coeff[j * tr_width], tr_width * sizeof(coeff_t)); + } + cbf_set(&cur_pu->cbf, color); + } + else { + for (int j = 0; j < tr_height; ++j) { + memset(&dst_coeff[j * lcu_width], 0, (sizeof(coeff_t) * tr_width)); + } } - } /** @@ -1287,15 +1486,17 @@ void uvg_quantize_lcu_residual( const bool luma, const bool chroma, const bool jccr, - const int32_t x, - const int32_t y, - const uint8_t depth, + const cu_loc_t * cu_loc, cu_info_t *cur_pu, lcu_t* lcu, bool early_skip, enum uvg_tree_type tree_type) { - const int32_t width = LCU_WIDTH >> depth; + const int x = cu_loc->x; + const int y = cu_loc->y; + const 
int width = cu_loc->width; + const int height = cu_loc->height; + const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; if (cur_pu == NULL) { @@ -1304,7 +1505,10 @@ void uvg_quantize_lcu_residual( // Tell clang-analyzer what is up. For some reason it can't figure out from // asserting just depth. - assert(width == 4 || + // Widths 1 and 2 were thought to be possible with ISP blocks // ISP_TODO: they are not; drop them from the assert once verified + assert(width == 1 || + width == 2 || + width == 4 || width == 8 || width == 16 || width == 32 || @@ -1312,54 +1516,79 @@ // Reset CBFs because CBFs might have been set // for depth earlier + // ISP_TODO: does this cur_cu point to the correct place when ISP is used for small blocks? if (luma) { - cbf_clear(&cur_pu->cbf, depth, COLOR_Y); + cbf_clear(&cur_pu->cbf, COLOR_Y); } if (chroma || jccr) { - cbf_clear(&cur_pu->cbf, depth, COLOR_U); - cbf_clear(&cur_pu->cbf, depth, COLOR_V); + cbf_clear(&cur_pu->cbf, COLOR_U); + cbf_clear(&cur_pu->cbf, COLOR_V); } - if (depth == 0 || cur_pu->tr_depth > depth) { - - // Split transform and increase depth - const int offset = width / 2; - const int32_t x2 = x + offset; - const int32_t y2 = y + offset; - - // jccr is currently not supported if transform is split - uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x, y2, depth + 1, NULL, lcu, early_skip, tree_type); - uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip, tree_type); - - // Propagate coded block flags from child CUs to parent CU. - uint16_t child_cbfs[3] = { - LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, - LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, - }; - - if (depth <= MAX_DEPTH) { - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y); - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U); - cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V); + if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) { + enum split_type split; + if (cu_loc->width > TR_MAX_WIDTH && cu_loc->height > TR_MAX_WIDTH) { + split = QT_SPLIT; } + else if (cu_loc->width > TR_MAX_WIDTH) { + split = BT_VER_SPLIT; + } + else { + split = BT_HOR_SPLIT; + } + + cu_loc_t split_cu_loc[4]; + uint16_t child_cbfs[3]; + const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc, NULL); + + for (int i = 0; i < split_count; ++i) { + uvg_quantize_lcu_residual(state, luma, chroma, 0, &split_cu_loc[i], NULL, lcu, early_skip, tree_type); + if(i != 0) { + child_cbfs[i - 1] = LCU_GET_CU_AT_PX(lcu, split_cu_loc[i].local_x, split_cu_loc[i].local_y)->cbf; + } + } + + + cur_pu->root_cbf = cbf_is_set_any(cur_pu->cbf) + || cbf_is_set_any(child_cbfs[0]) + || cbf_is_set_any(child_cbfs[1]) + || cbf_is_set_any(child_cbfs[2]); + } else { // Process a leaf TU.
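Before the leaf-TU path continues, a minimal standalone sketch of the implicit transform-split rule introduced above: an oversized TU is quad-split when both dimensions exceed the maximum transform size, split vertically when only the width does, and horizontally when only the height does. The enum and function names below are hypothetical stand-ins for the patch's split_type values:

#include <assert.h>

enum tu_split_sketch { TU_QT_SPLIT, TU_BT_HOR_SPLIT, TU_BT_VER_SPLIT };

/* tr_max stands in for TR_MAX_WIDTH; at least one dimension must exceed it. */
static enum tu_split_sketch implicit_tu_split(int width, int height, int tr_max)
{
  assert(width > tr_max || height > tr_max);
  if (width > tr_max && height > tr_max) {
    return TU_QT_SPLIT;      /* both too large: four sub-TUs */
  } else if (width > tr_max) {
    return TU_BT_VER_SPLIT;  /* too wide: left/right halves */
  } else {
    return TU_BT_HOR_SPLIT;  /* too tall: top/bottom halves */
  }
}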
+ cu_loc_t loc; + uvg_cu_loc_ctor(&loc, x, y, width, height); + if (luma) { - quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_Y, &loc, cur_pu, lcu, early_skip, tree_type); } + double c_lambda = state->c_lambda; + state->c_lambda = uvg_calculate_chroma_lambda(state, state->encoder_control->cfg.jccr, cur_pu->joint_cb_cr); if (chroma) { - quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip, tree_type); - quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip, tree_type); + state->rate_estimator[2].needs_init = true; + if(state->encoder_control->cfg.dep_quant) { + cabac_data_t temp_cabac; + memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t)); + state->search_cabac.update = 1; + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + cu_loc_t temp_chroma_loc; + uvg_cu_loc_ctor(&temp_chroma_loc, (cu_loc->x >> 1) % LCU_WIDTH_C, (cu_loc->y >> 1) % LCU_WIDTH_C, cu_loc->width, cu_loc->height); + uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &temp_chroma_loc, COLOR_U, 0, (cur_pu->tr_skip & 2) >> 1, COEFF_ORDER_CU); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t)); + } + else { + quantize_tr_residual(state, COLOR_U, &loc, cur_pu, lcu, early_skip, tree_type); + quantize_tr_residual(state, COLOR_V, &loc, cur_pu, lcu, early_skip, tree_type); + } } - if (jccr && cur_pu->tr_depth == cur_pu->depth) { - quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip, tree_type); + if (jccr && PU_IS_TU(cur_pu)) { + quantize_tr_residual(state, COLOR_UV, &loc, cur_pu, lcu, early_skip, tree_type); } - if(chroma && jccr && cur_pu->tr_depth == cur_pu->depth) { + if(chroma && jccr && PU_IS_TU(cur_pu)) { assert( 0 && "Trying to quantize both jccr and regular at the same time.\n"); } + state->c_lambda = c_lambda; } } diff --git a/src/transform.h b/src/transform.h index d3f44edf..be485f46 100644 --- a/src/transform.h +++ b/src/transform.h @@ -44,23 +44,28 @@ #include "global.h" // IWYU pragma: keep extern const uint8_t uvg_g_chroma_scale[58]; -extern const int16_t uvg_g_inv_quant_scales[6]; -extern const int16_t uvg_g_quant_scales[6]; +extern const int16_t uvg_g_inv_quant_scales[2][6]; +extern const int16_t uvg_g_quant_scales[2][6]; -void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); -void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size); +#define COEFF_ORDER_LINEAR 0 +#define COEFF_ORDER_CU 1 + +void uvg_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); +void uvg_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t width, int8_t height); void uvg_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); void uvg_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, - int8_t block_size, + int8_t block_width, + int8_t block_height, color_t color, const cu_info_t *tu); @@ -69,11 +74,12 @@ int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t con void uvg_derive_lfnst_constraints( cu_info_t* const pred_cu, - const int depth, bool* constraints, const coeff_t* coeff, const int 
width, - const int height); + const int height, + const vector2d_t * const lcu_px, + color_t color); typedef struct { double best_u_cost; @@ -82,6 +88,10 @@ typedef struct { int best_u_index; int best_v_index; int best_combined_index; + uint64_t u_distortion; + uint64_t v_distortion; + double u_bits; + double v_bits; } uvg_chorma_ts_out_t; void uvg_quantize_lcu_residual( @@ -89,9 +99,7 @@ void uvg_quantize_lcu_residual( bool luma, bool chroma, const bool jccr, - int32_t x, - int32_t y, - uint8_t depth, + const cu_loc_t* cu_loc, cu_info_t *cur_cu, lcu_t* lcu, bool early_skip, @@ -99,13 +107,10 @@ void uvg_chroma_transform_search( encoder_state_t* const state, - int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - int8_t width, - int8_t height, + const cu_loc_t* const cu_loc, const int offset, - const uint8_t mode, cu_info_t* pred_cu, uvg_pixel u_pred[1024], uvg_pixel v_pred[1024], @@ -130,7 +135,8 @@ void uvg_fwd_lfnst( const color_t color, const uint16_t lfnst_idx, coeff_t *coeffs, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + int8_t luma_mode); void uvg_inv_lfnst( const cu_info_t* cur_cu, @@ -139,6 +145,7 @@ const color_t color, const uint16_t lfnst_idx, coeff_t* coeffs, - enum uvg_tree_type tree_type); + enum uvg_tree_type tree_type, + int8_t luma_mode); #endif diff --git a/src/uvg266.h b/src/uvg266.h index 3bec7756..c71a835a 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -338,7 +338,6 @@ typedef struct uvg_config int32_t trskip_max_size; /*!< \brief Transform skip max block size. */ enum uvg_mts mts; /*< \brief flag to enable multiple transform selection*/ int32_t mts_implicit; /*< \brief flag to enable implicit multiple transform selection*/ - int32_t tr_depth_intra; /*!< \brief Maximum transform depth for intra. */ enum uvg_ime_algorithm ime_algorithm; /*!< \brief Integer motion estimation algorithm. */ int32_t fme_level; /*!< \brief Fractional pixel motion estimation level (0: disabled, 1: enabled).
*/ int8_t source_scan_type; /*!< \brief Source scan type (0: progressive, 1: top field first, 2: bottom field first).*/ @@ -452,7 +451,7 @@ typedef struct uvg_config /** \brief Flag to enable/disable open GOP configuration */ int8_t open_gop; - int32_t vaq; /** \brief Enable variance adaptive quantization*/ + int32_t vaq; /** \brief Enable variance adaptive quantization*/ /** \brief Type of scaling lists to use */ int8_t scaling_list; @@ -526,6 +525,8 @@ typedef struct uvg_config /** \brief enable low frequency non-separable transform */ int8_t lfnst; + /** \brief enable intra sub partitions*/ + int8_t isp; int8_t jccr; @@ -542,9 +543,16 @@ typedef struct uvg_config uint8_t dual_tree; + uint8_t min_qt_size[3]; /* intra, inter, dual tree chroma*/ + uint8_t max_bt_size[3]; /* intra, inter, dual tree chroma*/ + uint8_t max_tt_size[3]; /* intra, inter, dual tree chroma*/ + + uint8_t max_btt_depth[3]; /* intra, inter, dual tree chroma*/ + uint8_t intra_rough_search_levels; uint8_t ibc; /* \brief Intra Block Copy parameter */ + uint8_t dep_quant; } uvg_config; /** diff --git a/src/videoframe.c b/src/videoframe.c index f5a4d8af..e9a43dc1 100644 --- a/src/videoframe.c +++ b/src/videoframe.c @@ -61,7 +61,7 @@ videoframe_t * uvg_videoframe_alloc(int32_t width, frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu); if (cclm) { assert(chroma_format == UVG_CSP_420); - frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4); + frame->cclm_luma_rec = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 15) & ~7) + FRAME_PADDING_LUMA) / 4); frame->cclm_luma_rec_top_line = MALLOC(uvg_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64)); } } diff --git a/tests/check_cabac_state_consistency.py b/tests/check_cabac_state_consistency.py index 4d7f970c..73a1dd72 100644 --- a/tests/check_cabac_state_consistency.py +++ b/tests/check_cabac_state_consistency.py @@ -30,7 +30,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int with open(state_file, "rb") as file: try: while True: - type_, x, y, depth, tree_type = file.read(15).decode().split() + type_, x, y, depth, tree_type = file.read(23).decode().split() # Reset stored data at the beginning of the frame if x == '0' and y == '0' and type_ == "S" and tree_type != "2": if not was_zero_last: @@ -38,7 +38,7 @@ def main(state_file: Path, ctx_names: list, ctx_count: int = 332, ctx_size: int ctx_store = dict() e_store = set() was_zero_last = True - else: + elif int(x) >= 64 and int(y) >= 64: was_zero_last = False ctx = file.read(ctx_count * ctx_size) diff --git a/tests/mts_tests.c b/tests/mts_tests.c index f607b77d..61f9fb2c 100644 --- a/tests/mts_tests.c +++ b/tests/mts_tests.c @@ -111,7 +111,8 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); + tu.intra.isp_mode = 0; + mts_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo*NUM_SIZES+block], dct_result[trafo][block], UVG_MTS_BOTH); } } } @@ -134,7 +135,8 @@ static void setup_tests() tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; - idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], 
idct_result[trafo][block], UVG_MTS_BOTH); + tu.intra.isp_mode = 0; + idct_generic(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + block), 1 << (LCU_MIN_LOG_W + block), dct_bufs[trafo * NUM_SIZES + block], idct_result[trafo][block], UVG_MTS_BOTH); } } @@ -156,6 +158,7 @@ TEST dct(void) { char testname[100]; for (int blocksize = 0; blocksize < NUM_SIZES; blocksize++) { + size_t size = 1 << (LCU_MIN_LOG_W + blocksize); for (int trafo = 0; trafo < NUM_TRANSFORM; trafo++) { sprintf(testname, "Block: %d x %d, trafo: %d", 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), trafo); cu_info_t tu; @@ -163,14 +166,20 @@ TEST dct(void) tu.tr_idx = MTS_DST7_DST7 + trafo; tu.lfnst_idx = 0; tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); - for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { - ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); + for (int y = 0; y < size; ++y) { + if (y>= 16) break; + for (int x = 0; x < size; ++x) { + if (x >= 16) break; + int i = y * size + x; + ASSERT_EQm(testname, test_result[i], dct_result[trafo][blocksize][i]); + } } //fprintf(stderr, "PASS: %s\r\n", testname); } @@ -188,11 +197,14 @@ TEST idct(void) cu_info_t tu; tu.type = CU_INTRA; tu.tr_idx = MTS_DST7_DST7 + trafo; + tu.lfnst_idx = 0; + tu.cr_lfnst_idx = 0; + tu.intra.isp_mode = 0; int16_t* buf = dct_bufs[trafo * NUM_SIZES + blocksize]; ALIGNED(32) int16_t test_result[LCU_WIDTH * LCU_WIDTH] = { 0 }; - test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); + test_env.tested_func(UVG_BIT_DEPTH, COLOR_Y, &tu, 1 << (LCU_MIN_LOG_W + blocksize), 1 << (LCU_MIN_LOG_W + blocksize), buf, test_result, UVG_MTS_BOTH); for (int i = 0; i < LCU_WIDTH * LCU_WIDTH; ++i) { ASSERT_EQm(testname, test_result[i], idct_result[trafo][blocksize][i]); diff --git a/tests/mv_cand_tests.c b/tests/mv_cand_tests.c index 84ab9328..849fec2d 100644 --- a/tests/mv_cand_tests.c +++ b/tests/mv_cand_tests.c @@ -46,8 +46,11 @@ TEST test_get_spatial_merge_cand(void) merge_candidates_t cand = { 0 }; - get_spatial_merge_candidates(64 + 32, 64, // x, y - 32, 24, // width, height + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, 64 + 32, 64, // x, y + 32, 24); // width, height) + + get_spatial_merge_candidates(&cu_loc, 1920, 1080, // picture size &lcu, &cand, diff --git a/tests/test_cabac_state.sh b/tests/test_cabac_state.sh index 519f9c40..6d60d1da 100755 --- a/tests/test_cabac_state.sh +++ b/tests/test_cabac_state.sh @@ -6,10 +6,10 @@ set -eu cabacfile="$(mktemp)" -valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --no-cpuid --preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" -valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" +valgrind_test 256x128 10 yuv420p --no-cpuid 
--preset veryslow --pu-depth-intra 0-8 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}" python3 check_cabac_state_consistency.py "${cabacfile}" rm -rf "${cabacfile}" diff --git a/tests/test_intra.sh b/tests/test_intra.sh index 3c37f82b..ea58b415 100755 --- a/tests/test_intra.sh +++ b/tests/test_intra.sh @@ -19,3 +19,5 @@ valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra valgrind_test $common_args --rd=3 --cclm --jccr valgrind_test $common_args --lfnst valgrind_test $common_args --lfnst --rd=3 --cclm --mip --dual-tree --fast-residual-cost 0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 --fast-residual-cost 0 +valgrind_test $common_args --rd=2 --isp --cpuid=0 --lfnst --mts=intra --fast-residual-cost 0 diff --git a/tests/test_mtt.sh b/tests/test_mtt.sh new file mode 100755 index 00000000..5fc5587b --- /dev/null +++ b/tests/test_mtt.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +# Test all-intra coding with MTT (multi-type tree) splits. + +set -eu + +. "${0%/*}/util.sh" + +common_args='264x130 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-cpuid --no-wpp --fast-residual-cost 0' +valgrind_test $common_args --rd=0 --mtt-depth-intra 1 --pu-depth-intra 2-3 +valgrind_test $common_args --rd=3 --mtt-depth-intra 1 --pu-depth-intra 0-5 +valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --pu-depth-intra 0-8 +valgrind_test $common_args --rd=3 --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8 +valgrind_test $common_args --rd=3 --rdoq --jccr --isp --lfnst --mip --mrl --mts intra --cclm --mtt-depth-intra 3 --mtt-depth-intra-chroma 3 --dual-tree --pu-depth-intra 0-8 diff --git a/tools/generate_tables.c b/tools/generate_tables.c index d50c889f..6bd2497e 100644 --- a/tools/generate_tables.c +++ b/tools/generate_tables.c @@ -51,7 +51,7 @@ static void init_sig_last_scan(uint32_t *buff_d, uint32_t *buff_h, uint32_t *buff_v, int32_t width, int32_t height) { - uint32_t num_scan_pos = width * width; + uint32_t num_scan_pos = width * height; uint32_t next_scan_pos = 0; int32_t xx, yy, x, y; uint32_t scan_line;
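On the final hunk: init_sig_last_scan previously sized the scan with width * width, which held only for the square blocks of a QT-only tree; with the rectangular blocks MTT introduces, the position count must be width * height. A minimal sketch of enumerating a rectangular block in up-right diagonal order under that corrected count (diag_scan_rect is a hypothetical name, not the tool's actual routine):

#include <stdint.h>

/* Enumerate a width x height block along up-right anti-diagonals, writing
 * raster indices into out. Fills exactly width * height entries, e.g. 32
 * for an 8x4 block rather than the 64 implied by width * width. */
static void diag_scan_rect(uint32_t *out, int32_t width, int32_t height)
{
  uint32_t n = 0;
  for (int32_t d = 0; d < width + height - 1; ++d) {
    /* start from the bottom-left end of each anti-diagonal */
    for (int32_t y = d < height ? d : height - 1; y >= 0 && d - y < width; --y) {
      out[n++] = (uint32_t)(y * width + (d - y));
    }
  }
  /* n == width * height at this point */
}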