From b0b2b0e536bb928b57bed9814a8a4713efebd0d7 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 29 Aug 2022 14:11:08 +0300 Subject: [PATCH 1/3] Try making ultrafast all intra a bit faster --- src/search.c | 21 +++++++++++---------- src/search_intra.c | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/search.c b/src/search.c index 2e594126..b76c169a 100644 --- a/src/search.c +++ b/src/search.c @@ -974,13 +974,14 @@ static double search_cu( else { intra_search.pred_cu.intra.mode_chroma = 0; } - uvg_intra_recon_cu(state, - x, y, - depth, &intra_search, - &intra_search.pred_cu, - lcu, - tree_type, false, true); - if(tree_type != UVG_CHROMA_T) { + + if(tree_type != UVG_CHROMA_T && ctrl->cfg.rdo >= 2) { + uvg_intra_recon_cu(state, + x, y, + depth, &intra_search, + &intra_search.pred_cu, + lcu, + tree_type, false, true); intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu); } else { @@ -1201,9 +1202,9 @@ static double search_cu( // the split costs at least as much as not splitting. if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { if (split_cost < cost) split_cost += search_cu(state, x, y, depth + 1, work_tree, tree_type); - if (split_cost < cost || 1) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type); - if (split_cost < cost || 1) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type); - if (split_cost < cost || 1) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type); + if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type); + if (split_cost < cost) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type); + if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type); } else { split_cost = INT_MAX; } diff --git a/src/search_intra.c b/src/search_intra.c index 9c7b4115..a314d5d3 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1075,7 +1075,7 @@ static uint8_t search_intra_rough( FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; - int offset = 4; + int offset = 8; search_proxy.pred_cu.intra.mode = 0; uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); search_proxy.pred_cu.intra.mode = 1; From ed6a0528fe99fb4d6ae73850da4eb504cb6517ae Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 30 Aug 2022 15:17:05 +0300 Subject: [PATCH 2/3] Further make things faster --- src/search_intra.c | 2 +- src/strategies/avx2/intra-avx2.c | 79 ++++++++++++++++---------------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/src/search_intra.c b/src/search_intra.c index a314d5d3..bd16b29b 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -975,7 +975,7 @@ static int16_t search_intra_rough( }*/ -static double count_bits( +static INLINE double count_bits( encoder_state_t* const state, int8_t* intra_preds, const double not_mrl, diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5450f3d2..53282e87 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -166,10 +166,8 @@ static void uvg_angular_pred_avx2( // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { - for (int i = 0; i <= width + 1 + multi_ref_index; i++) { - temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - } + memcpy(&temp_main[width], vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); + memcpy(&temp_side[width], vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1)); ref_main = temp_main + width; ref_side = temp_side + width; @@ -213,19 +211,15 @@ static void uvg_angular_pred_avx2( //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset]; } else { - - for (int i = 0; i <= (width << 1) + multi_ref_index; i++) { - temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]); - temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]); - } + + memcpy(temp_main, vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel)* (width * 2 + multi_ref_index + 1)); + memcpy(temp_side, vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel)* (width * 2 + multi_ref_index + 1)); const int s = 0; const int max_index = (multi_ref_index << s) + 2; const int ref_length = width << 1; const uvg_pixel val = temp_main[ref_length + multi_ref_index]; - for (int j = 0; j <= max_index; j++) { - temp_main[ref_length + multi_ref_index + j] = val; - } + memset(temp_main + ref_length + multi_ref_index, val, max_index + 1); ref_main = temp_main; ref_side = temp_side; @@ -245,12 +239,28 @@ static void uvg_angular_pred_avx2( ref_main += multi_ref_index; ref_side += multi_ref_index; + static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; + int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; + int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); + + bool use_cubic = true; // Default to cubic filter + if (dist_from_vert_or_hor > filter_threshold) { + if ((abs(sample_disp) & 0x1F) != 0) + { + use_cubic = false; + } + } + // Cubic must be used if ref line != 0 + if (multi_ref_index) { + use_cubic = true; + } + if (sample_disp != 0) { // The mode is not horizontal or vertical, we have to do interpolation. int_fast32_t delta_pos = sample_disp * multi_ref_index; - int_fast32_t delta_int[4] = { 0 }; - int_fast32_t delta_fract[4] = { 0 }; + int64_t delta_int[4] = { 0 }; + int16_t delta_fract[4] = { 0 }; for (int_fast32_t y = 0; y + 3 < width; y += 4) { for (int yy = 0; yy < 4; ++yy) { @@ -263,38 +273,27 @@ static void uvg_angular_pred_avx2( // Luma Channel if (channel_type == 0) { - - int64_t ref_main_index[4] = { 0 }; + int16_t f[4][4] = { { 0 } }; - - for (int yy = 0; yy < 4; ++yy) { - - ref_main_index[yy] = delta_int[yy]; - bool use_cubic = true; // Default to cubic filter - static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 }; - int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width]; - int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18)); - if (dist_from_vert_or_hor > filter_threshold) { - static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 }; - const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode; - const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; - if ((abs(sample_disp) & 0x1F) != 0) - { - use_cubic = false; - } + if (use_cubic) { + memcpy(f[0], cubic_filter[delta_fract[0]], 8); + memcpy(f[1], cubic_filter[delta_fract[1]], 8); + memcpy(f[2], cubic_filter[delta_fract[2]], 8); + memcpy(f[3], cubic_filter[delta_fract[3]], 8); + } + else { + for(int yy = 0; yy < 4; ++yy) { + const int16_t offset = (delta_fract[yy] >> 1); + f[yy][0] = 16 - offset; + f[yy][1] = 32 - offset; + f[yy][2] = 16 + offset; + f[yy][3] = offset; } - // Cubic must be used if ref line != 0 - if (multi_ref_index) { - use_cubic = true; - } - const int16_t filter_coeff[4] = { 16 - (delta_fract[yy] >> 1), 32 - (delta_fract[yy] >> 1), 16 + (delta_fract[yy] >> 1), delta_fract[yy] >> 1 }; - const int16_t *temp_f = use_cubic ? cubic_filter[delta_fract[yy]] : filter_coeff; - memcpy(f[yy], temp_f, 4 * sizeof(*temp_f)); } // Do 4-tap intra interpolation filtering uvg_pixel *p = (uvg_pixel*)ref_main; - __m256i vidx = _mm256_loadu_si256((__m256i *)ref_main_index); + __m256i vidx = _mm256_loadu_si256((__m256i *)delta_int); __m256i all_weights = _mm256_loadu_si256((__m256i *)f); __m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01); __m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23); From e636db489f046cc689ac47e2c70593b474a72ed4 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 28 Sep 2022 08:49:38 +0300 Subject: [PATCH 3/3] [cfg] Parameterize intra rough search granularity --- src/cfg.c | 9 +++++++++ src/cli.c | 8 ++++++++ src/search_intra.c | 4 ++-- src/uvg266.h | 2 ++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/cfg.c b/src/cfg.c index 6f3cbfef..843729a6 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -221,6 +221,7 @@ int uvg_config_init(uvg_config *cfg) cfg->cabac_debug_file_name = NULL; cfg->dual_tree = 0; + cfg->intra_rough_search_levels = 2; return 1; } @@ -1475,6 +1476,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value) else if OPT("dual-tree") { cfg->dual_tree = atobool(value); } + else if OPT("intra-rough-granularity") { + cfg->intra_rough_search_levels = atoi(value); + } else { return 0; } @@ -1838,6 +1842,11 @@ int uvg_config_validate(const uvg_config *const cfg) error = 1; } + if(cfg->intra_rough_search_levels > 4) { + fprintf(stderr, "intra-rough-granularity must be between [0..4].\n"); + error = 1; + } + return !error; } diff --git a/src/cli.c b/src/cli.c index 53f2df9b..6d0c13f6 100644 --- a/src/cli.c +++ b/src/cli.c @@ -191,6 +191,7 @@ static const struct option long_options[] = { { "dual-tree", no_argument, NULL, 0 }, { "no-dual-tree", no_argument, NULL, 0 }, { "cabac-debug-file", required_argument, NULL, 0 }, + { "intra-rough-granularity",required_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -615,6 +616,13 @@ void print_help(void) " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" " learning trees, overrides the\n" " --pu-depth-intra parameter. [disabled]\n" + " --intra-rough-granularity : How many levels are used for the\n" + " logarithmic intra rough search. 0..4\n" + " With 0 all of the modes are checked \n" + " in a single level, 1 checks every second\n" + " mode is checked on first level and then\n" + " second level checks the modes surrounding\n" + " the three best modes. [2]\n" " --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n" " on lower depth even when search is not\n" " performed on said depth. Should only\n" diff --git a/src/search_intra.c b/src/search_intra.c index bd16b29b..226c40c3 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -1075,7 +1075,7 @@ static uint8_t search_intra_rough( FILL(search_proxy, 0); search_proxy.pred_cu = *pred_cu; - int offset = 8; + int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels; search_proxy.pred_cu.intra.mode = 0; uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T); search_proxy.pred_cu.intra.mode = 1; @@ -1123,7 +1123,7 @@ static uint8_t search_intra_rough( best_six_modes[3].cost = MAX_DOUBLE; best_six_modes[4].cost = MAX_DOUBLE; best_six_modes[5].cost = MAX_DOUBLE; - for (int mode = 4; mode <= 66; mode += PARALLEL_BLKS * offset) { + for (int mode = 2 + offset / 2; mode <= 66; mode += PARALLEL_BLKS * offset) { double costs_out[PARALLEL_BLKS] = { 0 }; for (int i = 0; i < PARALLEL_BLKS; ++i) { diff --git a/src/uvg266.h b/src/uvg266.h index 1801c8ac..e2ad9597 100644 --- a/src/uvg266.h +++ b/src/uvg266.h @@ -541,6 +541,8 @@ typedef struct uvg_config char* cabac_debug_file_name; uint8_t dual_tree; + + uint8_t intra_rough_search_levels; } uvg_config; /**