Merge branch 'faster_all_intra' into 'master'

improve intra search

See merge request cs/ultravideo/vvc/uvg266!13
This commit is contained in:
Joose Sainio 2022-09-28 08:58:50 +03:00
commit c6764dfc66
6 changed files with 72 additions and 53 deletions

View file

@ -221,6 +221,7 @@ int uvg_config_init(uvg_config *cfg)
cfg->cabac_debug_file_name = NULL;
cfg->dual_tree = 0;
cfg->intra_rough_search_levels = 2;
return 1;
}
@ -1475,6 +1476,9 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
else if OPT("dual-tree") {
cfg->dual_tree = atobool(value);
}
else if OPT("intra-rough-granularity") {
cfg->intra_rough_search_levels = atoi(value);
}
else {
return 0;
}
@ -1838,6 +1842,11 @@ int uvg_config_validate(const uvg_config *const cfg)
error = 1;
}
if(cfg->intra_rough_search_levels > 4) {
fprintf(stderr, "intra-rough-granularity must be between [0..4].\n");
error = 1;
}
return !error;
}

View file

@ -191,6 +191,7 @@ static const struct option long_options[] = {
{ "dual-tree", no_argument, NULL, 0 },
{ "no-dual-tree", no_argument, NULL, 0 },
{ "cabac-debug-file", required_argument, NULL, 0 },
{ "intra-rough-granularity",required_argument, NULL, 0 },
{0, 0, 0, 0}
};
@ -615,6 +616,13 @@ void print_help(void)
" --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n"
" learning trees, overrides the\n"
" --pu-depth-intra parameter. [disabled]\n"
" --intra-rough-granularity : How many levels are used for the\n"
" logarithmic intra rough search. 0..4\n"
" With 0 all of the modes are checked \n"
" in a single level, 1 checks every second\n"
" mode is checked on first level and then\n"
" second level checks the modes surrounding\n"
" the three best modes. [2]\n"
" --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n"
" on lower depth even when search is not\n"
" performed on said depth. Should only\n"

View file

@ -974,13 +974,14 @@ static double search_cu(
else {
intra_search.pred_cu.intra.mode_chroma = 0;
}
uvg_intra_recon_cu(state,
x, y,
depth, &intra_search,
&intra_search.pred_cu,
lcu,
tree_type, false, true);
if(tree_type != UVG_CHROMA_T) {
if(tree_type != UVG_CHROMA_T && ctrl->cfg.rdo >= 2) {
uvg_intra_recon_cu(state,
x, y,
depth, &intra_search,
&intra_search.pred_cu,
lcu,
tree_type, false, true);
intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu);
}
else {
@ -1201,9 +1202,9 @@ static double search_cu(
// the split costs at least as much as not splitting.
if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) {
if (split_cost < cost) split_cost += search_cu(state, x, y, depth + 1, work_tree, tree_type);
if (split_cost < cost || 1) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type);
if (split_cost < cost || 1) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type);
if (split_cost < cost || 1) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type);
if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type);
if (split_cost < cost) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type);
if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type);
} else {
split_cost = INT_MAX;
}

View file

@ -975,7 +975,7 @@ static int16_t search_intra_rough(
}*/
static double count_bits(
static INLINE double count_bits(
encoder_state_t* const state,
int8_t* intra_preds,
const double not_mrl,
@ -1075,7 +1075,7 @@ static uint8_t search_intra_rough(
FILL(search_proxy, 0);
search_proxy.pred_cu = *pred_cu;
int offset = 4;
int offset = 1 << state->encoder_control->cfg.intra_rough_search_levels;
search_proxy.pred_cu.intra.mode = 0;
uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL, UVG_LUMA_T);
search_proxy.pred_cu.intra.mode = 1;
@ -1123,7 +1123,7 @@ static uint8_t search_intra_rough(
best_six_modes[3].cost = MAX_DOUBLE;
best_six_modes[4].cost = MAX_DOUBLE;
best_six_modes[5].cost = MAX_DOUBLE;
for (int mode = 4; mode <= 66; mode += PARALLEL_BLKS * offset) {
for (int mode = 2 + offset / 2; mode <= 66; mode += PARALLEL_BLKS * offset) {
double costs_out[PARALLEL_BLKS] = { 0 };
for (int i = 0; i < PARALLEL_BLKS; ++i) {

View file

@ -166,10 +166,8 @@ static void uvg_angular_pred_avx2(
// Set ref_main and ref_side such that, when indexed with 0, they point to
// index 0 in block coordinates.
if (sample_disp < 0) {
for (int i = 0; i <= width + 1 + multi_ref_index; i++) {
temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
}
memcpy(&temp_main[width], vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1));
memcpy(&temp_side[width], vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel) * (width + 1 + multi_ref_index + 1));
ref_main = temp_main + width;
ref_side = temp_side + width;
@ -213,19 +211,15 @@ static void uvg_angular_pred_avx2(
//tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset];
}
else {
for (int i = 0; i <= (width << 1) + multi_ref_index; i++) {
temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
}
memcpy(temp_main, vertical_mode ? in_ref_above : in_ref_left, sizeof(uvg_pixel)* (width * 2 + multi_ref_index + 1));
memcpy(temp_side, vertical_mode ? in_ref_left : in_ref_above, sizeof(uvg_pixel)* (width * 2 + multi_ref_index + 1));
const int s = 0;
const int max_index = (multi_ref_index << s) + 2;
const int ref_length = width << 1;
const uvg_pixel val = temp_main[ref_length + multi_ref_index];
for (int j = 0; j <= max_index; j++) {
temp_main[ref_length + multi_ref_index + j] = val;
}
memset(temp_main + ref_length + multi_ref_index, val, max_index + 1);
ref_main = temp_main;
ref_side = temp_side;
@ -245,12 +239,28 @@ static void uvg_angular_pred_avx2(
ref_main += multi_ref_index;
ref_side += multi_ref_index;
static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width];
int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
bool use_cubic = true; // Default to cubic filter
if (dist_from_vert_or_hor > filter_threshold) {
if ((abs(sample_disp) & 0x1F) != 0)
{
use_cubic = false;
}
}
// Cubic must be used if ref line != 0
if (multi_ref_index) {
use_cubic = true;
}
if (sample_disp != 0) {
// The mode is not horizontal or vertical, we have to do interpolation.
int_fast32_t delta_pos = sample_disp * multi_ref_index;
int_fast32_t delta_int[4] = { 0 };
int_fast32_t delta_fract[4] = { 0 };
int64_t delta_int[4] = { 0 };
int16_t delta_fract[4] = { 0 };
for (int_fast32_t y = 0; y + 3 < width; y += 4) {
for (int yy = 0; yy < 4; ++yy) {
@ -263,38 +273,27 @@ static void uvg_angular_pred_avx2(
// Luma Channel
if (channel_type == 0) {
int64_t ref_main_index[4] = { 0 };
int16_t f[4][4] = { { 0 } };
for (int yy = 0; yy < 4; ++yy) {
ref_main_index[yy] = delta_int[yy];
bool use_cubic = true; // Default to cubic filter
static const int uvg_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width];
int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
if (dist_from_vert_or_hor > filter_threshold) {
static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 };
const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode;
const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
if ((abs(sample_disp) & 0x1F) != 0)
{
use_cubic = false;
}
if (use_cubic) {
memcpy(f[0], cubic_filter[delta_fract[0]], 8);
memcpy(f[1], cubic_filter[delta_fract[1]], 8);
memcpy(f[2], cubic_filter[delta_fract[2]], 8);
memcpy(f[3], cubic_filter[delta_fract[3]], 8);
}
else {
for(int yy = 0; yy < 4; ++yy) {
const int16_t offset = (delta_fract[yy] >> 1);
f[yy][0] = 16 - offset;
f[yy][1] = 32 - offset;
f[yy][2] = 16 + offset;
f[yy][3] = offset;
}
// Cubic must be used if ref line != 0
if (multi_ref_index) {
use_cubic = true;
}
const int16_t filter_coeff[4] = { 16 - (delta_fract[yy] >> 1), 32 - (delta_fract[yy] >> 1), 16 + (delta_fract[yy] >> 1), delta_fract[yy] >> 1 };
const int16_t *temp_f = use_cubic ? cubic_filter[delta_fract[yy]] : filter_coeff;
memcpy(f[yy], temp_f, 4 * sizeof(*temp_f));
}
// Do 4-tap intra interpolation filtering
uvg_pixel *p = (uvg_pixel*)ref_main;
__m256i vidx = _mm256_loadu_si256((__m256i *)ref_main_index);
__m256i vidx = _mm256_loadu_si256((__m256i *)delta_int);
__m256i all_weights = _mm256_loadu_si256((__m256i *)f);
__m256i w01 = _mm256_shuffle_epi8(all_weights, w_shuf_01);
__m256i w23 = _mm256_shuffle_epi8(all_weights, w_shuf_23);

View file

@ -541,6 +541,8 @@ typedef struct uvg_config
char* cabac_debug_file_name;
uint8_t dual_tree;
uint8_t intra_rough_search_levels;
} uvg_config;
/**