From 3efaeede763ac5615607628881d3fea8fac050e6 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 19 Aug 2021 17:04:35 +0300 Subject: [PATCH 01/13] [alf] Define the strategy for alf_derive_classification_blk() --- build/kvazaar_lib/kvazaar_lib.vcxproj | 4 + build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 12 + src/Makefile.am | 4 + src/alf.c | 243 +-------------- src/rdo.h | 1 - src/strategies/generic/alf-generic.c | 281 ++++++++++++++++++ src/strategies/generic/alf-generic.h | 31 ++ src/strategies/strategies-alf.c | 41 +++ src/strategies/strategies-alf.h | 56 ++++ src/strategyselector.c | 5 + src/strategyselector.h | 2 + 11 files changed, 437 insertions(+), 243 deletions(-) create mode 100644 src/strategies/generic/alf-generic.c create mode 100644 src/strategies/generic/alf-generic.h create mode 100644 src/strategies/strategies-alf.c create mode 100644 src/strategies/strategies-alf.h diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index dd08a6e4..ef459fe1 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -188,10 +188,12 @@ AdvancedVectorExtensions2 AdvancedVectorExtensions2 + + @@ -263,9 +265,11 @@ + + diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index bc68cd26..bc207ca8 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -254,6 +254,12 @@ Reconstruction + + Optimization\strategies\generic + + + Optimization\strategies + @@ -473,6 +479,12 @@ Reconstruction + + Optimization\strategies\generic + + + Optimization\strategies + diff --git a/src/Makefile.am b/src/Makefile.am index 6a13c407..4cdf6a34 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -117,6 +117,8 @@ libkvazaar_la_SOURCES = \ transform.h \ videoframe.c \ videoframe.h \ + strategies/generic/alf-generic.c \ + strategies/generic/alf-generic.h \ strategies/generic/dct-generic.c \ strategies/generic/dct-generic.h \ strategies/generic/intra-generic.c \ @@ -137,6 +139,8 @@ libkvazaar_la_SOURCES = \ strategies/optimized_sad_func_ptr_t.h \ strategies/generic/sao_shared_generics.h \ strategies/strategies-common.h \ + strategies/strategies-alf.c \ + strategies/strategies-alf.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ strategies/strategies-intra.c \ diff --git a/src/alf.c b/src/alf.c index dec4d22e..9de923b3 100644 --- a/src/alf.c +++ b/src/alf.c @@ -10,7 +10,7 @@ #include "cabac.h" #include "rdo.h" -#include "strategies/strategies-sao.h" +#include "strategies/strategies-alf.h" #include "kvz_math.h" #include "reshape.h" @@ -5852,247 +5852,6 @@ static void alf_reconstruct(encoder_state_t * const state, } } -static void alf_derive_classification_blk(encoder_state_t * const state, - const int shift, - const int n_height, - const int n_width, - const int blk_pos_x, - const int blk_pos_y, - const int blk_dst_x, - const int blk_dst_y, - const int vb_ctu_height, - int vb_pos) -{ - videoframe_t* const frame = state->tile->frame; - //int ***g_laplacian = state->tile->frame->alf_info->g_laplacian; - //alf_classifier **g_classifier = state->tile->frame->alf_info->g_classifier; - //CHECK((vb_ctu_height & (vb_ctu_height - 1)) != 0, "vb_ctu_height must be a power of 2"); - - static const int th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; - int laplacian[NUM_DIRECTIONS][CLASSIFICATION_BLK_SIZE + 5][CLASSIFICATION_BLK_SIZE + 5]; - memset(laplacian, 0, sizeof(laplacian)); - alf_classifier **classifier = 
state->tile->frame->alf_info->classifier; - - const int stride = frame->rec->stride; - kvz_pixel *src = state->tile->frame->rec->y; - const int max_activity = 15; - - int fl = 2; - int fl_p1 = fl + 1; - int fl2 = 2 * fl; - - int main_direction, secondary_direction, dir_temp_hv, dir_temp_d; - int pix_y; - - int height = n_height + fl2; - int width = n_width + fl2; - int pos_x = blk_pos_x; - int pos_y = blk_pos_y; - int start_height = pos_y - fl_p1; - - for (int i = 0; i < height; i += 2) - { - int yoffset = (i + 1 + start_height) * stride - fl_p1; - const kvz_pixel *src0 = &src[yoffset - stride]; - const kvz_pixel *src1 = &src[yoffset]; - const kvz_pixel *src2 = &src[yoffset + stride]; - const kvz_pixel *src3 = &src[yoffset + stride * 2]; - - const int y = blk_dst_y - 2 + i; - if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos - 2) - { - src3 = &src[yoffset + stride]; - } - else if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos) - { - src0 = &src[yoffset]; - } - - int *p_y_ver = laplacian[ALF_VER][i]; - int *p_y_hor = laplacian[ALF_HOR][i]; - int *p_y_dig_0 = laplacian[ALF_DIAG0][i]; - int *p_y_dig_1 = laplacian[ALF_DIAG1][i]; - - for (int j = 0; j < width; j += 2) - { - pix_y = j + 1 + pos_x; - const kvz_pixel *p_y = src1 + pix_y; - const kvz_pixel *p_y_down = src0 + pix_y; - const kvz_pixel *p_y_up = src2 + pix_y; - const kvz_pixel *p_y_up2 = src3 + pix_y; - - const int16_t y0 = p_y[0] << 1; - const int16_t y_up1 = p_y_up[1] << 1; - - p_y_ver[j] = abs(y0 - p_y_down[0] - p_y_up[0]) + abs(y_up1 - p_y[1] - p_y_up2[1]); - p_y_hor[j] = abs(y0 - p_y[1] - p_y[-1]) + abs(y_up1 - p_y_up[2] - p_y_up[0]); - p_y_dig_0[j] = abs(y0 - p_y_down[-1] - p_y_up[1]) + abs(y_up1 - p_y[0] - p_y_up2[2]); - p_y_dig_1[j] = abs(y0 - p_y_up[-1] - p_y_down[1]) + abs(y_up1 - p_y_up2[0] - p_y[2]); - - if (j > 4 && (j - 6) % 4 == 0) - { - int j_m_6 = j - 6; - int j_m_4 = j - 4; - int j_m_2 = j - 2; - - p_y_ver[j_m_6] += p_y_ver[j_m_4] + p_y_ver[j_m_2] + p_y_ver[j]; - p_y_hor[j_m_6] += p_y_hor[j_m_4] + p_y_hor[j_m_2] + p_y_hor[j]; - p_y_dig_0[j_m_6] += p_y_dig_0[j_m_4] + p_y_dig_0[j_m_2] + p_y_dig_0[j]; - p_y_dig_1[j_m_6] += p_y_dig_1[j_m_4] + p_y_dig_1[j_m_2] + p_y_dig_1[j]; - } - } - } - - // classification block size - const int cls_size_y = 4; - const int cls_size_x = 4; - - //for (int i = 0; i < blk.height; i += cls_size_y) - for (int i = 0; i < n_height; i += cls_size_y) - { - int* p_y_ver = laplacian[ALF_VER][i]; - int* p_y_ver2 = laplacian[ALF_VER][i + 2]; - int* p_y_ver4 = laplacian[ALF_VER][i + 4]; - int* p_y_ver6 = laplacian[ALF_VER][i + 6]; - - int* p_y_hor = laplacian[ALF_HOR][i]; - int* p_y_hor2 = laplacian[ALF_HOR][i + 2]; - int* p_y_hor4 = laplacian[ALF_HOR][i + 4]; - int* p_y_hor6 = laplacian[ALF_HOR][i + 6]; - - int* p_y_dig0 = laplacian[ALF_DIAG0][i]; - int* p_y_dig02 = laplacian[ALF_DIAG0][i + 2]; - int* p_y_dig04 = laplacian[ALF_DIAG0][i + 4]; - int* p_y_dig06 = laplacian[ALF_DIAG0][i + 6]; - - int* p_y_dig1 = laplacian[ALF_DIAG1][i]; - int* p_y_dig12 = laplacian[ALF_DIAG1][i + 2]; - int* p_y_dig14 = laplacian[ALF_DIAG1][i + 4]; - int* p_y_dig16 = laplacian[ALF_DIAG1][i + 6]; - - //for (int j = 0; j < blk.width; j += cls_size_x) - for (int j = 0; j < n_width; j += cls_size_x) - { - int sum_v = 0; int sum_h = 0; int sum_d0 = 0; int sum_d1 = 0; - - if (((i + blk_dst_y) % vb_ctu_height) == (vb_pos - 4)) - { - sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j]; - sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j]; - sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j]; - sum_d1 = p_y_dig1[j] + p_y_dig12[j] + 
p_y_dig14[j]; - } - else if (((i + blk_dst_y) % vb_ctu_height) == vb_pos) - { - sum_v = p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; - sum_h = p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; - sum_d0 = p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; - sum_d1 = p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; - } - else - { - sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; - sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; - sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; - sum_d1 = p_y_dig1[j] + p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; - } - - int temp_act = sum_v + sum_h; - int activity = 0; - - const int y = (i + blk_dst_y) & (vb_ctu_height - 1); - if (y == vb_pos - 4 || y == vb_pos) - { - activity = CLIP(0, max_activity, (temp_act * 96) >> shift); - } - else - { - activity = CLIP(0, max_activity, (temp_act * 64) >> shift); - } - - int class_idx = th[activity]; - - int hv1, hv0, d1, d0, hvd1, hvd0; - - if (sum_v > sum_h) - { - hv1 = sum_v; - hv0 = sum_h; - dir_temp_hv = 1; - } - else - { - hv1 = sum_h; - hv0 = sum_v; - dir_temp_hv = 3; - } - if (sum_d0 > sum_d1) - { - d1 = sum_d0; - d0 = sum_d1; - dir_temp_d = 0; - } - else - { - d1 = sum_d1; - d0 = sum_d0; - dir_temp_d = 2; - } - if ((uint32_t)d1 * (uint32_t)hv0 > (uint32_t)hv1 * (uint32_t)d0) - { - hvd1 = d1; - hvd0 = d0; - main_direction = dir_temp_d; - secondary_direction = dir_temp_hv; - } - else - { - hvd1 = hv1; - hvd0 = hv0; - main_direction = dir_temp_hv; - secondary_direction = dir_temp_d; - } - - int direction_strength = 0; - if (hvd1 > 2 * hvd0) - { - direction_strength = 1; - } - if (hvd1 * 2 > 9 * hvd0) - { - direction_strength = 2; - } - - if (direction_strength) - { - class_idx += (((main_direction & 0x1) << 1) + direction_strength) * 5; - } - - static const int transpose_table[8] = { 0, 1, 0, 2, 2, 3, 1, 3 }; - int transpose_idx = transpose_table[main_direction * 2 + (secondary_direction >> 1)]; - - int y_offset = i + blk_dst_y; - int x_offset = j + blk_dst_x; - - alf_classifier *cl0 = classifier[y_offset] + x_offset; - alf_classifier *cl1 = classifier[y_offset + 1] + x_offset; - alf_classifier *cl2 = classifier[y_offset + 2] + x_offset; - alf_classifier *cl3 = classifier[y_offset + 3] + x_offset; - - cl0[0].class_idx = cl0[1].class_idx = cl0[2].class_idx = cl0[3].class_idx = - cl1[0].class_idx = cl1[1].class_idx = cl1[2].class_idx = cl1[3].class_idx = - cl2[0].class_idx = cl2[1].class_idx = cl2[2].class_idx = cl2[3].class_idx = - cl3[0].class_idx = cl3[1].class_idx = cl3[2].class_idx = cl3[3].class_idx = class_idx; - - cl0[0].transpose_idx = cl0[1].transpose_idx = cl0[2].transpose_idx = cl0[3].transpose_idx = - cl1[0].transpose_idx = cl1[1].transpose_idx = cl1[2].transpose_idx = cl1[3].transpose_idx = - cl2[0].transpose_idx = cl2[1].transpose_idx = cl2[2].transpose_idx = cl2[3].transpose_idx = - cl3[0].transpose_idx = cl3[1].transpose_idx = cl3[2].transpose_idx = cl3[3].transpose_idx = transpose_idx; - - } - } -} - static void alf_derive_classification(encoder_state_t * const state, const int width, const int height, diff --git a/src/rdo.h b/src/rdo.h index 70055fee..91db4d34 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -80,7 +80,6 @@ extern const uint32_t kvz_entropy_bits[512]; // Floating point fractional bits, derived from kvz_entropy_bits extern const float kvz_f_entropy_bits[512]; -// ToDo: generate a new table for VVC? 
#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)] #endif diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c new file mode 100644 index 00000000..35d622a7 --- /dev/null +++ b/src/strategies/generic/alf-generic.c @@ -0,0 +1,281 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +#include "strategies/generic/alf-generic.h" + +#include "cu.h" +#include "encoder.h" +#include "encoderstate.h" +#include "kvazaar.h" +#include "alf.h" +#include "strategyselector.h" + + +static void alf_derive_classification_blk_generic(encoder_state_t * const state, + const int shift, + const int n_height, + const int n_width, + const int blk_pos_x, + const int blk_pos_y, + const int blk_dst_x, + const int blk_dst_y, + const int vb_ctu_height, + int vb_pos) +{ + videoframe_t* const frame = state->tile->frame; + //int ***g_laplacian = state->tile->frame->alf_info->g_laplacian; + //alf_classifier **g_classifier = state->tile->frame->alf_info->g_classifier; + //CHECK((vb_ctu_height & (vb_ctu_height - 1)) != 0, "vb_ctu_height must be a power of 2"); + + static const int th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; + int laplacian[NUM_DIRECTIONS][CLASSIFICATION_BLK_SIZE + 5][CLASSIFICATION_BLK_SIZE + 5]; + memset(laplacian, 0, sizeof(laplacian)); + alf_classifier **classifier = state->tile->frame->alf_info->classifier; + + const int stride = frame->rec->stride; + kvz_pixel *src = state->tile->frame->rec->y; + const int max_activity = 15; + + int fl = 2; + int fl_p1 = fl + 1; + int fl2 = 2 * fl; + + int main_direction, secondary_direction, dir_temp_hv, dir_temp_d; + int pix_y; + + int height = n_height + fl2; + int width = n_width + fl2; + int pos_x = blk_pos_x; + int pos_y = blk_pos_y; + int start_height = pos_y - fl_p1; + + for (int i = 0; i < height; i += 2) + { + int yoffset = (i + 1 + start_height) * stride - fl_p1; + const kvz_pixel *src0 = &src[yoffset - stride]; + const kvz_pixel *src1 = &src[yoffset]; + const kvz_pixel *src2 = &src[yoffset + stride]; + const kvz_pixel *src3 = &src[yoffset + stride * 2]; + + const int y = blk_dst_y - 2 + i; + if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos - 2) + { + src3 = &src[yoffset + stride]; + } + else if (y > 0 && (y & (vb_ctu_height - 1)) == vb_pos) + { + src0 = &src[yoffset]; + } + + int *p_y_ver = laplacian[ALF_VER][i]; + int *p_y_hor = laplacian[ALF_HOR][i]; + int *p_y_dig_0 = laplacian[ALF_DIAG0][i]; + int *p_y_dig_1 = laplacian[ALF_DIAG1][i]; + + for (int j = 0; j < width; j += 2) + { + pix_y = j + 1 + pos_x; + const kvz_pixel *p_y = src1 + pix_y; + const kvz_pixel *p_y_down = src0 + pix_y; + const 
kvz_pixel *p_y_up = src2 + pix_y; + const kvz_pixel *p_y_up2 = src3 + pix_y; + + const int16_t y0 = p_y[0] << 1; + const int16_t y_up1 = p_y_up[1] << 1; + + p_y_ver[j] = abs(y0 - p_y_down[0] - p_y_up[0]) + abs(y_up1 - p_y[1] - p_y_up2[1]); + p_y_hor[j] = abs(y0 - p_y[1] - p_y[-1]) + abs(y_up1 - p_y_up[2] - p_y_up[0]); + p_y_dig_0[j] = abs(y0 - p_y_down[-1] - p_y_up[1]) + abs(y_up1 - p_y[0] - p_y_up2[2]); + p_y_dig_1[j] = abs(y0 - p_y_up[-1] - p_y_down[1]) + abs(y_up1 - p_y_up2[0] - p_y[2]); + + if (j > 4 && (j - 6) % 4 == 0) + { + int j_m_6 = j - 6; + int j_m_4 = j - 4; + int j_m_2 = j - 2; + + p_y_ver[j_m_6] += p_y_ver[j_m_4] + p_y_ver[j_m_2] + p_y_ver[j]; + p_y_hor[j_m_6] += p_y_hor[j_m_4] + p_y_hor[j_m_2] + p_y_hor[j]; + p_y_dig_0[j_m_6] += p_y_dig_0[j_m_4] + p_y_dig_0[j_m_2] + p_y_dig_0[j]; + p_y_dig_1[j_m_6] += p_y_dig_1[j_m_4] + p_y_dig_1[j_m_2] + p_y_dig_1[j]; + } + } + } + + // classification block size + const int cls_size_y = 4; + const int cls_size_x = 4; + + //for (int i = 0; i < blk.height; i += cls_size_y) + for (int i = 0; i < n_height; i += cls_size_y) + { + int* p_y_ver = laplacian[ALF_VER][i]; + int* p_y_ver2 = laplacian[ALF_VER][i + 2]; + int* p_y_ver4 = laplacian[ALF_VER][i + 4]; + int* p_y_ver6 = laplacian[ALF_VER][i + 6]; + + int* p_y_hor = laplacian[ALF_HOR][i]; + int* p_y_hor2 = laplacian[ALF_HOR][i + 2]; + int* p_y_hor4 = laplacian[ALF_HOR][i + 4]; + int* p_y_hor6 = laplacian[ALF_HOR][i + 6]; + + int* p_y_dig0 = laplacian[ALF_DIAG0][i]; + int* p_y_dig02 = laplacian[ALF_DIAG0][i + 2]; + int* p_y_dig04 = laplacian[ALF_DIAG0][i + 4]; + int* p_y_dig06 = laplacian[ALF_DIAG0][i + 6]; + + int* p_y_dig1 = laplacian[ALF_DIAG1][i]; + int* p_y_dig12 = laplacian[ALF_DIAG1][i + 2]; + int* p_y_dig14 = laplacian[ALF_DIAG1][i + 4]; + int* p_y_dig16 = laplacian[ALF_DIAG1][i + 6]; + + //for (int j = 0; j < blk.width; j += cls_size_x) + for (int j = 0; j < n_width; j += cls_size_x) + { + int sum_v = 0; int sum_h = 0; int sum_d0 = 0; int sum_d1 = 0; + + if (((i + blk_dst_y) % vb_ctu_height) == (vb_pos - 4)) + { + sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j]; + sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j]; + sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j]; + sum_d1 = p_y_dig1[j] + p_y_dig12[j] + p_y_dig14[j]; + } + else if (((i + blk_dst_y) % vb_ctu_height) == vb_pos) + { + sum_v = p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; + sum_h = p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; + sum_d0 = p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; + sum_d1 = p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; + } + else + { + sum_v = p_y_ver[j] + p_y_ver2[j] + p_y_ver4[j] + p_y_ver6[j]; + sum_h = p_y_hor[j] + p_y_hor2[j] + p_y_hor4[j] + p_y_hor6[j]; + sum_d0 = p_y_dig0[j] + p_y_dig02[j] + p_y_dig04[j] + p_y_dig06[j]; + sum_d1 = p_y_dig1[j] + p_y_dig12[j] + p_y_dig14[j] + p_y_dig16[j]; + } + + int temp_act = sum_v + sum_h; + int activity = 0; + + const int y = (i + blk_dst_y) & (vb_ctu_height - 1); + if (y == vb_pos - 4 || y == vb_pos) + { + activity = CLIP(0, max_activity, (temp_act * 96) >> shift); + } + else + { + activity = CLIP(0, max_activity, (temp_act * 64) >> shift); + } + + int class_idx = th[activity]; + + int hv1, hv0, d1, d0, hvd1, hvd0; + + if (sum_v > sum_h) + { + hv1 = sum_v; + hv0 = sum_h; + dir_temp_hv = 1; + } + else + { + hv1 = sum_h; + hv0 = sum_v; + dir_temp_hv = 3; + } + if (sum_d0 > sum_d1) + { + d1 = sum_d0; + d0 = sum_d1; + dir_temp_d = 0; + } + else + { + d1 = sum_d1; + d0 = sum_d0; + dir_temp_d = 2; + } + if ((uint32_t)d1 * (uint32_t)hv0 > (uint32_t)hv1 * (uint32_t)d0) + { + hvd1 = 
d1; + hvd0 = d0; + main_direction = dir_temp_d; + secondary_direction = dir_temp_hv; + } + else + { + hvd1 = hv1; + hvd0 = hv0; + main_direction = dir_temp_hv; + secondary_direction = dir_temp_d; + } + + int direction_strength = 0; + if (hvd1 > 2 * hvd0) + { + direction_strength = 1; + } + if (hvd1 * 2 > 9 * hvd0) + { + direction_strength = 2; + } + + if (direction_strength) + { + class_idx += (((main_direction & 0x1) << 1) + direction_strength) * 5; + } + + static const int transpose_table[8] = { 0, 1, 0, 2, 2, 3, 1, 3 }; + int transpose_idx = transpose_table[main_direction * 2 + (secondary_direction >> 1)]; + + int y_offset = i + blk_dst_y; + int x_offset = j + blk_dst_x; + + alf_classifier *cl0 = classifier[y_offset] + x_offset; + alf_classifier *cl1 = classifier[y_offset + 1] + x_offset; + alf_classifier *cl2 = classifier[y_offset + 2] + x_offset; + alf_classifier *cl3 = classifier[y_offset + 3] + x_offset; + + cl0[0].class_idx = cl0[1].class_idx = cl0[2].class_idx = cl0[3].class_idx = + cl1[0].class_idx = cl1[1].class_idx = cl1[2].class_idx = cl1[3].class_idx = + cl2[0].class_idx = cl2[1].class_idx = cl2[2].class_idx = cl2[3].class_idx = + cl3[0].class_idx = cl3[1].class_idx = cl3[2].class_idx = cl3[3].class_idx = class_idx; + + cl0[0].transpose_idx = cl0[1].transpose_idx = cl0[2].transpose_idx = cl0[3].transpose_idx = + cl1[0].transpose_idx = cl1[1].transpose_idx = cl1[2].transpose_idx = cl1[3].transpose_idx = + cl2[0].transpose_idx = cl2[1].transpose_idx = cl2[2].transpose_idx = cl2[3].transpose_idx = + cl3[0].transpose_idx = cl3[1].transpose_idx = cl3[2].transpose_idx = cl3[3].transpose_idx = transpose_idx; + + } + } +} + + + +int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); + + return success; +} diff --git a/src/strategies/generic/alf-generic.h b/src/strategies/generic/alf-generic.h new file mode 100644 index 00000000..edbb9d94 --- /dev/null +++ b/src/strategies/generic/alf-generic.h @@ -0,0 +1,31 @@ +#pragma once +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. 
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth);
+
diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c
new file mode 100644
index 00000000..d6da9332
--- /dev/null
+++ b/src/strategies/strategies-alf.c
@@ -0,0 +1,41 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "strategies/strategies-alf.h"
+//#include "strategies/avx2/alf-avx2.h"
+#include "strategies/generic/alf-generic.h"
+#include "strategyselector.h"
+
+
+// Define function pointers.
+alf_derive_classification_blk_func* alf_derive_classification_blk;
+
+
+int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
+  bool success = true;
+
+  success &= kvz_strategy_register_alf_generic(opaque, bitdepth);
+
+  if (kvz_g_hardware_flags.intel_flags.avx2) {
+    //success &= kvz_strategy_register_alf_avx2(opaque, bitdepth);
+  }
+
+  return success;
+}
\ No newline at end of file
diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h
new file mode 100644
index 00000000..563697c4
--- /dev/null
+++ b/src/strategies/strategies-alf.h
@@ -0,0 +1,56 @@
+#pragma once
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Interface for alf functions.
+ */
+
+#include "encoder.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "alf.h"
+
+
+// Declare function pointers.
+typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state,
+                                                  const int shift,
+                                                  const int n_height,
+                                                  const int n_width,
+                                                  const int blk_pos_x,
+                                                  const int blk_pos_y,
+                                                  const int blk_dst_x,
+                                                  const int blk_dst_y,
+                                                  const int vb_ctu_height,
+                                                  int vb_pos);
+
+// Declare function pointers.
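+// NOTE: this pointer is filled in at runtime by kvz_strategyselector_init(),
+// which assigns the highest-priority implementation registered under the
+// name "alf_derive_classification_blk" that the host CPU supports (the
+// generic C version registers with priority 0 as the fallback).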
+extern alf_derive_classification_blk_func * alf_derive_classification_blk;
+
+int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
+
+
+#define STRATEGIES_ALF_EXPORTS \
+  {"alf_derive_classification_blk", (void**) &alf_derive_classification_blk}, \
+
+
diff --git a/src/strategyselector.c b/src/strategyselector.c
index fc97635b..80281940 100644
--- a/src/strategyselector.c
+++ b/src/strategyselector.c
@@ -90,6 +90,11 @@ int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth) {
     fprintf(stderr, "kvz_strategy_register_encode failed!\n");
     return 0;
   }
+
+  if (!kvz_strategy_register_alf(&strategies, bitdepth)) {
+    fprintf(stderr, "kvz_strategy_register_alf failed!\n");
+    return 0;
+  }
 
   while(cur_strategy_to_select->fptr) {
     *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
diff --git a/src/strategyselector.h b/src/strategyselector.h
index 575accc3..c4820153 100644
--- a/src/strategyselector.h
+++ b/src/strategyselector.h
@@ -96,6 +96,7 @@ int kvz_strategyselector_register(void *opaque, const char *type, const char *st
 #include "strategies/strategies-intra.h"
 #include "strategies/strategies-sao.h"
 #include "strategies/strategies-encode.h"
+#include "strategies/strategies-alf.h"
 
 static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_NAL_EXPORTS
@@ -106,6 +107,7 @@ static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_INTRA_EXPORTS
   STRATEGIES_SAO_EXPORTS
   STRATEGIES_ENCODE_EXPORTS
+  STRATEGIES_ALF_EXPORTS
   { NULL, NULL },
 };
 

From b158d05bcad1d5c004294fbdb3eef8efaa8547a2 Mon Sep 17 00:00:00 2001
From: Marko Viitanen
Date: Thu, 19 Aug 2021 17:19:17 +0300
Subject: [PATCH 02/13] [alf] rename strategy function to include prefix

---
 src/alf.c                       | 2 +-
 src/strategies/strategies-alf.c | 2 +-
 src/strategies/strategies-alf.h | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/alf.c b/src/alf.c
index 9de923b3..8b2d876b 100644
--- a/src/alf.c
+++ b/src/alf.c
@@ -5899,7 +5899,7 @@ static void alf_derive_classification(encoder_state_t * const state,
     {
       int n_width = MIN(j + CLASSIFICATION_BLK_SIZE, max_width) - j;
 
-      alf_derive_classification_blk(state, state->encoder_control->cfg.input_bitdepth + 4, n_height, n_width, j, i,
+      kvz_alf_derive_classification_blk(state, state->encoder_control->cfg.input_bitdepth + 4, n_height, n_width, j, i,
         j - x_pos + blk_dst_x, i - y_pos + blk_dst_y, alf_vb_luma_ctu_height,
         alf_vb_luma_pos);
 
diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c
index d6da9332..6bb7645e 100644
--- a/src/strategies/strategies-alf.c
+++ b/src/strategies/strategies-alf.c
@@ -25,7 +25,7 @@
 
 
 // Define function pointers.
-alf_derive_classification_blk_func* alf_derive_classification_blk;
+alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
 
 
 int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h
index 563697c4..8c9c2924 100644
--- a/src/strategies/strategies-alf.h
+++ b/src/strategies/strategies-alf.h
@@ -45,12 +45,12 @@ typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state,
                                                   int vb_pos);
 
 // Declare function pointers.
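+// Prefixed with kvz_ because the pointer is a globally visible symbol, like
+// kvazaar's other exported names; the registration string
+// "alf_derive_classification_blk" used by the strategyselector is unchanged.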
-extern alf_derive_classification_blk_func * alf_derive_classification_blk; +extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); #define STRATEGIES_ALF_EXPORTS \ - {"alf_derive_classification_blk", (void**) &alf_derive_classification_blk}, \ + {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ From c3c96d69c2f3293dfeed41c97390325758aa5eb7 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 20 Aug 2021 11:45:02 +0300 Subject: [PATCH 03/13] [alf] Add modified alf_derive_classification_blk_sse41() from VTM 13.0 * Modified to work with bitdepth 8 --- build/kvazaar_lib/kvazaar_lib.vcxproj | 4 +- build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 6 + src/Makefile.am | 8 +- src/strategies/sse41/alf-sse41.c | 356 ++++++++++++++++++ src/strategies/sse41/alf-sse41.h | 32 ++ src/strategies/strategies-alf.c | 6 +- 6 files changed, 405 insertions(+), 7 deletions(-) create mode 100644 src/strategies/sse41/alf-sse41.c create mode 100644 src/strategies/sse41/alf-sse41.h diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index ef459fe1..2ac2e406 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -193,6 +193,7 @@ + @@ -269,6 +270,7 @@ + @@ -343,4 +345,4 @@ - + \ No newline at end of file diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index bc207ca8..87f212e0 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -260,6 +260,9 @@ Optimization\strategies + + Optimization\strategies\sse41 + @@ -485,6 +488,9 @@ Optimization\strategies + + Optimization\strategies\sse41 + diff --git a/src/Makefile.am b/src/Makefile.am index 4cdf6a34..af098c39 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -117,7 +117,7 @@ libkvazaar_la_SOURCES = \ transform.h \ videoframe.c \ videoframe.h \ - strategies/generic/alf-generic.c \ + strategies/generic/alf-generic.c \ strategies/generic/alf-generic.h \ strategies/generic/dct-generic.c \ strategies/generic/dct-generic.h \ @@ -139,7 +139,7 @@ libkvazaar_la_SOURCES = \ strategies/optimized_sad_func_ptr_t.h \ strategies/generic/sao_shared_generics.h \ strategies/strategies-common.h \ - strategies/strategies-alf.c \ + strategies/strategies-alf.c \ strategies/strategies-alf.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ @@ -207,7 +207,9 @@ libsse2_la_SOURCES = \ libsse41_la_SOURCES = \ strategies/sse41/picture-sse41.c \ strategies/sse41/picture-sse41.h \ - strategies/sse41/reg_sad_pow2_widths-sse41.h + strategies/sse41/reg_sad_pow2_widths-sse41.h \ + strategies/sse41/alf-sse41.c \ + strategies/sse41/alf-sse41.h if HAVE_PPC diff --git a/src/strategies/sse41/alf-sse41.c b/src/strategies/sse41/alf-sse41.c new file mode 100644 index 00000000..2d7ded14 --- /dev/null +++ b/src/strategies/sse41/alf-sse41.c @@ -0,0 +1,356 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. 
+ * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +#include "global.h" + +#if COMPILE_INTEL_SSE41 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 +#include "strategies/sse41/alf-sse41.h" + +#include +#include + +#include "strategyselector.h" + +static void alf_derive_classification_blk_sse41(encoder_state_t * const state, + const int shift, + const int n_height, + const int n_width, + const int blk_pos_x, + const int blk_pos_y, + const int blk_dst_x, + const int blk_dst_y, + const int vb_ctu_height, + int vb_pos) +{ + videoframe_t* const frame = state->tile->frame; + const size_t imgStride = frame->rec->stride; + const kvz_pixel * srcExt = state->tile->frame->rec->y; + + const int imgHExtended = n_height + 4; + const int imgWExtended = n_width + 4; + + const int posX = blk_pos_x; + const int posY = blk_pos_y; + + alf_classifier** classifier = state->tile->frame->alf_info->classifier; + + // 18x40 array + uint16_t colSums[(CLASSIFICATION_BLK_SIZE + 4) >> 1] + [CLASSIFICATION_BLK_SIZE + 8]; + + for (int i = 0; i < imgHExtended; i += 2) + { + const size_t offset = (i + posY - 3) * imgStride + posX - 3; + + const kvz_pixel*imgY0 = &srcExt[offset]; + const kvz_pixel*imgY1 = &srcExt[offset + imgStride]; + const kvz_pixel*imgY2 = &srcExt[offset + imgStride * 2]; + const kvz_pixel*imgY3 = &srcExt[offset + imgStride * 3]; + + // pixel padding for gradient calculation + int pos = blk_dst_y - 2 + i; + int posInCTU = pos & (vb_ctu_height - 1); + if (pos > 0 && posInCTU == vb_pos - 2) + { + imgY3 = imgY2; + } + else if (pos > 0 && posInCTU == vb_pos) + { + imgY0 = imgY1; + } + + __m128i prev = _mm_setzero_si128(); + + for (int j = 0; j < imgWExtended; j += 8) + { + const __m128i x00 = _mm_loadu_si128((const __m128i*) (imgY0 + j)); + const __m128i x01 = _mm_loadu_si128((const __m128i*) (imgY1 + j)); + const __m128i x02 = _mm_loadu_si128((const __m128i*) (imgY2 + j)); + const __m128i x03 = _mm_loadu_si128((const __m128i*) (imgY3 + j)); + + const __m128i x04 = _mm_loadu_si128((const __m128i*) (imgY0 + j + 2)); + const __m128i x05 = _mm_loadu_si128((const __m128i*) (imgY1 + j + 2)); + const __m128i x06 = _mm_loadu_si128((const __m128i*) (imgY2 + j + 2)); + const __m128i x07 = _mm_loadu_si128((const __m128i*) (imgY3 + j + 2)); + + const __m128i x0 = _mm_unpacklo_epi8(x00, _mm_setzero_si128()); + const __m128i x1 = _mm_unpacklo_epi8(x01, _mm_setzero_si128()); + const __m128i x2 = _mm_unpacklo_epi8(x02, _mm_setzero_si128()); + const __m128i x3 = _mm_unpacklo_epi8(x03, _mm_setzero_si128()); + + const __m128i x4 = _mm_unpacklo_epi8(x04, _mm_setzero_si128()); + const __m128i x5 = _mm_unpacklo_epi8(x05, _mm_setzero_si128()); + const __m128i x6 = _mm_unpacklo_epi8(x06, _mm_setzero_si128()); + const __m128i x7 = _mm_unpacklo_epi8(x07, _mm_setzero_si128()); + + const __m128i nw = _mm_blend_epi16(x0, x1, 0xaa); + const __m128i n = _mm_blend_epi16(x0, x5, 0x55); + const __m128i ne = _mm_blend_epi16(x4, x5, 0xaa); + const __m128i w = _mm_blend_epi16(x1, x2, 0xaa); + const __m128i e = _mm_blend_epi16(x5, x6, 0xaa); + const __m128i sw = _mm_blend_epi16(x2, x3, 0xaa); + const __m128i s = _mm_blend_epi16(x2, x7, 0x55); + const 
__m128i se = _mm_blend_epi16(x6, x7, 0xaa); + + __m128i c = _mm_blend_epi16(x1, x6, 0x55); + c = _mm_add_epi16(c, c); + __m128i d = _mm_shuffle_epi8(c, _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13)); + + const __m128i ver = _mm_abs_epi16(_mm_sub_epi16(c, _mm_add_epi16(n, s))); + const __m128i hor = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(w, e))); + const __m128i di0 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(nw, se))); + const __m128i di1 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(ne, sw))); + + const __m128i hv = _mm_hadd_epi16(ver, hor); + const __m128i di = _mm_hadd_epi16(di0, di1); + const __m128i all = _mm_hadd_epi16(hv, di); + + const __m128i t = _mm_blend_epi16(all, prev, 0xaa); + _mm_storeu_si128((__m128i*) & colSums[i >> 1][j], _mm_hadd_epi16(t, all)); + prev = all; + + if (j + 8 < imgWExtended) + { + j += 8; + + const __m128i x0 = _mm_unpackhi_epi8(x00, _mm_setzero_si128()); + const __m128i x1 = _mm_unpackhi_epi8(x01, _mm_setzero_si128()); + const __m128i x2 = _mm_unpackhi_epi8(x02, _mm_setzero_si128()); + const __m128i x3 = _mm_unpackhi_epi8(x03, _mm_setzero_si128()); + + const __m128i x4 = _mm_unpackhi_epi8(x04, _mm_setzero_si128()); + const __m128i x5 = _mm_unpackhi_epi8(x05, _mm_setzero_si128()); + const __m128i x6 = _mm_unpackhi_epi8(x06, _mm_setzero_si128()); + const __m128i x7 = _mm_unpackhi_epi8(x07, _mm_setzero_si128()); + + const __m128i nw = _mm_blend_epi16(x0, x1, 0xaa); + const __m128i n = _mm_blend_epi16(x0, x5, 0x55); + const __m128i ne = _mm_blend_epi16(x4, x5, 0xaa); + const __m128i w = _mm_blend_epi16(x1, x2, 0xaa); + const __m128i e = _mm_blend_epi16(x5, x6, 0xaa); + const __m128i sw = _mm_blend_epi16(x2, x3, 0xaa); + const __m128i s = _mm_blend_epi16(x2, x7, 0x55); + const __m128i se = _mm_blend_epi16(x6, x7, 0xaa); + + __m128i c = _mm_blend_epi16(x1, x6, 0x55); + c = _mm_add_epi16(c, c); + __m128i d = _mm_shuffle_epi8(c, _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13)); + + const __m128i ver = _mm_abs_epi16(_mm_sub_epi16(c, _mm_add_epi16(n, s))); + const __m128i hor = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(w, e))); + const __m128i di0 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(nw, se))); + const __m128i di1 = _mm_abs_epi16(_mm_sub_epi16(d, _mm_add_epi16(ne, sw))); + + const __m128i hv = _mm_hadd_epi16(ver, hor); + const __m128i di = _mm_hadd_epi16(di0, di1); + const __m128i all = _mm_hadd_epi16(hv, di); + + const __m128i t = _mm_blend_epi16(all, prev, 0xaa); + _mm_storeu_si128((__m128i*) & colSums[i >> 1][j], _mm_hadd_epi16(t, all)); + prev = all; + } + } + } + + for (int i = 0; i < (n_height >> 1); i += 4) + { + for (int j = 0; j < n_width; j += 8) + { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + const uint32_t z = (2 * i + blk_pos_y) & (vb_ctu_height - 1); + const uint32_t z2 = (2 * i + 4 + blk_pos_y) & (vb_ctu_height - 1); + + x0 = (z == vb_pos) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 0][j + 4]); + x1 = _mm_loadu_si128((__m128i *) &colSums[i + 1][j + 4]); + x2 = _mm_loadu_si128((__m128i *) &colSums[i + 2][j + 4]); + x3 = (z == vb_pos - 4) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 3][j + 4]); + + x4 = (z2 == vb_pos) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 2][j + 4]); + x5 = _mm_loadu_si128((__m128i *) &colSums[i + 3][j + 4]); + x6 = _mm_loadu_si128((__m128i *) &colSums[i + 4][j + 4]); + x7 = (z2 == vb_pos - 4) ? 
_mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 5][j + 4]); + + __m128i x0l = _mm_cvtepu16_epi32(x0); + __m128i x0h = _mm_unpackhi_epi16(x0, _mm_setzero_si128()); + __m128i x1l = _mm_cvtepu16_epi32(x1); + __m128i x1h = _mm_unpackhi_epi16(x1, _mm_setzero_si128()); + __m128i x2l = _mm_cvtepu16_epi32(x2); + __m128i x2h = _mm_unpackhi_epi16(x2, _mm_setzero_si128()); + __m128i x3l = _mm_cvtepu16_epi32(x3); + __m128i x3h = _mm_unpackhi_epi16(x3, _mm_setzero_si128()); + __m128i x4l = _mm_cvtepu16_epi32(x4); + __m128i x4h = _mm_unpackhi_epi16(x4, _mm_setzero_si128()); + __m128i x5l = _mm_cvtepu16_epi32(x5); + __m128i x5h = _mm_unpackhi_epi16(x5, _mm_setzero_si128()); + __m128i x6l = _mm_cvtepu16_epi32(x6); + __m128i x6h = _mm_unpackhi_epi16(x6, _mm_setzero_si128()); + __m128i x7l = _mm_cvtepu16_epi32(x7); + __m128i x7h = _mm_unpackhi_epi16(x7, _mm_setzero_si128()); + + x0l = _mm_add_epi32(x0l, x1l); + x2l = _mm_add_epi32(x2l, x3l); + x4l = _mm_add_epi32(x4l, x5l); + x6l = _mm_add_epi32(x6l, x7l); + x0h = _mm_add_epi32(x0h, x1h); + x2h = _mm_add_epi32(x2h, x3h); + x4h = _mm_add_epi32(x4h, x5h); + x6h = _mm_add_epi32(x6h, x7h); + + x0l = _mm_add_epi32(x0l, x2l); + x4l = _mm_add_epi32(x4l, x6l); + x0h = _mm_add_epi32(x0h, x2h); + x4h = _mm_add_epi32(x4h, x6h); + + x2l = _mm_unpacklo_epi32(x0l, x4l); + x2h = _mm_unpackhi_epi32(x0l, x4l); + x6l = _mm_unpacklo_epi32(x0h, x4h); + x6h = _mm_unpackhi_epi32(x0h, x4h); + + __m128i sumV = _mm_unpacklo_epi32(x2l, x6l); + __m128i sumH = _mm_unpackhi_epi32(x2l, x6l); + __m128i sumD0 = _mm_unpacklo_epi32(x2h, x6h); + __m128i sumD1 = _mm_unpackhi_epi32(x2h, x6h); + + // uint32_t tempAct = sumV + sumH; + __m128i tempAct = _mm_add_epi32(sumV, sumH); + + // const uint32_t activity = std::min(15, tempAct * scale >> shift); + // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; + // uint8_t classIdx = th[activity]; + const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64; + const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 
96 : 64; + __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2))); + activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift)); + activity = _mm_min_epi32(activity, _mm_set1_epi32(15)); + __m128i classIdx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); + + // if (sumV > sumH) + // { + // hv1 = sumV; + // hv0 = sumH; + // dirTempHV = 0; + // } + // else + // { + // hv1 = sumH; + // hv0 = sumV; + // dirTempHV = 1; + // } + __m128i dirTempHVMinus1 = _mm_cmpgt_epi32(sumV, sumH); + __m128i hv1 = _mm_max_epi32(sumV, sumH); + __m128i hv0 = _mm_min_epi32(sumV, sumH); + + // if (sumD0 > sumD1) + // { + // d1 = sumD0; + // d0 = sumD1; + // dirTempD = 0; + // } + // else + // { + // d1 = sumD1; + // d0 = sumD0; + // dirTempD = 1; + // } + __m128i dirTempDMinus1 = _mm_cmpgt_epi32(sumD0, sumD1); + __m128i d1 = _mm_max_epi32(sumD0, sumD1); + __m128i d0 = _mm_min_epi32(sumD0, sumD1); + + // int dirIdx; + // if (d1 * hv0 > hv1 * d0) + // { + // hvd1 = d1; + // hvd0 = d0; + // dirIdx = 0; + // } + // else + // { + // hvd1 = hv1; + // hvd0 = hv0; + // dirIdx = 2; + // } + __m128i a = _mm_xor_si128(_mm_mullo_epi32(d1, hv0), _mm_set1_epi32(0x80000000)); + __m128i b = _mm_xor_si128(_mm_mullo_epi32(hv1, d0), _mm_set1_epi32(0x80000000)); + __m128i dirIdx = _mm_cmpgt_epi32(a, b); + __m128i hvd1 = _mm_blendv_epi8(hv1, d1, dirIdx); + __m128i hvd0 = _mm_blendv_epi8(hv0, d0, dirIdx); + + // if (hvd1 * 2 > 9 * hvd0) + // { + // classIdx += (dirIdx + 2) * 5; + // } + // else if (hvd1 > 2 * hvd0) + // { + // classIdx += (dirIdx + 1) * 5; + // } + __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0)); + __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3))); + __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5)); + classIdx = _mm_add_epi32(classIdx, offset); + classIdx = _mm_add_epi32(classIdx, _mm_and_si128(strength2, _mm_set1_epi32(5))); + offset = _mm_andnot_si128(dirIdx, offset); + offset = _mm_add_epi32(offset, offset); + classIdx = _mm_add_epi32(classIdx, offset); + + // uint8_t transposeIdx = 2 * dirTempD + dirTempHV; + __m128i transposeIdx = _mm_set1_epi32(3); + transposeIdx = _mm_add_epi32(transposeIdx, dirTempHVMinus1); + transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + + int yOffset = 2 * i + blk_pos_y; + int xOffset = j + blk_pos_x; + + static_assert(sizeof(alf_classifier) == 2, "ALFClassifier type must be 16 bits wide"); + __m128i v; + v = _mm_unpacklo_epi8(classIdx, transposeIdx); + v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); + _mm_storeu_si128((__m128i *) (classifier[yOffset] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 1] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 2] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 3] + xOffset), v); + v = _mm_unpackhi_epi8(classIdx, transposeIdx); + v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 4] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 5] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 6] + xOffset), v); + _mm_storeu_si128((__m128i *) (classifier[yOffset + 7] + xOffset), v); + } + } +} + +#endif // KVZ_BIT_DEPTH == 8 +#endif //COMPILE_INTEL_SSE41 + + +int 
kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) { + bool success = true; +#if COMPILE_INTEL_SSE41 +#if KVZ_BIT_DEPTH == 8 + if (bitdepth == 8){ + success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41); + } +#endif // KVZ_BIT_DEPTH == 8 +#endif + return success; +} diff --git a/src/strategies/sse41/alf-sse41.h b/src/strategies/sse41/alf-sse41.h new file mode 100644 index 00000000..d41556e4 --- /dev/null +++ b/src/strategies/sse41/alf-sse41.h @@ -0,0 +1,32 @@ +#pragma once +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for SSE4.1. + */ + +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" + +int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth); + diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index 6bb7645e..537dadbe 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -19,7 +19,7 @@ ****************************************************************************/ #include "strategies/strategies-alf.h" -//#include "strategies/avx2/alf-avx2.h" +#include "strategies/sse41/alf-sse41.h" #include "strategies/generic/alf-generic.h" #include "strategyselector.h" @@ -33,8 +33,8 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { success &= kvz_strategy_register_alf_generic(opaque, bitdepth); - if (kvz_g_hardware_flags.intel_flags.avx2) { - //success &= kvz_strategy_register_alf_avx2(opaque, bitdepth); + if (kvz_g_hardware_flags.intel_flags.sse41) { + success &= kvz_strategy_register_alf_sse41(opaque, bitdepth); } return success; From dc6a29b0d8f1280dc9212da2ccf3b206cc1c631c Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 10:50:00 +0300 Subject: [PATCH 04/13] [alf] Initial generic strategies for 5x5 and 7x7 filtering --- src/alf.c | 414 +----------------------- src/strategies/generic/alf-generic.c | 455 +++++++++++++++++++++++++++ src/strategies/strategies-alf.c | 3 +- src/strategies/strategies-alf.h | 40 ++- 4 files changed, 500 insertions(+), 412 deletions(-) diff --git a/src/alf.c b/src/alf.c index 8b2d876b..d591aaed 100644 --- a/src/alf.c +++ b/src/alf.c @@ -5339,412 +5339,6 @@ static void alf_encoder_ctb(encoder_state_t * const state, } } -static void alf_filter_block(encoder_state_t * const state, - const kvz_pixel *src_pixels, - kvz_pixel *dst_pixels, - const int src_stride, - const int dst_stride, - const short* filter_set, - const int16_t *fClipSet, - clp_rng clp_rng, - alf_component_id component_id, - const int width, - const int height, - int 
x_pos, - int y_pos, - int blk_dst_x, - int blk_dst_y, - int vb_pos, - const int vb_ctu_height) -{ - alf_filter_type const filter_type = component_id == COMPONENT_Y ? ALF_FILTER_7X7 : ALF_FILTER_5X5; - const bool chroma = component_id == COMPONENT_Y ? 0 : 1; - const int8_t bit_depth = state->encoder_control->bitdepth; - - if (chroma) - { - assert((int)filter_type == 0); //Chroma needs to have filtType == 0 - } - - const int start_height = y_pos; - const int end_height = start_height + height; - const int start_width = x_pos; - const int end_width = start_width + width; - - const kvz_pixel *src = src_pixels; - kvz_pixel *dst = dst_pixels + blk_dst_y * dst_stride; - - const kvz_pixel *p_img_y_pad_0, *p_img_y_pad_1, *p_img_y_pad_2, *p_img_y_pad_3, *p_img_y_pad_4, *p_img_y_pad_5, *p_img_y_pad_6; - const kvz_pixel *p_img_0, *p_img_1, *p_img_2, *p_img_3, *p_img_4, *p_img_5, *p_img_6; - - const short *coef = filter_set; - const int16_t *clip = fClipSet; - - const int shift = bit_depth - 1; - - const int offset = 1 << (shift - 1); - - int transpose_idx = 0; - const int cls_size_y = 4; - const int cls_size_x = 4; - - assert((start_height % cls_size_y) == 0); //Wrong startHeight in filtering - assert((start_width % cls_size_x) == 0); //Wrong startWidth in filtering - assert(((end_height - start_height) % cls_size_y) == 0); //Wrong endHeight in filtering - assert(((end_width - start_width) % cls_size_x) == 0); //Wrong endWidth in filtering - - alf_classifier *p_class = NULL; - - int dst_stride2 = dst_stride * cls_size_y; - int src_stride2 = src_stride * cls_size_y; - - //std::vector filter_coeff(MAX_NUM_ALF_LUMA_COEFF); - int filter_coeff[MAX_NUM_ALF_LUMA_COEFF]; - memset(filter_coeff, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); - //std::array filterClipp; - int filter_clipp[MAX_NUM_ALF_LUMA_COEFF]; - memset(filter_clipp, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); - - p_img_y_pad_0 = src + start_height * src_stride + start_width; - p_img_y_pad_1 = p_img_y_pad_0 + src_stride; - p_img_y_pad_2 = p_img_y_pad_0 - src_stride; - p_img_y_pad_3 = p_img_y_pad_1 + src_stride; - p_img_y_pad_4 = p_img_y_pad_2 - src_stride; - p_img_y_pad_5 = p_img_y_pad_3 + src_stride; - p_img_y_pad_6 = p_img_y_pad_4 - src_stride; - - kvz_pixel* p_rec_0 = dst + blk_dst_x;//start_width; - kvz_pixel* p_rec_1 = p_rec_0 + dst_stride; - - for (int i = 0; i < end_height - start_height; i += cls_size_y) - { - if (!chroma) - { - p_class = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; - } - - for (int j = 0; j < end_width - start_width; j += cls_size_x) - { - if (!chroma) - { - alf_classifier cl = p_class[j]; - transpose_idx = cl.transpose_idx; - coef = filter_set + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; - clip = fClipSet + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; - } - - if (filter_type == ALF_FILTER_7X7) - { - if (transpose_idx == 1) - { - filter_coeff[0] = coef[9]; - filter_coeff[1] = coef[4]; - filter_coeff[2] = coef[10]; - filter_coeff[3] = coef[8]; - filter_coeff[4] = coef[1]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[11]; - filter_coeff[7] = coef[7]; - filter_coeff[8] = coef[3]; - filter_coeff[9] = coef[0]; - filter_coeff[10] = coef[2]; - filter_coeff[11] = coef[6]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[9]; - filter_clipp[1] = clip[4]; - filter_clipp[2] = clip[10]; - filter_clipp[3] = clip[8]; - filter_clipp[4] = clip[1]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[11]; - filter_clipp[7] = clip[7]; - filter_clipp[8] = clip[3]; - filter_clipp[9] = clip[0]; - filter_clipp[10] 
= clip[2]; - filter_clipp[11] = clip[6]; - filter_clipp[12] = clip[12]; - } - else if (transpose_idx == 2) - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[3]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[1]; - filter_coeff[4] = coef[8]; - filter_coeff[5] = coef[7]; - filter_coeff[6] = coef[6]; - filter_coeff[7] = coef[5]; - filter_coeff[8] = coef[4]; - filter_coeff[9] = coef[9]; - filter_coeff[10] = coef[10]; - filter_coeff[11] = coef[11]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[3]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[1]; - filter_clipp[4] = clip[8]; - filter_clipp[5] = clip[7]; - filter_clipp[6] = clip[6]; - filter_clipp[7] = clip[5]; - filter_clipp[8] = clip[4]; - filter_clipp[9] = clip[9]; - filter_clipp[10] = clip[10]; - filter_clipp[11] = clip[11]; - filter_clipp[12] = clip[12]; - - } - else if (transpose_idx == 3) - { - filter_coeff[0] = coef[9]; - filter_coeff[1] = coef[8]; - filter_coeff[2] = coef[10]; - filter_coeff[3] = coef[4]; - filter_coeff[4] = coef[3]; - filter_coeff[5] = coef[7]; - filter_coeff[6] = coef[11]; - filter_coeff[7] = coef[5]; - filter_coeff[8] = coef[1]; - filter_coeff[9] = coef[0]; - filter_coeff[10] = coef[2]; - filter_coeff[11] = coef[6]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[9]; - filter_clipp[1] = clip[8]; - filter_clipp[2] = clip[10]; - filter_clipp[3] = clip[4]; - filter_clipp[4] = clip[3]; - filter_clipp[5] = clip[7]; - filter_clipp[6] = clip[11]; - filter_clipp[7] = clip[5]; - filter_clipp[8] = clip[1]; - filter_clipp[9] = clip[0]; - filter_clipp[10] = clip[2]; - filter_clipp[11] = clip[6]; - filter_clipp[12] = clip[12]; - } - else - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[1]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[3]; - filter_coeff[4] = coef[4]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[6]; - filter_coeff[7] = coef[7]; - filter_coeff[8] = coef[8]; - filter_coeff[9] = coef[9]; - filter_coeff[10] = coef[10]; - filter_coeff[11] = coef[11]; - filter_coeff[12] = coef[12]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[1]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[3]; - filter_clipp[4] = clip[4]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[6]; - filter_clipp[7] = clip[7]; - filter_clipp[8] = clip[8]; - filter_clipp[9] = clip[9]; - filter_clipp[10] = clip[10]; - filter_clipp[11] = clip[11]; - filter_clipp[12] = clip[12]; - } - } - else - { - if (transpose_idx == 1) - { - filter_coeff[0] = coef[4]; - filter_coeff[1] = coef[1]; - filter_coeff[2] = coef[5]; - filter_coeff[3] = coef[3]; - filter_coeff[4] = coef[0]; - filter_coeff[5] = coef[2]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[4]; - filter_clipp[1] = clip[1]; - filter_clipp[2] = clip[5]; - filter_clipp[3] = clip[3]; - filter_clipp[4] = clip[0]; - filter_clipp[5] = clip[2]; - filter_clipp[6] = clip[6]; - - } - else if (transpose_idx == 2) - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[3]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[1]; - filter_coeff[4] = coef[4]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[3]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[1]; - filter_clipp[4] = clip[4]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[6]; - - } - else if (transpose_idx == 3) - { - filter_coeff[0] = coef[4]; - filter_coeff[1] = coef[3]; - filter_coeff[2] = coef[5]; - filter_coeff[3] = coef[1]; - filter_coeff[4] = 
coef[0]; - filter_coeff[5] = coef[2]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[4]; - filter_clipp[1] = clip[3]; - filter_clipp[2] = clip[5]; - filter_clipp[3] = clip[1]; - filter_clipp[4] = clip[0]; - filter_clipp[5] = clip[2]; - filter_clipp[6] = clip[6]; - - } - else - { - filter_coeff[0] = coef[0]; - filter_coeff[1] = coef[1]; - filter_coeff[2] = coef[2]; - filter_coeff[3] = coef[3]; - filter_coeff[4] = coef[4]; - filter_coeff[5] = coef[5]; - filter_coeff[6] = coef[6]; - - filter_clipp[0] = clip[0]; - filter_clipp[1] = clip[1]; - filter_clipp[2] = clip[2]; - filter_clipp[3] = clip[3]; - filter_clipp[4] = clip[4]; - filter_clipp[5] = clip[5]; - filter_clipp[6] = clip[6]; - - } - } - - for (int ii = 0; ii < cls_size_y; ii++) - { - p_img_0 = p_img_y_pad_0 + j + ii * src_stride; - p_img_1 = p_img_y_pad_1 + j + ii * src_stride; - p_img_2 = p_img_y_pad_2 + j + ii * src_stride; - p_img_3 = p_img_y_pad_3 + j + ii * src_stride; - p_img_4 = p_img_y_pad_4 + j + ii * src_stride; - p_img_5 = p_img_y_pad_5 + j + ii * src_stride; - p_img_6 = p_img_y_pad_6 + j + ii * src_stride; - - p_rec_1 = p_rec_0 + j + ii * dst_stride; - - const int y_vb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); - - if (y_vb < vb_pos && (y_vb >= vb_pos - (chroma ? 2 : 4))) // above - { - p_img_1 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_1; - p_img_3 = (y_vb >= vb_pos - 2) ? p_img_1 : p_img_3; - p_img_5 = (y_vb >= vb_pos - 3) ? p_img_3 : p_img_5; - - p_img_2 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_2; - p_img_4 = (y_vb >= vb_pos - 2) ? p_img_2 : p_img_4; - p_img_6 = (y_vb >= vb_pos - 3) ? p_img_4 : p_img_6; - } - - else if (y_vb >= vb_pos && (y_vb <= vb_pos + (chroma ? 1 : 3))) // bottom - { - p_img_2 = (y_vb == vb_pos) ? p_img_0 : p_img_2; - p_img_4 = (y_vb <= vb_pos + 1) ? p_img_2 : p_img_4; - p_img_6 = (y_vb <= vb_pos + 2) ? p_img_4 : p_img_6; - - p_img_1 = (y_vb == vb_pos) ? p_img_0 : p_img_1; - p_img_3 = (y_vb <= vb_pos + 1) ? p_img_1 : p_img_3; - p_img_5 = (y_vb <= vb_pos + 2) ? 
p_img_3 : p_img_5; - } - - bool is_near_vb_above = y_vb < vb_pos && (y_vb >= vb_pos - 1); - bool is_near_vb_below = y_vb >= vb_pos && (y_vb <= vb_pos); - for (int jj = 0; jj < cls_size_x; jj++) - { - int sum = 0; - const kvz_pixel curr = p_img_0[+0]; - - if (filter_type == ALF_FILTER_7X7) - { - sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_5[+0], p_img_6[+0])); - - sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_3[+1], p_img_4[-1])); - sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_3[+0], p_img_4[+0])); - sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_3[-1], p_img_4[+1])); - - sum += filter_coeff[4] * (clip_alf(filter_clipp[4], curr, p_img_1[+2], p_img_2[-2])); - sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_1[+1], p_img_2[-1])); - sum += filter_coeff[6] * (clip_alf(filter_clipp[6], curr, p_img_1[+0], p_img_2[+0])); - sum += filter_coeff[7] * (clip_alf(filter_clipp[7], curr, p_img_1[-1], p_img_2[+1])); - sum += filter_coeff[8] * (clip_alf(filter_clipp[8], curr, p_img_1[-2], p_img_2[+2])); - - sum += filter_coeff[9] * (clip_alf(filter_clipp[9], curr, p_img_0[+3], p_img_0[-3])); - sum += filter_coeff[10] * (clip_alf(filter_clipp[10], curr, p_img_0[+2], p_img_0[-2])); - sum += filter_coeff[11] * (clip_alf(filter_clipp[11], curr, p_img_0[+1], p_img_0[-1])); - } - else - { - sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_3[+0], p_img_4[+0])); - - sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_1[+1], p_img_2[-1])); - sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_1[+0], p_img_2[+0])); - sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_1[-1], p_img_2[+1])); - - sum += filter_coeff[4] * (clip_alf(filter_clipp[4], curr, p_img_0[+2], p_img_0[-2])); - sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_0[+1], p_img_0[-1])); - } - - if (!(is_near_vb_above || is_near_vb_below)) - { - sum = (sum + offset) >> shift; - } - else - { - sum = (sum + (1 << ((shift + 3) - 1))) >> (shift + 3); - } - sum += curr; - - p_rec_1[jj] = kvz_fast_clip_32bit_to_pixel(sum); - - p_img_0++; - p_img_1++; - p_img_2++; - p_img_3++; - p_img_4++; - p_img_5++; - p_img_6++; - } - } - } - - p_rec_0 += dst_stride2; - p_rec_1 += dst_stride2; - - p_img_y_pad_0 += src_stride2; - p_img_y_pad_1 += src_stride2; - p_img_y_pad_2 += src_stride2; - p_img_y_pad_3 += src_stride2; - p_img_y_pad_4 += src_stride2; - p_img_y_pad_5 += src_stride2; - p_img_y_pad_6 += src_stride2; - } -} static void alf_reconstruct(encoder_state_t * const state, array_variables *arr_vars) @@ -5819,10 +5413,10 @@ static void alf_reconstruct(encoder_state_t * const state, coeff = arr_vars->fixed_filter_set_coeff_dec[filter_set_index]; clip = arr_vars->clip_default; } - alf_filter_block(state, + kvz_alf_filter_7x7_blk(state, alf_info->alf_tmp_y, state->tile->frame->rec->y, luma_stride, luma_stride, - coeff, clip, arr_vars->clp_rngs.comp[COMPONENT_Y], COMPONENT_Y, + coeff, clip, arr_vars->clp_rngs.comp[COMPONENT_Y], width, height, x_pos, y_pos, x_pos, y_pos, alf_vb_luma_pos, alf_vb_luma_ctu_height); } @@ -5836,10 +5430,10 @@ static void alf_reconstruct(encoder_state_t * const state, const kvz_pixel *src_pixels = comp_id - 1 ? 
alf_info->alf_tmp_v : alf_info->alf_tmp_u; const int alt_num = alf_info->ctu_alternative[comp_id][ctu_idx]; - alf_filter_block(state, + kvz_alf_filter_5x5_blk(state, src_pixels, dst_pixels, chroma_stride, chroma_stride, - arr_vars->chroma_coeff_final[alt_num], arr_vars->chroma_clipp_final[alt_num], arr_vars->clp_rngs.comp[comp_idx], comp_idx, + arr_vars->chroma_coeff_final[alt_num], arr_vars->chroma_clipp_final[alt_num], arr_vars->clp_rngs.comp[comp_idx], width >> chroma_scale_x, height >> chroma_scale_y, x_pos >> chroma_scale_x, y_pos >> chroma_scale_y, x_pos >> chroma_scale_x, y_pos >> chroma_scale_y, diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index 35d622a7..d9aa3b95 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -27,6 +27,12 @@ #include "alf.h" #include "strategyselector.h" +extern kvz_pixel kvz_fast_clip_32bit_to_pixel(int32_t value); + +static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1) +{ + return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref); +} static void alf_derive_classification_blk_generic(encoder_state_t * const state, const int shift, @@ -269,13 +275,462 @@ static void alf_derive_classification_blk_generic(encoder_state_t * const state, } } +static void alf_filter_block_generic(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + alf_component_id component_id, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + alf_filter_type const filter_type = component_id == COMPONENT_Y ? ALF_FILTER_7X7 : ALF_FILTER_5X5; + const bool chroma = component_id == COMPONENT_Y ? 
0 : 1; + const int8_t bit_depth = state->encoder_control->bitdepth; + if (chroma) + { + assert((int)filter_type == 0); //Chroma needs to have filtType == 0 + } + + const int start_height = y_pos; + const int end_height = start_height + height; + const int start_width = x_pos; + const int end_width = start_width + width; + + const kvz_pixel* src = src_pixels; + kvz_pixel* dst = dst_pixels + blk_dst_y * dst_stride; + + const kvz_pixel* p_img_y_pad_0, * p_img_y_pad_1, * p_img_y_pad_2, * p_img_y_pad_3, * p_img_y_pad_4, * p_img_y_pad_5, * p_img_y_pad_6; + const kvz_pixel* p_img_0, * p_img_1, * p_img_2, * p_img_3, * p_img_4, * p_img_5, * p_img_6; + + const short* coef = filter_set; + const int16_t* clip = fClipSet; + + const int shift = bit_depth - 1; + + const int offset = 1 << (shift - 1); + + int transpose_idx = 0; + const int cls_size_y = 4; + const int cls_size_x = 4; + + assert((start_height % cls_size_y) == 0); //Wrong startHeight in filtering + assert((start_width % cls_size_x) == 0); //Wrong startWidth in filtering + assert(((end_height - start_height) % cls_size_y) == 0); //Wrong endHeight in filtering + assert(((end_width - start_width) % cls_size_x) == 0); //Wrong endWidth in filtering + + alf_classifier* p_class = NULL; + + int dst_stride2 = dst_stride * cls_size_y; + int src_stride2 = src_stride * cls_size_y; + + //std::vector filter_coeff(MAX_NUM_ALF_LUMA_COEFF); + int filter_coeff[MAX_NUM_ALF_LUMA_COEFF]; + memset(filter_coeff, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); + //std::array filterClipp; + int filter_clipp[MAX_NUM_ALF_LUMA_COEFF]; + memset(filter_clipp, 0, MAX_NUM_ALF_LUMA_COEFF * sizeof(int)); + + p_img_y_pad_0 = src + start_height * src_stride + start_width; + p_img_y_pad_1 = p_img_y_pad_0 + src_stride; + p_img_y_pad_2 = p_img_y_pad_0 - src_stride; + p_img_y_pad_3 = p_img_y_pad_1 + src_stride; + p_img_y_pad_4 = p_img_y_pad_2 - src_stride; + p_img_y_pad_5 = p_img_y_pad_3 + src_stride; + p_img_y_pad_6 = p_img_y_pad_4 - src_stride; + + kvz_pixel* p_rec_0 = dst + blk_dst_x;//start_width; + kvz_pixel* p_rec_1 = p_rec_0 + dst_stride; + + for (int i = 0; i < end_height - start_height; i += cls_size_y) + { + if (!chroma) + { + p_class = state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; + } + + for (int j = 0; j < end_width - start_width; j += cls_size_x) + { + if (!chroma) + { + alf_classifier cl = p_class[j]; + transpose_idx = cl.transpose_idx; + coef = filter_set + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; + clip = fClipSet + cl.class_idx * MAX_NUM_ALF_LUMA_COEFF; + } + + if (filter_type == ALF_FILTER_7X7) + { + if (transpose_idx == 1) + { + filter_coeff[0] = coef[9]; + filter_coeff[1] = coef[4]; + filter_coeff[2] = coef[10]; + filter_coeff[3] = coef[8]; + filter_coeff[4] = coef[1]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[11]; + filter_coeff[7] = coef[7]; + filter_coeff[8] = coef[3]; + filter_coeff[9] = coef[0]; + filter_coeff[10] = coef[2]; + filter_coeff[11] = coef[6]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[9]; + filter_clipp[1] = clip[4]; + filter_clipp[2] = clip[10]; + filter_clipp[3] = clip[8]; + filter_clipp[4] = clip[1]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[11]; + filter_clipp[7] = clip[7]; + filter_clipp[8] = clip[3]; + filter_clipp[9] = clip[0]; + filter_clipp[10] = clip[2]; + filter_clipp[11] = clip[6]; + filter_clipp[12] = clip[12]; + } + else if (transpose_idx == 2) + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[3]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[1]; + 
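+          /* Editor's note: each transpose_idx case in this function applies a
+           * fixed permutation to the diamond-shaped filter taps (13 for the
+           * luma 7x7 filter, 7 for the chroma 5x5 filter). transpose_idx comes
+           * from the block classification; cases 1-3 select transposed and/or
+           * mirrored orientations of the diamond, so a single stored filter
+           * serves all four gradient orientations. Only the tap order changes;
+           * the centre tap always stays in place. */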
filter_coeff[4] = coef[8]; + filter_coeff[5] = coef[7]; + filter_coeff[6] = coef[6]; + filter_coeff[7] = coef[5]; + filter_coeff[8] = coef[4]; + filter_coeff[9] = coef[9]; + filter_coeff[10] = coef[10]; + filter_coeff[11] = coef[11]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[3]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[1]; + filter_clipp[4] = clip[8]; + filter_clipp[5] = clip[7]; + filter_clipp[6] = clip[6]; + filter_clipp[7] = clip[5]; + filter_clipp[8] = clip[4]; + filter_clipp[9] = clip[9]; + filter_clipp[10] = clip[10]; + filter_clipp[11] = clip[11]; + filter_clipp[12] = clip[12]; + + } + else if (transpose_idx == 3) + { + filter_coeff[0] = coef[9]; + filter_coeff[1] = coef[8]; + filter_coeff[2] = coef[10]; + filter_coeff[3] = coef[4]; + filter_coeff[4] = coef[3]; + filter_coeff[5] = coef[7]; + filter_coeff[6] = coef[11]; + filter_coeff[7] = coef[5]; + filter_coeff[8] = coef[1]; + filter_coeff[9] = coef[0]; + filter_coeff[10] = coef[2]; + filter_coeff[11] = coef[6]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[9]; + filter_clipp[1] = clip[8]; + filter_clipp[2] = clip[10]; + filter_clipp[3] = clip[4]; + filter_clipp[4] = clip[3]; + filter_clipp[5] = clip[7]; + filter_clipp[6] = clip[11]; + filter_clipp[7] = clip[5]; + filter_clipp[8] = clip[1]; + filter_clipp[9] = clip[0]; + filter_clipp[10] = clip[2]; + filter_clipp[11] = clip[6]; + filter_clipp[12] = clip[12]; + } + else + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[1]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[3]; + filter_coeff[4] = coef[4]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[6]; + filter_coeff[7] = coef[7]; + filter_coeff[8] = coef[8]; + filter_coeff[9] = coef[9]; + filter_coeff[10] = coef[10]; + filter_coeff[11] = coef[11]; + filter_coeff[12] = coef[12]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[1]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[3]; + filter_clipp[4] = clip[4]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[6]; + filter_clipp[7] = clip[7]; + filter_clipp[8] = clip[8]; + filter_clipp[9] = clip[9]; + filter_clipp[10] = clip[10]; + filter_clipp[11] = clip[11]; + filter_clipp[12] = clip[12]; + } + } + else + { + if (transpose_idx == 1) + { + filter_coeff[0] = coef[4]; + filter_coeff[1] = coef[1]; + filter_coeff[2] = coef[5]; + filter_coeff[3] = coef[3]; + filter_coeff[4] = coef[0]; + filter_coeff[5] = coef[2]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[4]; + filter_clipp[1] = clip[1]; + filter_clipp[2] = clip[5]; + filter_clipp[3] = clip[3]; + filter_clipp[4] = clip[0]; + filter_clipp[5] = clip[2]; + filter_clipp[6] = clip[6]; + + } + else if (transpose_idx == 2) + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[3]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[1]; + filter_coeff[4] = coef[4]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[3]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[1]; + filter_clipp[4] = clip[4]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[6]; + + } + else if (transpose_idx == 3) + { + filter_coeff[0] = coef[4]; + filter_coeff[1] = coef[3]; + filter_coeff[2] = coef[5]; + filter_coeff[3] = coef[1]; + filter_coeff[4] = coef[0]; + filter_coeff[5] = coef[2]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[4]; + filter_clipp[1] = clip[3]; + filter_clipp[2] = clip[5]; + filter_clipp[3] = clip[1]; + filter_clipp[4] = clip[0]; + 
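+          /* Editor's sketch, not part of the original patch: the eight
+           * if/else branches in this function could be collapsed into one
+           * table-driven copy. The permutations below are transcribed from
+           * the assignments in this function:
+           *
+           *   static const int8_t perm7[4][13] = {
+           *     { 0, 1,  2, 3, 4, 5,  6, 7, 8, 9, 10, 11, 12 }, // transpose_idx 0
+           *     { 9, 4, 10, 8, 1, 5, 11, 7, 3, 0,  2,  6, 12 }, // transpose_idx 1
+           *     { 0, 3,  2, 1, 8, 7,  6, 5, 4, 9, 10, 11, 12 }, // transpose_idx 2
+           *     { 9, 8, 10, 4, 3, 7, 11, 5, 1, 0,  2,  6, 12 }, // transpose_idx 3
+           *   };
+           *   static const int8_t perm5[4][7] = {
+           *     { 0, 1, 2, 3, 4, 5, 6 },
+           *     { 4, 1, 5, 3, 0, 2, 6 },
+           *     { 0, 3, 2, 1, 4, 5, 6 },
+           *     { 4, 3, 5, 1, 0, 2, 6 },
+           *   };
+           *   const int8_t *perm = chroma ? perm5[transpose_idx]
+           *                               : perm7[transpose_idx];
+           *   for (int k = 0; k < (chroma ? 7 : 13); k++) {
+           *     filter_coeff[k] = coef[perm[k]];  // coefficient and clip value
+           *     filter_clipp[k] = clip[perm[k]];  // move with the same permutation
+           *   }
+           */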
filter_clipp[5] = clip[2]; + filter_clipp[6] = clip[6]; + + } + else + { + filter_coeff[0] = coef[0]; + filter_coeff[1] = coef[1]; + filter_coeff[2] = coef[2]; + filter_coeff[3] = coef[3]; + filter_coeff[4] = coef[4]; + filter_coeff[5] = coef[5]; + filter_coeff[6] = coef[6]; + + filter_clipp[0] = clip[0]; + filter_clipp[1] = clip[1]; + filter_clipp[2] = clip[2]; + filter_clipp[3] = clip[3]; + filter_clipp[4] = clip[4]; + filter_clipp[5] = clip[5]; + filter_clipp[6] = clip[6]; + + } + } + + for (int ii = 0; ii < cls_size_y; ii++) + { + p_img_0 = p_img_y_pad_0 + j + ii * src_stride; + p_img_1 = p_img_y_pad_1 + j + ii * src_stride; + p_img_2 = p_img_y_pad_2 + j + ii * src_stride; + p_img_3 = p_img_y_pad_3 + j + ii * src_stride; + p_img_4 = p_img_y_pad_4 + j + ii * src_stride; + p_img_5 = p_img_y_pad_5 + j + ii * src_stride; + p_img_6 = p_img_y_pad_6 + j + ii * src_stride; + + p_rec_1 = p_rec_0 + j + ii * dst_stride; + + const int y_vb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + + if (y_vb < vb_pos && (y_vb >= vb_pos - (chroma ? 2 : 4))) // above + { + p_img_1 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_1; + p_img_3 = (y_vb >= vb_pos - 2) ? p_img_1 : p_img_3; + p_img_5 = (y_vb >= vb_pos - 3) ? p_img_3 : p_img_5; + + p_img_2 = (y_vb == vb_pos - 1) ? p_img_0 : p_img_2; + p_img_4 = (y_vb >= vb_pos - 2) ? p_img_2 : p_img_4; + p_img_6 = (y_vb >= vb_pos - 3) ? p_img_4 : p_img_6; + } + + else if (y_vb >= vb_pos && (y_vb <= vb_pos + (chroma ? 1 : 3))) // bottom + { + p_img_2 = (y_vb == vb_pos) ? p_img_0 : p_img_2; + p_img_4 = (y_vb <= vb_pos + 1) ? p_img_2 : p_img_4; + p_img_6 = (y_vb <= vb_pos + 2) ? p_img_4 : p_img_6; + + p_img_1 = (y_vb == vb_pos) ? p_img_0 : p_img_1; + p_img_3 = (y_vb <= vb_pos + 1) ? p_img_1 : p_img_3; + p_img_5 = (y_vb <= vb_pos + 2) ? p_img_3 : p_img_5; + } + + bool is_near_vb_above = y_vb < vb_pos && (y_vb >= vb_pos - 1); + bool is_near_vb_below = y_vb >= vb_pos && (y_vb <= vb_pos); + for (int jj = 0; jj < cls_size_x; jj++) + { + int sum = 0; + const kvz_pixel curr = p_img_0[+0]; + + if (filter_type == ALF_FILTER_7X7) + { + sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_5[+0], p_img_6[+0])); + + sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_3[+1], p_img_4[-1])); + sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_3[+0], p_img_4[+0])); + sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_3[-1], p_img_4[+1])); + + sum += filter_coeff[4] * (clip_alf(filter_clipp[4], curr, p_img_1[+2], p_img_2[-2])); + sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_1[+1], p_img_2[-1])); + sum += filter_coeff[6] * (clip_alf(filter_clipp[6], curr, p_img_1[+0], p_img_2[+0])); + sum += filter_coeff[7] * (clip_alf(filter_clipp[7], curr, p_img_1[-1], p_img_2[+1])); + sum += filter_coeff[8] * (clip_alf(filter_clipp[8], curr, p_img_1[-2], p_img_2[+2])); + + sum += filter_coeff[9] * (clip_alf(filter_clipp[9], curr, p_img_0[+3], p_img_0[-3])); + sum += filter_coeff[10] * (clip_alf(filter_clipp[10], curr, p_img_0[+2], p_img_0[-2])); + sum += filter_coeff[11] * (clip_alf(filter_clipp[11], curr, p_img_0[+1], p_img_0[-1])); + } + else + { + sum += filter_coeff[0] * (clip_alf(filter_clipp[0], curr, p_img_3[+0], p_img_4[+0])); + + sum += filter_coeff[1] * (clip_alf(filter_clipp[1], curr, p_img_1[+1], p_img_2[-1])); + sum += filter_coeff[2] * (clip_alf(filter_clipp[2], curr, p_img_1[+0], p_img_2[+0])); + sum += filter_coeff[3] * (clip_alf(filter_clipp[3], curr, p_img_1[-1], p_img_2[+1])); + + sum += filter_coeff[4] * 
(clip_alf(filter_clipp[4], curr, p_img_0[+2], p_img_0[-2])); + sum += filter_coeff[5] * (clip_alf(filter_clipp[5], curr, p_img_0[+1], p_img_0[-1])); + } + + if (!(is_near_vb_above || is_near_vb_below)) + { + sum = (sum + offset) >> shift; + } + else + { + sum = (sum + (1 << ((shift + 3) - 1))) >> (shift + 3); + } + sum += curr; + + p_rec_1[jj] = kvz_fast_clip_32bit_to_pixel(sum); + + p_img_0++; + p_img_1++; + p_img_2++; + p_img_3++; + p_img_4++; + p_img_5++; + p_img_6++; + } + } + } + + p_rec_0 += dst_stride2; + p_rec_1 += dst_stride2; + + p_img_y_pad_0 += src_stride2; + p_img_y_pad_1 += src_stride2; + p_img_y_pad_2 += src_stride2; + p_img_y_pad_3 += src_stride2; + p_img_y_pad_4 += src_stride2; + p_img_y_pad_5 += src_stride2; + p_img_y_pad_6 += src_stride2; + } +} + + +static void alf_filter_5x5_block_generic(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, + filter_set, fClipSet, clp_rng, COMPONENT_Cb, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); +} + +static void alf_filter_7x7_block_generic(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); +} int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) { bool success = true; success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); + success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic); + success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic); return success; } diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index 537dadbe..d5d99936 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -26,7 +26,8 @@ // Define function pointers. alf_derive_classification_blk_func* kvz_alf_derive_classification_blk; - +alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; +alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { bool success = true; diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h index 8c9c2924..8ad15384 100644 --- a/src/strategies/strategies-alf.h +++ b/src/strategies/strategies-alf.h @@ -22,7 +22,7 @@ /** * \ingroup Optimization * \file - * Interface for sao functions. + * Interface for alf functions. 
*/ #include "encoder.h" @@ -44,13 +44,51 @@ typedef void (alf_derive_classification_blk_func)(encoder_state_t * const state, const int vb_ctu_height, int vb_pos); +typedef void (alf_filter_5x5_blk_func)(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height); + +typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height); + // Declare function pointers. extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; +extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; +extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); #define STRATEGIES_ALF_EXPORTS \ {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ + {"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \ + {"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \ From f61b9138cd4d9acf65f2b4f104dd6d6c6dee1635 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 11:36:04 +0300 Subject: [PATCH 05/13] [alf] Import SSE4.1 optimized 5x5 and 7x7 filters from VTM13 * Modified to work with 8-bit pixels --- src/strategies/sse41/alf-sse41.c | 452 +++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 23 deletions(-) diff --git a/src/strategies/sse41/alf-sse41.c b/src/strategies/sse41/alf-sse41.c index 2d7ded14..a803e4cc 100644 --- a/src/strategies/sse41/alf-sse41.c +++ b/src/strategies/sse41/alf-sse41.c @@ -236,13 +236,13 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state, // const uint32_t activity = std::min(15, tempAct * scale >> shift); // static const uint8_t th[16] = { 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; - // uint8_t classIdx = th[activity]; + // uint8_t class_idx = th[activity]; const uint32_t scale = (z == vb_pos - 4 || z == vb_pos) ? 96 : 64; const uint32_t scale2 = (z2 == vb_pos - 4 || z2 == vb_pos) ? 
96 : 64; __m128i activity = _mm_mullo_epi32(tempAct, _mm_unpacklo_epi64(_mm_set1_epi32(scale), _mm_set1_epi32(scale2))); activity = _mm_srl_epi32(activity, _mm_cvtsi32_si128(shift)); activity = _mm_min_epi32(activity, _mm_set1_epi32(15)); - __m128i classIdx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); + __m128i class_idx = _mm_shuffle_epi8(_mm_setr_epi8(0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4), activity); // if (sumV > sumH) // { @@ -297,48 +297,452 @@ static void alf_derive_classification_blk_sse41(encoder_state_t * const state, // if (hvd1 * 2 > 9 * hvd0) // { - // classIdx += (dirIdx + 2) * 5; + // class_idx += (dirIdx + 2) * 5; // } // else if (hvd1 > 2 * hvd0) // { - // classIdx += (dirIdx + 1) * 5; + // class_idx += (dirIdx + 1) * 5; // } __m128i strength1 = _mm_cmpgt_epi32(hvd1, _mm_add_epi32(hvd0, hvd0)); __m128i strength2 = _mm_cmpgt_epi32(_mm_add_epi32(hvd1, hvd1), _mm_add_epi32(hvd0, _mm_slli_epi32(hvd0, 3))); __m128i offset = _mm_and_si128(strength1, _mm_set1_epi32(5)); - classIdx = _mm_add_epi32(classIdx, offset); - classIdx = _mm_add_epi32(classIdx, _mm_and_si128(strength2, _mm_set1_epi32(5))); + class_idx = _mm_add_epi32(class_idx, offset); + class_idx = _mm_add_epi32(class_idx, _mm_and_si128(strength2, _mm_set1_epi32(5))); offset = _mm_andnot_si128(dirIdx, offset); offset = _mm_add_epi32(offset, offset); - classIdx = _mm_add_epi32(classIdx, offset); + class_idx = _mm_add_epi32(class_idx, offset); - // uint8_t transposeIdx = 2 * dirTempD + dirTempHV; - __m128i transposeIdx = _mm_set1_epi32(3); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempHVMinus1); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); - transposeIdx = _mm_add_epi32(transposeIdx, dirTempDMinus1); + // uint8_t transpose_idx = 2 * dirTempD + dirTempHV; + __m128i transpose_idx = _mm_set1_epi32(3); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempHVMinus1); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1); + transpose_idx = _mm_add_epi32(transpose_idx, dirTempDMinus1); int yOffset = 2 * i + blk_pos_y; int xOffset = j + blk_pos_x; - static_assert(sizeof(alf_classifier) == 2, "ALFClassifier type must be 16 bits wide"); + static_assert(sizeof(alf_classifier) == 2, "alf_classifier type must be 16 bits wide"); __m128i v; - v = _mm_unpacklo_epi8(classIdx, transposeIdx); + v = _mm_unpacklo_epi8(class_idx, transpose_idx); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); - _mm_storeu_si128((__m128i *) (classifier[yOffset] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 1] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 2] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 3] + xOffset), v); - v = _mm_unpackhi_epi8(classIdx, transposeIdx); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 1] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 2] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 3] + xOffset), v); + v = _mm_unpackhi_epi8(class_idx, transpose_idx); v = _mm_shuffle_epi8(v, _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9)); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 4] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 5] + xOffset), v); - 
_mm_storeu_si128((__m128i *) (classifier[yOffset + 6] + xOffset), v); - _mm_storeu_si128((__m128i *) (classifier[yOffset + 7] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 4] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 5] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 6] + xOffset), v); + _mm_storeu_si128((__m128i *) (state->tile->frame->alf_info->classifier[yOffset + 7] + xOffset), v); } } } + +INLINE static void process2coeffs_5x5(__m128i params[2][3], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) { + const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur); + const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur); + const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur); + const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur); + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01A); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01A); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01A); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01A); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + __m128i coeff01A = params[0][i]; + + *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A)); + *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01A)); +}; + + +static void alf_filter_5x5_block_sse41(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + + + assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2"); + + alf_component_id compId = COMPONENT_Cb; + + const size_t srcStride = src_stride; + const size_t dstStride = dst_stride; + + const int SHIFT = state->encoder_control->bitdepth - 1; + const int ROUND = 1 << (SHIFT - 1); + const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND); + + const size_t STEP_X = 8; + const size_t STEP_Y = 4; + + assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering"); + assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering"); + assert(height % STEP_Y == 0 && "Wrong endHeight in filtering"); + assert(width % 4 == 0 && "Wrong endWidth in filtering"); + + const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos; + kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x; + + + + const __m128i mmOffset = _mm_set1_epi32(ROUND); + const __m128i mmMin = 
_mm_set1_epi16(clp_rng.min); + const __m128i mmMax = _mm_set1_epi16(clp_rng.max); + + __m128i params[2][3]; + __m128i fs = _mm_loadu_si128((__m128i*) filter_set); + params[0][0] = _mm_shuffle_epi32(fs, 0x00); + params[0][1] = _mm_shuffle_epi32(fs, 0x55); + params[0][2] = _mm_shuffle_epi32(fs, 0xaa); + __m128i fc = _mm_loadu_si128((__m128i*) fClipSet); + params[1][0] = _mm_shuffle_epi32(fc, 0x00); + params[1][1] = _mm_shuffle_epi32(fc, 0x55); + params[1][2] = _mm_shuffle_epi32(fc, 0xaa); + + const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); + + for (size_t i = 0; i < height; i += STEP_Y) + { + for (size_t j = 0; j < width; j += STEP_X) + { + + for (size_t ii = 0; ii < STEP_Y; ii++) + { + const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + + const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + if (yVb < vb_pos && (yVb >= vb_pos - 2)) // above + { + pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1; + pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3; + + pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2; + pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4; + } + else if (yVb >= vb_pos && (yVb <= vb_pos + 1)) // bottom + { + pImg2 = (yVb == vb_pos) ? pImg0 : pImg2; + pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4; + + pImg1 = (yVb == vb_pos) ? pImg0 : pImg1; + pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3; + } + __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128()); + + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + + + process2coeffs_5x5(params, &cur, &accumA, &accumB, 0, pImg3 + 0, pImg4 + 0, pImg1 + 1, pImg2 - 1); + process2coeffs_5x5(params, &cur, &accumA, &accumB, 1, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs_5x5(params, &cur, &accumA, &accumB, 2, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1); + bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos); + if (!(isNearVBabove || isNearVBbelow)) + { + accumA = _mm_srai_epi32(accumA, SHIFT); + accumB = _mm_srai_epi32(accumB, SHIFT); + } + else + { + accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3); + accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3); + } + accumA = _mm_packs_epi32(accumA, accumB); + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + if (j + STEP_X <= width) + { + //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask)); + } + else + { + //_mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_store_ss((float*) (dst + ii * dstStride + j), _mm_castsi128_ps(_mm_shuffle_epi8(accumA, mask))); + } + } + + } + + src += srcStride * STEP_Y; + dst += dstStride * STEP_Y; + } +} + +#define sh(x) 0x0202 * (x & 7) + 0x0100 + 0x1010 * (x & 8) + +static const uint16_t shuffleTab[4][2][8] = { + { + { sh(0), sh(1), sh(2), sh(3), sh(4), sh(5), sh(6), sh(7) }, + { sh(8), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(9), sh(4), sh(10), sh(8), sh(1), sh(5), sh(11), sh(7) }, + { sh(3), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(0), sh(3), sh(2), sh(1), sh(8), sh(7), sh(6), sh(5) }, + { sh(4), sh(9), sh(10), sh(11), sh(12), sh(13), sh(14), sh(15) }, + }, + { + { sh(9), sh(8), 
sh(10), sh(4), sh(3), sh(7), sh(11), sh(5) }, + { sh(1), sh(0), sh(2), sh(6), sh(12), sh(13), sh(14), sh(15) }, + }, +}; + + + +INLINE static void process2coeffs_7x7(__m128i params[2][2][6], __m128i *cur, __m128i *accumA, __m128i *accumB, const int i, const kvz_pixel* ptr0, const kvz_pixel* ptr1, const kvz_pixel* ptr2, const kvz_pixel* ptr3) { + const __m128i val00 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr0), _mm_setzero_si128()), *cur); + const __m128i val10 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr2), _mm_setzero_si128()), *cur); + const __m128i val01 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr1), _mm_setzero_si128()), *cur); + const __m128i val11 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) ptr3), _mm_setzero_si128()), *cur); + + __m128i val01A = _mm_unpacklo_epi16(val00, val10); + __m128i val01B = _mm_unpackhi_epi16(val00, val10); + __m128i val01C = _mm_unpacklo_epi16(val01, val11); + __m128i val01D = _mm_unpackhi_epi16(val01, val11); + + __m128i limit01A = params[0][1][i]; + __m128i limit01B = params[1][1][i]; + + val01A = _mm_min_epi16(val01A, limit01A); + val01B = _mm_min_epi16(val01B, limit01B); + val01C = _mm_min_epi16(val01C, limit01A); + val01D = _mm_min_epi16(val01D, limit01B); + + limit01A = _mm_sub_epi16(_mm_setzero_si128(), limit01A); + limit01B = _mm_sub_epi16(_mm_setzero_si128(), limit01B); + + val01A = _mm_max_epi16(val01A, limit01A); + val01B = _mm_max_epi16(val01B, limit01B); + val01C = _mm_max_epi16(val01C, limit01A); + val01D = _mm_max_epi16(val01D, limit01B); + + val01A = _mm_add_epi16(val01A, val01C); + val01B = _mm_add_epi16(val01B, val01D); + + const __m128i coeff01A = params[0][0][i]; + const __m128i coeff01B = params[1][0][i]; + + *accumA = _mm_add_epi32(*accumA, _mm_madd_epi16(val01A, coeff01A)); + *accumB = _mm_add_epi32(*accumB, _mm_madd_epi16(val01B, coeff01B)); +}; + + + + +static void alf_filter_7x7_block_sse41(encoder_state_t* const state, + const kvz_pixel* src_pixels, + kvz_pixel* dst_pixels, + const int src_stride, + const int dst_stride, + const short* filter_set, + const int16_t* fClipSet, + clp_rng clp_rng, + const int width, + const int height, + int x_pos, + int y_pos, + int blk_dst_x, + int blk_dst_y, + int vb_pos, + const int vb_ctu_height) +{ + assert((vb_ctu_height & (vb_ctu_height - 1)) == 0 && "vb_ctu_height must be a power of 2"); + alf_component_id compId = COMPONENT_Y; + + + const size_t srcStride = src_stride; + const size_t dstStride = dst_stride; + + const int SHIFT = state->encoder_control->bitdepth - 1; + const int ROUND = 1 << (SHIFT - 1); + + const size_t STEP_X = 8; + const size_t STEP_Y = 4; + + assert(y_pos % STEP_Y == 0 && "Wrong startHeight in filtering"); + assert(x_pos % STEP_X == 0 && "Wrong startWidth in filtering"); + assert(height % STEP_Y == 0 && "Wrong endHeight in filtering"); + assert(width % STEP_X == 0 && "Wrong endWidth in filtering"); + + const kvz_pixel* src = src_pixels + y_pos * srcStride + x_pos; + kvz_pixel* dst = dst_pixels + blk_dst_y * dstStride + blk_dst_x; + + const __m128i mmOffset = _mm_set1_epi32(ROUND); + const __m128i mmOffset1 = _mm_set1_epi32((1 << ((SHIFT + 3) - 1)) - ROUND); + const __m128i mmMin = _mm_set1_epi16(clp_rng.min); + const __m128i mmMax = _mm_set1_epi16(clp_rng.max); + + const __m128i mask = _mm_set_epi8(16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); + + for (size_t i = 0; i < height; i += STEP_Y) + { + const alf_classifier* pClass = 
state->tile->frame->alf_info->classifier[blk_dst_y + i] + blk_dst_x; + + for (size_t j = 0; j < width; j += STEP_X) + { + __m128i params[2][2][6]; + + for (int k = 0; k < 2; ++k) + { + const alf_classifier* cl = &pClass[j + 4 * k]; + + const int transpose_idx = cl->transpose_idx; + const int class_idx = cl->class_idx; + + static_assert(sizeof(*filter_set) == 2, "ALF coeffs must be 16-bit wide"); + static_assert(sizeof(*fClipSet) == 2, "ALF clip values must be 16-bit wide"); + + __m128i rawCoeff0, rawCoeff1; + __m128i rawClip0, rawClip1; + + rawCoeff0 = _mm_loadu_si128((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF)); + rawCoeff1 = _mm_loadl_epi64((const __m128i*) (filter_set + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8)); + + rawClip0 = _mm_loadu_si128((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF)); + rawClip1 = _mm_loadl_epi64((const __m128i*) (fClipSet + class_idx * MAX_NUM_ALF_LUMA_COEFF + 8)); + + const __m128i s0 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][0]); + const __m128i s1 = _mm_xor_si128(s0, _mm_set1_epi8((char)0x80)); + const __m128i s2 = _mm_loadu_si128((const __m128i*) shuffleTab[transpose_idx][1]); + const __m128i s3 = _mm_xor_si128(s2, _mm_set1_epi8((char)0x80)); + + const __m128i rawCoeffLo = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s0), _mm_shuffle_epi8(rawCoeff1, s1)); + const __m128i rawCoeffHi = _mm_or_si128(_mm_shuffle_epi8(rawCoeff0, s2), _mm_shuffle_epi8(rawCoeff1, s3)); + const __m128i rawClipLo = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s0), _mm_shuffle_epi8(rawClip1, s1)); + const __m128i rawClipHi = _mm_or_si128(_mm_shuffle_epi8(rawClip0, s2), _mm_shuffle_epi8(rawClip1, s3)); + + params[k][0][0] = _mm_shuffle_epi32(rawCoeffLo, 0x00); + params[k][0][1] = _mm_shuffle_epi32(rawCoeffLo, 0x55); + params[k][0][2] = _mm_shuffle_epi32(rawCoeffLo, 0xaa); + params[k][0][3] = _mm_shuffle_epi32(rawCoeffLo, 0xff); + params[k][0][4] = _mm_shuffle_epi32(rawCoeffHi, 0x00); + params[k][0][5] = _mm_shuffle_epi32(rawCoeffHi, 0x55); + params[k][1][0] = _mm_shuffle_epi32(rawClipLo, 0x00); + params[k][1][1] = _mm_shuffle_epi32(rawClipLo, 0x55); + params[k][1][2] = _mm_shuffle_epi32(rawClipLo, 0xaa); + params[k][1][3] = _mm_shuffle_epi32(rawClipLo, 0xff); + params[k][1][4] = _mm_shuffle_epi32(rawClipHi, 0x00); + params[k][1][5] = _mm_shuffle_epi32(rawClipHi, 0x55); + } + + for (size_t ii = 0; ii < STEP_Y; ii++) + { + const kvz_pixel* pImg0, * pImg1, * pImg2, * pImg3, * pImg4, * pImg5, * pImg6; + + pImg0 = src + j + ii * srcStride; + pImg1 = pImg0 + srcStride; + pImg2 = pImg0 - srcStride; + pImg3 = pImg1 + srcStride; + pImg4 = pImg2 - srcStride; + pImg5 = pImg3 + srcStride; + pImg6 = pImg4 - srcStride; + + const int yVb = (blk_dst_y + i + ii) & (vb_ctu_height - 1); + if (yVb < vb_pos && (yVb >= vb_pos - 4)) // above + { + pImg1 = (yVb == vb_pos - 1) ? pImg0 : pImg1; + pImg3 = (yVb >= vb_pos - 2) ? pImg1 : pImg3; + pImg5 = (yVb >= vb_pos - 3) ? pImg3 : pImg5; + + pImg2 = (yVb == vb_pos - 1) ? pImg0 : pImg2; + pImg4 = (yVb >= vb_pos - 2) ? pImg2 : pImg4; + pImg6 = (yVb >= vb_pos - 3) ? pImg4 : pImg6; + } + else if (yVb >= vb_pos && (yVb <= vb_pos + 3)) // bottom + { + pImg2 = (yVb == vb_pos) ? pImg0 : pImg2; + pImg4 = (yVb <= vb_pos + 1) ? pImg2 : pImg4; + pImg6 = (yVb <= vb_pos + 2) ? pImg4 : pImg6; + + pImg1 = (yVb == vb_pos) ? pImg0 : pImg1; + pImg3 = (yVb <= vb_pos + 1) ? pImg1 : pImg3; + pImg5 = (yVb <= vb_pos + 2) ? 
pImg3 : pImg5; + } + __m128i cur = _mm_unpacklo_epi8(_mm_loadu_si128((const __m128i*) pImg0), _mm_setzero_si128()); + + __m128i accumA = mmOffset; + __m128i accumB = mmOffset; + + process2coeffs_7x7(params, &cur, &accumA, &accumB, 0, pImg5 + 0, pImg6 + 0, pImg3 + 1, pImg4 - 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 1, pImg3 + 0, pImg4 + 0, pImg3 - 1, pImg4 + 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 2, pImg1 + 2, pImg2 - 2, pImg1 + 1, pImg2 - 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 3, pImg1 + 0, pImg2 + 0, pImg1 - 1, pImg2 + 1); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 4, pImg1 - 2, pImg2 + 2, pImg0 + 3, pImg0 - 3); + process2coeffs_7x7(params, &cur, &accumA, &accumB, 5, pImg0 + 2, pImg0 - 2, pImg0 + 1, pImg0 - 1); + + + bool isNearVBabove = yVb < vb_pos && (yVb >= vb_pos - 1); + bool isNearVBbelow = yVb >= vb_pos && (yVb <= vb_pos); + if (!(isNearVBabove || isNearVBbelow)) + { + accumA = _mm_srai_epi32(accumA, SHIFT); + accumB = _mm_srai_epi32(accumB, SHIFT); + } + else + { + accumA = _mm_srai_epi32(_mm_add_epi32(accumA, mmOffset1), SHIFT + 3); + accumB = _mm_srai_epi32(_mm_add_epi32(accumB, mmOffset1), SHIFT + 3); + } + accumA = _mm_packs_epi32(accumA, accumB); + accumA = _mm_add_epi16(accumA, cur); + accumA = _mm_min_epi16(mmMax, _mm_max_epi16(accumA, mmMin)); + + //_mm_storeu_si128((__m128i*) (dst + ii * dstStride + j), accumA); + _mm_storel_epi64((__m128i*) (dst + ii * dstStride + j), _mm_shuffle_epi8(accumA, mask)); + } + } + + src += srcStride * STEP_Y; + dst += dstStride * STEP_Y; + } +} + + + #endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_SSE41 @@ -349,6 +753,8 @@ int kvz_strategy_register_alf_sse41(void* opaque, uint8_t bitdepth) { #if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "sse41", 20, &alf_derive_classification_blk_sse41); + success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "sse41", 0, &alf_filter_5x5_block_sse41); + success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "sse41", 0, &alf_filter_7x7_block_sse41); } #endif // KVZ_BIT_DEPTH == 8 #endif From 8ef3e6a12651d4696712738a67159dfb149b0bd4 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 20:22:24 +0300 Subject: [PATCH 06/13] [alf] Add strategy for alf_get_blk_stats() and an initial AVX2 version --- build/kvazaar_lib/kvazaar_lib.vcxproj | 7 + build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 6 + src/Makefile.am | 4 +- src/alf.c | 260 +----------- src/strategies/avx2/alf-avx2.c | 393 ++++++++++++++++++ src/strategies/avx2/alf-avx2.h | 32 ++ src/strategies/generic/alf-generic.c | 265 ++++++++++++ src/strategies/strategies-alf.c | 5 + src/strategies/strategies-alf.h | 20 + 9 files changed, 732 insertions(+), 260 deletions(-) create mode 100644 src/strategies/avx2/alf-avx2.c create mode 100644 src/strategies/avx2/alf-avx2.h diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index 2ac2e406..67ee5ac4 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -170,6 +170,12 @@ + + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 AdvancedVectorExtensions2 @@ -264,6 +270,7 @@ + diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index 87f212e0..d547dfb4 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ 
b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -263,6 +263,9 @@ Optimization\strategies\sse41 + + Optimization\strategies\avx2 + @@ -491,6 +494,9 @@ Optimization\strategies\sse41 + + Optimization\strategies\avx2 + diff --git a/src/Makefile.am b/src/Makefile.am index af098c39..706dc9aa 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -196,7 +196,9 @@ libavx2_la_SOURCES = \ strategies/avx2/quant-avx2.h \ strategies/avx2/reg_sad_pow2_widths-avx2.h \ strategies/avx2/sao-avx2.c \ - strategies/avx2/sao-avx2.h + strategies/avx2/sao-avx2.h \ + strategies/avx2/alf-avx2.c \ + strategies/avx2/alf-avx2.h # strategies/avx2/encode_coding_tree-avx2.c \ # strategies/avx2/encode_coding_tree-avx2.h diff --git a/src/alf.c b/src/alf.c index d591aaed..fa5d7aa3 100644 --- a/src/alf.c +++ b/src/alf.c @@ -4276,264 +4276,6 @@ static void alf_get_avai_aps_ids_luma(encoder_state_t * const state, assert(*new_aps_id < (int)ALF_CTB_MAX_NUM_APS); //Wrong APS index assignment in getAvaiApsIdsLuma } -static void alf_calc_covariance(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], - const kvz_pixel *rec, - const int stride, - const channel_type channel, - const int transpose_idx, - int vb_distance, - short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) -{ - static const int alf_pattern_5[13] = { - 0, - 1, 2, 3, - 4, 5, 6, 5, 4, - 3, 2, 1, - 0 - }; - - static const int alf_pattern_7[25] = { - 0, - 1, 2, 3, - 4, 5, 6, 7, 8, - 9, 10, 11, 12, 11, 10, 9, - 8, 7, 6, 5, 4, - 3, 2, 1, - 0 - }; - - int clip_top_row = -4; - int clip_bot_row = 4; - if (vb_distance >= -3 && vb_distance < 0) - { - clip_bot_row = -vb_distance - 1; - clip_top_row = -clip_bot_row; // symmetric - } - else if (vb_distance >= 0 && vb_distance < 3) - { - clip_top_row = -vb_distance; - clip_bot_row = -clip_top_row; // symmetric - } - - const bool is_luma = channel == CHANNEL_TYPE_LUMA; - const int *filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; - const int half_filter_length = (is_luma ? 
7 : 5) >> 1; - const short* clip = alf_clipping_values[channel]; - const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; - - int k = 0; - - const int16_t curr = rec[0]; - - if (transpose_idx == 0) - { - for (int i = -half_filter_length; i < 0; i++) - { - const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; - const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; - for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); - } - } - } - for (int j = -half_filter_length; j < 0; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); - } - } - } - else if (transpose_idx == 1) - { - for (int j = -half_filter_length; j < 0; j++) - { - const kvz_pixel* rec0 = rec + j; - const kvz_pixel* rec1 = rec - j; - - for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int i = -half_filter_length; i < 0; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); - } - } - } - else if (transpose_idx == 2) - { - for (int i = -half_filter_length; i < 0; i++) - { - const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; - const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; - - for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); - } - } - } - for (int j = -half_filter_length; j < 0; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); - } - } - } - else - { - for (int j = -half_filter_length; j < 0; j++) - { - const kvz_pixel* rec0 = rec + j; - const kvz_pixel* rec1 = rec - j; - - for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int i = -half_filter_length; i < 0; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += curr; - } -} - -static void alf_get_blk_stats(encoder_state_t * const state, - channel_type channel, - alf_covariance *alf_covariance, - alf_classifier **g_classifier, - kvz_pixel *org, - int32_t org_stride, - kvz_pixel *rec, - int32_t rec_stride, - const int x_pos, - const int y_pos, - const int x_dst, - const int y_dst, - const int width, - const int height, - int vb_ctu_height, - int vb_pos, - short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) -{ - int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; - - const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; - - int num_coeff = channel == CHANNEL_TYPE_LUMA ? 
13 : 7; - int transpose_idx = 0; - int class_idx = 0; - - for (int i = 0; i < height; i++) - { - int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; - for (int j = 0; j < width; j++) - { - if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) - { - continue; - } - memset(e_local, 0, sizeof(e_local)); - if (g_classifier) - { - alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; - transpose_idx = cl->transpose_idx; - class_idx = cl->class_idx; - } - - double weight = 1.0; - if (0/*m_alfWSSD*/) - { - //weight = g_luma_level_to_weight_plut[org[j]]; - } - int16_t y_local = org[j] - rec[j]; - alf_calc_covariance(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); - for (int k = 0; k < num_coeff; k++) - { - for (int l = k; l < num_coeff; l++) - { - for (int b0 = 0; b0 < num_bins; b0++) - { - for (int b1 = 0; b1 < num_bins; b1++) - { - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); - } - else - { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; - } - } - } - } - for (int b = 0; b < num_bins; b++) - { - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); - } - else - { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; - } - } - } - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local); - } - else - { - alf_covariance[class_idx].pix_acc += y_local * (double)y_local; - } - } - org += org_stride; - rec += rec_stride; - } - - int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1; - for (class_idx = 0; class_idx < num_classes; class_idx++) - { - for (int k = 1; k < num_coeff; k++) - { - for (int l = 0; l < k; l++) - { - for (int b0 = 0; b0 < num_bins; b0++) - { - for (int b1 = 0; b1 < num_bins; b1++) - { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; - } - } - } - } - } -} static void alf_derive_stats_for_filtering(encoder_state_t * const state, short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) @@ -4619,7 +4361,7 @@ static void alf_derive_stats_for_filtering(encoder_state_t * const state, const int num_classes = is_luma ? MAX_NUM_ALF_CLASSES : 1; const int cov_index = ctu_rs_addr * num_classes; - alf_get_blk_stats(state, ch_type, + kvz_alf_get_blk_stats(state, ch_type, &alf_cov[cov_index], comp_idx ? NULL : alf_info->classifier, org, org_stride, rec, rec_stride, pos_x, pos_y, pos_x, pos_y, blk_w, blk_h, diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c new file mode 100644 index 00000000..70e5afe5 --- /dev/null +++ b/src/strategies/avx2/alf-avx2.c @@ -0,0 +1,393 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. 
+ * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see . + ****************************************************************************/ + +#include "global.h" + +#include "strategies/avx2/alf-avx2.h" + +#if COMPILE_INTEL_AVX2 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 + +#include +#include + +#include "strategyselector.h" + +static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1) +{ + return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref); +} + + +static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], + const kvz_pixel* rec, + const int stride, + const channel_type channel, + const int transpose_idx, + int vb_distance, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + static const int alf_pattern_5[13] = { + 0, + 1, 2, 3, + 4, 5, 6, 5, 4, + 3, 2, 1, + 0 + }; + + static const int alf_pattern_7[25] = { + 0, + 1, 2, 3, + 4, 5, 6, 7, 8, + 9, 10, 11, 12, 11, 10, 9, + 8, 7, 6, 5, 4, + 3, 2, 1, + 0 + }; + + int clip_top_row = -4; + int clip_bot_row = 4; + if (vb_distance >= -3 && vb_distance < 0) + { + clip_bot_row = -vb_distance - 1; + clip_top_row = -clip_bot_row; // symmetric + } + else if (vb_distance >= 0 && vb_distance < 3) + { + clip_top_row = -vb_distance; + clip_bot_row = -clip_top_row; // symmetric + } + + const bool is_luma = channel == CHANNEL_TYPE_LUMA; + const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; + const int half_filter_length = (is_luma ? 
7 : 5) >> 1; + const short* clip = alf_clipping_values[channel]; + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int k = 0; + + const int16_t curr = rec[0]; + + const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1); + + if (transpose_idx == 0) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + else if (transpose_idx == 1) + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = 
_mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + else if (transpose_idx == 2) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + + for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + else + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, 
val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + +} + +static void alf_get_blk_stats_avx2(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; + + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7; + int transpose_idx = 0; + int class_idx = 0; + + for (int i = 0; i < height; i++) + { + int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; + for (int j = 0; j < width; j++) + { + if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) + { + continue; + } + memset(e_local, 0, sizeof(e_local)); + if (g_classifier) + { + alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; + transpose_idx = cl->transpose_idx; + class_idx = cl->class_idx; + } + + int16_t y_local = org[j] - rec[j]; + + __m256d y_local_d = _mm256_set1_pd((double)y_local); + alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); + for (int k = 0; k < num_coeff; k++) + { + for (int l = k; l < num_coeff; l++) + { + for (int b0 = 0; b0 < 4; b0++) + { + __m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]); + //for (int b1 = 0; b1 < 4; b1++) + { + + //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); + __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); + __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); + + __m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d); + double data[4]; + _mm256_store_pd(data, multiplied); + + //alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; + alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; + alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; + alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; + } + } + } + //for (int b = 0; b < 4; b++) + { + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); + __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); + + __m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d); + + __m128i output = _mm256_cvtpd_epi32(multiplied); + + int32_t data[4]; + _mm_storeu_si128((__m128i*)data, output); + //alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + 
alf_covariance[class_idx].y[0][k] += data[0]; + alf_covariance[class_idx].y[1][k] += data[1]; + alf_covariance[class_idx].y[2][k] += data[2]; + alf_covariance[class_idx].y[3][k] += data[3]; + } + } + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; + } + org += org_stride; + rec += rec_stride; + } + + int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1; + for (class_idx = 0; class_idx < num_classes; class_idx++) + { + for (int k = 1; k < num_coeff; k++) + { + for (int l = 0; l < k; l++) + { + for (int b0 = 0; b0 < 4; b0++) + { + for (int b1 = 0; b1 < 4; b1++) + { + alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + } + } + } + } + } +} + +#endif // KVZ_BIT_DEPTH == 8 +#endif //COMPILE_INTEL_AVX2 + + +int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) { + bool success = true; +#if COMPILE_INTEL_AVX2 +#if KVZ_BIT_DEPTH == 8 + if (bitdepth == 8){ + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_avx2); + } +#endif // KVZ_BIT_DEPTH == 8 +#endif + return success; +} diff --git a/src/strategies/avx2/alf-avx2.h b/src/strategies/avx2/alf-avx2.h new file mode 100644 index 00000000..a791b6f8 --- /dev/null +++ b/src/strategies/avx2/alf-avx2.h @@ -0,0 +1,32 @@ +#pragma once +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for AVX2.
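 + * Currently this provides the AVX2 path for collecting ALF covariance
 + * statistics (alf_get_blk_stats), registered for 8-bit content only.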
+ */ + +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" + +int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth); + diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index d9aa3b95..e37acbb6 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -724,6 +724,269 @@ static void alf_filter_7x7_block_generic(encoder_state_t* const state, alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); } + + + +static void alf_calc_covariance_generic(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], + const kvz_pixel* rec, + const int stride, + const channel_type channel, + const int transpose_idx, + int vb_distance, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + static const int alf_pattern_5[13] = { + 0, + 1, 2, 3, + 4, 5, 6, 5, 4, + 3, 2, 1, + 0 + }; + + static const int alf_pattern_7[25] = { + 0, + 1, 2, 3, + 4, 5, 6, 7, 8, + 9, 10, 11, 12, 11, 10, 9, + 8, 7, 6, 5, 4, + 3, 2, 1, + 0 + }; + + int clip_top_row = -4; + int clip_bot_row = 4; + if (vb_distance >= -3 && vb_distance < 0) + { + clip_bot_row = -vb_distance - 1; + clip_top_row = -clip_bot_row; // symmetric + } + else if (vb_distance >= 0 && vb_distance < 3) + { + clip_top_row = -vb_distance; + clip_bot_row = -clip_top_row; // symmetric + } + + const bool is_luma = channel == CHANNEL_TYPE_LUMA; + const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; + const int half_filter_length = (is_luma ? 7 : 5) >> 1; + const short* clip = alf_clipping_values[channel]; + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int k = 0; + + const int16_t curr = rec[0]; + + if (transpose_idx == 0) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); + } + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); + } + } + } + else if (transpose_idx == 1) + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); + } + } + } + else if (transpose_idx == 2) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + + for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], 
rec1[-j]); + } + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); + } + } + } + else + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += curr; + } +} + +static void alf_get_blk_stats_generic(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; + + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7; + int transpose_idx = 0; + int class_idx = 0; + + for (int i = 0; i < height; i++) + { + int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; + for (int j = 0; j < width; j++) + { + if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) + { + continue; + } + memset(e_local, 0, sizeof(e_local)); + if (g_classifier) + { + alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; + transpose_idx = cl->transpose_idx; + class_idx = cl->class_idx; + } + + double weight = 1.0; + if (0/*m_alfWSSD*/) + { + //weight = g_luma_level_to_weight_plut[org[j]]; + } + int16_t y_local = org[j] - rec[j]; + alf_calc_covariance_generic(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); + for (int k = 0; k < num_coeff; k++) + { + for (int l = k; l < num_coeff; l++) + { + for (int b0 = 0; b0 < num_bins; b0++) + { + for (int b1 = 0; b1 < num_bins; b1++) + { + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + } + else + { + alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + } + } + } + } + for (int b = 0; b < num_bins; b++) + { + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); + } + else + { + alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + } + } + } + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local); + } + else + { + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; + } + } + org += org_stride; + rec += rec_stride; + } + + int num_classes = g_classifier ? 
MAX_NUM_ALF_CLASSES : 1; + for (class_idx = 0; class_idx < num_classes; class_idx++) + { + for (int k = 1; k < num_coeff; k++) + { + for (int l = 0; l < k; l++) + { + for (int b0 = 0; b0 < num_bins; b0++) + { + for (int b1 = 0; b1 < num_bins; b1++) + { + alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + } + } + } + } + } +} + + int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -731,6 +994,8 @@ int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic); success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic); + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_generic); + return success; } diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index d5d99936..666e8cb5 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -20,6 +20,7 @@ #include "strategies/strategies-alf.h" #include "strategies/sse41/alf-sse41.h" +#include "strategies/avx2/alf-avx2.h" #include "strategies/generic/alf-generic.h" #include "strategyselector.h" @@ -28,6 +29,7 @@ alf_derive_classification_blk_func* kvz_alf_derive_classification_blk; alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; +alf_get_blk_stats_func* kvz_alf_get_blk_stats; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { bool success = true; @@ -37,6 +39,9 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { if (kvz_g_hardware_flags.intel_flags.sse41) { success &= kvz_strategy_register_alf_sse41(opaque, bitdepth); } + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_alf_avx2(opaque, bitdepth); + } return success; } \ No newline at end of file diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h index 8ad15384..32a650f7 100644 --- a/src/strategies/strategies-alf.h +++ b/src/strategies/strategies-alf.h @@ -78,10 +78,29 @@ typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state, int vb_pos, const int vb_ctu_height); +typedef void (alf_get_blk_stats_func)(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]); + // Declare function pointers. 
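// These pointers are filled in by kvz_strategy_register_alf(): each compiled
// implementation registers itself (the SSE4.1/AVX2 ones only when the CPU
// supports them) and the registration with the highest priority is selected,
// so "avx2" at priority 40 beats "generic" at 0. An illustrative call through
// the pointer, with argument names taken from the typedef above:
//   kvz_alf_get_blk_stats(state, channel, alf_covariance, g_classifier,
//                         org, org_stride, rec, rec_stride,
//                         x_pos, y_pos, x_dst, y_dst, width, height,
//                         vb_ctu_height, vb_pos, alf_clipping_values);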
extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; +extern alf_get_blk_stats_func* kvz_alf_get_blk_stats; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); @@ -90,5 +109,6 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ {"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \ {"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \ + {"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats}, \ From 915bf3ca2456372c50887d459c41213ba28bf784 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Wed, 25 Aug 2021 20:29:58 +0300 Subject: [PATCH 07/13] [alf] Fix AVX2 priority --- src/strategies/avx2/alf-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 70e5afe5..a2e09483 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -385,7 +385,7 @@ int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) { #if COMPILE_INTEL_AVX2 #if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_avx2); + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "avx2", 40, &alf_get_blk_stats_avx2); } #endif // KVZ_BIT_DEPTH == 8 #endif From f4de5cfd0f7594970bcd01f467708776ad6f8eae Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 10:20:57 +0300 Subject: [PATCH 08/13] [alf] Cleanup alf_calc_covariance_avx2() and use integers in alf_get_blk_stats_avx2() --- src/strategies/avx2/alf-avx2.c | 196 ++++++++++----------------------- 1 file changed, 58 insertions(+), 138 deletions(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index a2e09483..7fd625b4 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -36,6 +36,19 @@ static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref); } +#define ALF_CLIP_AND_ADD(VAL0,VAL1) __m128i clips = _mm_loadl_epi64((__m128i*) clip); \ +__m128i neg_clips = _mm_sign_epi16(clips, negate); \ +__m128i val0 = _mm_set1_epi16((VAL0 - curr));\ +__m128i val1 = _mm_set1_epi16((VAL1 - curr));\ +__m128i min_clips_val0 = _mm_min_epi16(clips, val0);\ +__m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);\ +\ +__m128i min_clips_val1 = _mm_min_epi16(clips, val1);\ +__m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);\ +\ +__m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);\ +__m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));\ +_mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], const kvz_pixel* rec, @@ -96,36 +109,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); - __m128i val1 = 
_mm_set1_epi16((rec1[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[j], rec1[-j]); } } for (int j = -half_filter_length; j < 0; j++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[j] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec[j], rec[-j]); } } else if (transpose_idx == 1) @@ -137,36 +126,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); } } for (int i = -half_filter_length; i < 0; i++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); } } else if (transpose_idx == 2) @@ -178,36 +143,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX for 
(int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); - __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[j], rec1[-j]); } } for (int j = -half_filter_length; j < 0; j++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[j] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec[j], rec[-j]); } } else @@ -219,36 +160,12 @@ static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + ALF_CLIP_AND_ADD(rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); } } for (int i = -half_filter_length; i < 0; i++, k++) { - __m128i clips = _mm_loadl_epi64((__m128i*) clip); - __m128i neg_clips = _mm_sign_epi16(clips, negate); - __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); - __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); - __m128i min_clips_val0 = _mm_min_epi16(clips, val0); - __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); - - __m128i min_clips_val1 = _mm_min_epi16(clips, val1); - __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); - - __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); - __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); - _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + 
ALF_CLIP_AND_ADD(rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); } } @@ -303,7 +220,9 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, int16_t y_local = org[j] - rec[j]; - __m256d y_local_d = _mm256_set1_pd((double)y_local); + //__m256i const perm_mask = _mm256_set_epi32(14, 12, 10, 8, 6, 4, 2, 0); + + __m256i y_local_32 = _mm256_set1_epi32(y_local); alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); for (int k = 0; k < num_coeff; k++) { @@ -311,45 +230,46 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, { for (int b0 = 0; b0 < 4; b0++) { - __m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]); - //for (int b1 = 0; b1 < 4; b1++) + if (!e_local[k][b0]) continue; + __m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]); + /*for (int b1 = 0; b1 < 4; b1++) { + alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + }*/ + + //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); + __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); + int64_t data[4]; + _mm256_storeu_si256((__m256i*)data, multiplied); - //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); - __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); - __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); - __m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d); - double data[4]; - _mm256_store_pd(data, multiplied); - - //alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; - alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; - alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; - alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; - alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; - } + alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; + alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; + alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; + alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; + } } - //for (int b = 0; b < 4; b++) + /* + for (int b = 0; b < 4; b++) { - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); - __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1); - __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32); - - __m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d); - - __m128i output = _mm256_cvtpd_epi32(multiplied); - - int32_t data[4]; - _mm_storeu_si128((__m128i*)data, output); - //alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; - alf_covariance[class_idx].y[0][k] += data[0]; - alf_covariance[class_idx].y[1][k] += data[1]; - alf_covariance[class_idx].y[2][k] += data[2]; - alf_covariance[class_idx].y[3][k] += data[3]; - } + alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + }*/ + + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); + __m256i multiplied = _mm256_mul_epi32(y_local_32, e_local_32); + //__m256i output = _mm256_permutevar8x32_epi32(multiplied, perm_mask); + + int64_t data[4]; + _mm256_storeu_si256((__m256i*)data, multiplied); + + alf_covariance[class_idx].y[0][k] += data[0]; + alf_covariance[class_idx].y[1][k] += data[1]; + alf_covariance[class_idx].y[2][k] += data[2]; + alf_covariance[class_idx].y[3][k] += data[3]; } 
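/* A scalar sketch of the vector update just above, matching the commented-out
   reference code: for each clipping bin b,
     alf_covariance[class_idx].y[b][k] += e_local[k][b] * y_local;
   _mm256_cvtepi16_epi64 widens the four int16 e_local values so that
   _mm256_mul_epi32 can multiply the low 32 bits of each 64-bit lane by
   y_local, producing all four products in data[0..3] in one step. */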
alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } From be9527cf1d1c4a7f219d3d7bf58077939826ae68 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 11:07:13 +0300 Subject: [PATCH 09/13] [alf] Change the order of alf_covariance.ee values to get a better optimized solution for alf_get_blk_stats_avx2() --- src/alf.c | 42 ++++++++++++++-------------- src/alf.h | 2 +- src/strategies/avx2/alf-avx2.c | 17 ++++------- src/strategies/generic/alf-generic.c | 6 ++-- 4 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/alf.c b/src/alf.c index fa5d7aa3..6f1a3939 100644 --- a/src/alf.c +++ b/src/alf.c @@ -119,7 +119,7 @@ static void get_clip_max(const alf_covariance *cov, int *clip_max) { for (int l = 0; inc && l < num_coeff; ++l) { - if (cov->ee[clip_max[k]][0][k][l] != cov->ee[clip_max[k] + 1][0][k][l]) + if (cov->ee[k][l][clip_max[k]][0] != cov->ee[k][l][clip_max[k] + 1][0]) { inc = false; } @@ -142,7 +142,7 @@ static void reduce_clip_cost(const alf_covariance *cov, int *clip) { for (int l = 0; dec && l < cov->num_coeff; ++l) { - if (cov->ee[clip[k]][clip[l]][k][l] != cov->ee[clip[k] - 1][clip[l]][k][l]) + if (cov->ee[k][l][clip[k]][clip[l]] != cov->ee[k][l][clip[k] - 1][clip[l]]) { dec = false; } @@ -162,7 +162,7 @@ static void set_ey_from_clip(const alf_covariance *cov, const int* clip, double y[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ee[k][l] = cov->ee[clip[k]][clip[l]][k][l]; + ee[k][l] = cov->ee[k][l][clip[k]][clip[l]]; } } } @@ -352,8 +352,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } gns_solve_by_chol(ke, ky, f, size); @@ -373,8 +373,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } gns_solve_by_chol(ke, ky, f, size); @@ -392,8 +392,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } } @@ -404,8 +404,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[idx_min] = cov->y[clip[idx_min]][idx_min]; for (int l = 0; l < size; l++) { - ke[idx_min][l] = cov->ee[clip[idx_min]][clip[l]][idx_min][l]; - ke[l][idx_min] = cov->ee[clip[l]][clip[idx_min]][l][idx_min]; + ke[idx_min][l] = cov->ee[idx_min][l][clip[idx_min]][clip[l]]; + ke[l][idx_min] = cov->ee[l][idx_min][clip[l]][clip[idx_min]]; } } else @@ -469,9 +469,9 @@ static double calc_error_for_coeffs(const alf_covariance *cov, const int *clip, double sum = 0; for (int j = i + 1; j < num_coeff; j++) { - sum += cov->ee[clip[i]][clip[j]][i][j] * coeff[j]; + sum += cov->ee[i][j][clip[i]][clip[j]] * coeff[j]; } - error += ((cov->ee[clip[i]][clip[i]][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i]; + error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i])
* coeff[i]; } return error / factor; @@ -488,9 +488,9 @@ static double calc_error_for_cc_alf_coeffs(const alf_covariance *cov, const int1 for (int j = i + 1; j < num_coeff; j++) { // E[j][i] = E[i][j], sum will be multiplied by 2 later - sum += cov->ee[0][0][i][j] * coeff[j]; + sum += cov->ee[i][j][0][0] * coeff[j]; } - error += ((cov->ee[0][0][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; + error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; } return error / factor; @@ -753,7 +753,7 @@ static void add_alf_cov(alf_covariance *dst, alf_covariance *src) { for (int i = 0; i < num_coeff; i++) { - dst->ee[b0][b1][j][i] += src->ee[b0][b1][j][i]; + dst->ee[j][i][b0][b1] += src->ee[j][i][b0][b1]; } } } @@ -780,7 +780,7 @@ static void add_alf_cov_lhs_rhs(alf_covariance *dst, alf_covariance *lhs, alf_co { for (int i = 0; i < num_coeff; i++) { - dst->ee[b0][b1][j][i] = lhs->ee[b0][b1][j][i] + rhs->ee[b0][b1][j][i]; + dst->ee[j][i][b0][b1] = lhs->ee[j][i][b0][b1] + rhs->ee[j][i][b0][b1]; } } } @@ -1972,7 +1972,7 @@ static void derive_cc_alf_filter_coeff(alf_covariance *alf_covariance_frame_cc_a ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k]; for (int l = 0; l < size; l++) { - k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[0][0][k][l]; + k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[k][l][0][0]; } } @@ -2766,11 +2766,11 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { if (0 /*g_alf_wssd*/) { - alf_covariance->ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + alf_covariance->ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]); } else { - alf_covariance->ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance->ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; } } } @@ -2826,7 +2826,7 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { for (int b1 = 0; b1 < num_bins; b1++) { - alf_covariance->ee[b0][b1][k][l] = alf_covariance->ee[b1][b0][l][k]; + alf_covariance->ee[k][l][b0][b1] = alf_covariance->ee[l][k][b1][b0]; } } } diff --git a/src/alf.h b/src/alf.h index 862b284d..793102ac 100644 --- a/src/alf.h +++ b/src/alf.h @@ -176,7 +176,7 @@ typedef enum { PACK( typedef struct alf_covariance { double pix_acc; - int64_t ee[MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF]; + int64_t ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES]; int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF]; int num_coeff; int num_bins; diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 7fd625b4..91b79287 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -234,22 +234,15 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]); /*for (int b1 = 0; b1 < 4; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; }*/ - //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); - int64_t data[4]; - _mm256_storeu_si256((__m256i*)data, multiplied); - - - 
alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; - alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; - alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; - alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; - + __m256i orig = _mm256_lddqu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); + _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig)); + } } /* @@ -288,7 +281,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, { for (int b1 = 0; b1 < 4; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0]; } } } diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index e37acbb6..99def841 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -933,11 +933,11 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { if (0/*m_alfWSSD*/) { - alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + alf_covariance[class_idx].ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]); } else { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; } } } @@ -978,7 +978,7 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { for (int b1 = 0; b1 < num_bins; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0]; } } } From 5df8add04610288ca188f344fa8e683d2d392ab1 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 15:37:01 +0300 Subject: [PATCH 10/13] [alf] Change order of alf_covariance.y array for better AVX2 optimization in alf_get_blk_stats_avx2() --- src/alf.c | 30 ++++++++++++++-------------- src/alf.h | 2 +- src/strategies/avx2/alf-avx2.c | 17 +++++----------- src/strategies/generic/alf-generic.c | 4 ++-- 4 files changed, 23 insertions(+), 30 deletions(-) diff --git a/src/alf.c b/src/alf.c index 6f1a3939..19c6893e 100644 --- a/src/alf.c +++ b/src/alf.c @@ -115,7 +115,7 @@ static void get_clip_max(const alf_covariance *cov, int *clip_max) clip_max[k] = 0; bool inc = true; - while (inc && clip_max[k] + 1 < cov->num_bins && cov->y[clip_max[k] + 1][k] == cov->y[clip_max[k]][k]) + while (inc && clip_max[k] + 1 < cov->num_bins && cov->y[k][clip_max[k] + 1] == cov->y[k][clip_max[k]]) { for (int l = 0; inc && l < num_coeff; ++l) { @@ -138,7 +138,7 @@ static void reduce_clip_cost(const alf_covariance *cov, int *clip) for (int k = 0; k < cov->num_coeff - 1; ++k) { bool dec = true; - while (dec && clip[k] > 0 && cov->y[clip[k] - 1][k] == cov->y[clip[k]][k]) + while (dec && clip[k] > 0 && cov->y[k][clip[k] - 1] == cov->y[k][clip[k]]) { for (int l = 0; dec && l < cov->num_coeff; ++l) { @@ -159,7 +159,7 @@ static void set_ey_from_clip(const alf_covariance *cov, const int* clip, double { for (int k = 0; k < size; k++) { - y[k] = cov->y[clip[k]][k]; + y[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ee[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -304,7 +304,7 @@ static double calculate_error(const alf_covariance *cov, const int *clip, const double sum = 0; for (int i = 0; i < cov->num_coeff; i++) { - sum += coeff[i] * cov->y[clip[i]][i]; + sum += 
coeff[i] * cov->y[i][clip[i]]; } return cov->pix_acc - sum; @@ -349,7 +349,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b if (clip[k] - step >= clip_max[k]) { clip[k] -= step; - ky[k] = cov->y[clip[k]][k]; + ky[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -370,7 +370,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b if (clip[k] + step < cov->num_bins) { clip[k] += step; - ky[k] = cov->y[clip[k]][k]; + ky[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -389,7 +389,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b clip[k] -= step; } - ky[k] = cov->y[clip[k]][k]; + ky[k] = cov->y[k][clip[k]]; for (int l = 0; l < size; l++) { ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; @@ -401,7 +401,7 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b { err_best = err_min; clip[idx_min] += inc_min; - ky[idx_min] = cov->y[clip[idx_min]][idx_min]; + ky[idx_min] = cov->y[idx_min][clip[idx_min]]; for (int l = 0; l < size; l++) { ke[idx_min][l] = cov->ee[idx_min][l][clip[idx_min]][clip[l]]; @@ -471,7 +471,7 @@ static double calc_error_for_coeffs(const alf_covariance *cov, const int *clip, { sum += cov->ee[i][j][clip[i]][clip[j]] * coeff[j]; } - error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i]; + error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[i][clip[i]]) * coeff[i]; } return error / factor; @@ -490,7 +490,7 @@ static double calc_error_for_cc_alf_coeffs(const alf_covariance *cov, const int1 // E[j][i] = E[i][j], sum will be multiplied by 2 later sum += cov->ee[i][j][0][0] * coeff[j]; } - error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; + error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[i][0]) * coeff[i]; } return error / factor; @@ -762,7 +762,7 @@ static void add_alf_cov(alf_covariance *dst, alf_covariance *src) { for (int j = 0; j < num_coeff; j++) { - dst->y[b][j] += src->y[b][j]; + dst->y[j][b] += src->y[j][b]; } } dst->pix_acc += src->pix_acc; @@ -789,7 +789,7 @@ static void add_alf_cov_lhs_rhs(alf_covariance *dst, alf_covariance *lhs, alf_co { for (int j = 0; j < num_coeff; j++) { - dst->y[b][j] = lhs->y[b][j] + rhs->y[b][j]; + dst->y[j][b] = lhs->y[j][b] + rhs->y[j][b]; } } dst->pix_acc = lhs->pix_acc + rhs->pix_acc; @@ -1969,7 +1969,7 @@ static void derive_cc_alf_filter_coeff(alf_covariance *alf_covariance_frame_cc_a for (int k = 0; k < size; k++) { - ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k]; + ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[k][0]; for (int l = 0; l < size; l++) { k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[k][l][0][0]; @@ -2779,11 +2779,11 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { if (0 /*g_alf_wssd*/) { - alf_covariance->y[b][k] += weight * (e_local[k][b] * (double)y_local); + alf_covariance->y[k][b] += weight * (e_local[k][b] * (double)y_local); } else { - alf_covariance->y[b][k] += e_local[k][b] * (double)y_local; + alf_covariance->y[k][b] += e_local[k][b] * (double)y_local; } } } diff --git a/src/alf.h b/src/alf.h index 793102ac..1cc0a71b 100644 --- a/src/alf.h +++ b/src/alf.h @@ -177,7 +177,7 @@ PACK( typedef struct alf_covariance { double pix_acc; int64_t 
ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES]; - int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF]; + int32_t y[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; int num_coeff; int num_bins; } alf_covariance;) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 91b79287..1048e041 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -240,7 +240,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); - __m256i orig = _mm256_lddqu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); + __m256i orig = _mm256_loadu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig)); } @@ -248,21 +248,14 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, /* for (int b = 0; b < 4; b++) { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; }*/ __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); - __m256i multiplied = _mm256_mul_epi32(y_local_32, e_local_32); - //__m256i output = _mm256_permutevar8x32_epi32(multiplied, perm_mask); - - int64_t data[4]; - _mm256_storeu_si256((__m256i*)data, multiplied); - - alf_covariance[class_idx].y[0][k] += data[0]; - alf_covariance[class_idx].y[1][k] += data[1]; - alf_covariance[class_idx].y[2][k] += data[2]; - alf_covariance[class_idx].y[3][k] += data[3]; + __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); + __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); + _mm_store_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); } alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index 99def841..6758fa4e 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -946,11 +946,11 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { if (0/*m_alfWSSD*/) { - alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); + alf_covariance[class_idx].y[k][b] += weight * (e_local[k][b] * (double)y_local); } else { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; } } } From 671497326453354bd0851a72c693d897f9a78da8 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 18:05:06 +0300 Subject: [PATCH 11/13] [alf] Change _mm_store_si128 to _mm_storeu_si128 in alf_get_blk_stats_avx2() --- src/strategies/avx2/alf-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 1048e041..e0ad40a4 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -255,7 +255,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); __m128i orig = _mm_loadu_si128((__m128i*) 
&alf_covariance[class_idx].y[k][0]); - _mm_store_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); + _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); } alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } From fdf125f406b73bdfa8cecf28d843f9f02d02aa2d Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 27 Aug 2021 10:25:20 +0300 Subject: [PATCH 12/13] [alf] Fix incorrect conversion in alf_get_blk_stats_avx2 --- src/strategies/avx2/alf-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index e0ad40a4..1d4a8e9c 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -252,7 +252,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, }*/ __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); - __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); + __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); From 26f18865f7b6600ed811580c365b41469a45820f Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Fri, 27 Aug 2021 13:40:28 +0300 Subject: [PATCH 13/13] [alf] Change the processing in alf_get_blk_stats_avx2() to allow utilizing the whole 256-bit register --- src/strategies/avx2/alf-avx2.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 1d4a8e9c..e832eb48 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -249,14 +249,23 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, for (int b = 0; b < 4; b++) { alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; - }*/ + }*/ - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + } + for (int k = 0; k < num_coeff-1; k+=2) + { + __m128i e_local_1 = _mm_loadu_si128((__m128i*) & e_local[k][0]); __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); - __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); - _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); + __m256i orig = _mm256_loadu_si256((__m256i*) & alf_covariance[class_idx].y[k][0]); + _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].y[k], _mm256_add_epi32(multiplied, orig)); } + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[num_coeff-1][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); + __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); + __m128i orig = _mm_loadu_si128((__m128i*) & alf_covariance[class_idx].y[num_coeff - 1][0]); + _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[num_coeff - 1], _mm_add_epi32(_mm256_castsi256_si128(multiplied), orig)); + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } org += org_stride;
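/* A scalar sketch of the widened loop above (illustrative only): two
   coefficient rows of e_local are consumed per iteration so that all eight
   32-bit lanes of the 256-bit register do useful work,
     for (int k = 0; k + 1 < num_coeff; k += 2)
       for (int b = 0; b < 4; b++) {
         alf_covariance[class_idx].y[k][b]     += e_local[k][b]     * y_local;
         alf_covariance[class_idx].y[k + 1][b] += e_local[k + 1][b] * y_local;
       }
   and the last row (num_coeff is odd: 13 for luma, 7 for chroma) is finished
   by the separate 128-bit tail step above. */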