diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj index 2ac2e406..67ee5ac4 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj @@ -170,6 +170,12 @@ + + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 + AdvancedVectorExtensions2 AdvancedVectorExtensions2 @@ -264,6 +270,7 @@ + diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters index 87f212e0..d547dfb4 100644 --- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters +++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters @@ -263,6 +263,9 @@ Optimization\strategies\sse41 + + Optimization\strategies\avx2 + @@ -491,6 +494,9 @@ Optimization\strategies\sse41 + + Optimization\strategies\avx2 + diff --git a/src/Makefile.am b/src/Makefile.am index af098c39..706dc9aa 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -196,7 +196,9 @@ libavx2_la_SOURCES = \ strategies/avx2/quant-avx2.h \ strategies/avx2/reg_sad_pow2_widths-avx2.h \ strategies/avx2/sao-avx2.c \ - strategies/avx2/sao-avx2.h + strategies/avx2/sao-avx2.h \ + strategies/avx2/alf-avx2.c \ + strategies/avx2/alf-avx2.h # strategies/avx2/encode_coding_tree-avx2.c \ # strategies/avx2/encode_coding_tree-avx2.h diff --git a/src/alf.c b/src/alf.c index d591aaed..fa5d7aa3 100644 --- a/src/alf.c +++ b/src/alf.c @@ -4276,264 +4276,6 @@ static void alf_get_avai_aps_ids_luma(encoder_state_t * const state, assert(*new_aps_id < (int)ALF_CTB_MAX_NUM_APS); //Wrong APS index assignment in getAvaiApsIdsLuma } -static void alf_calc_covariance(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], - const kvz_pixel *rec, - const int stride, - const channel_type channel, - const int transpose_idx, - int vb_distance, - short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) -{ - static const int alf_pattern_5[13] = { - 0, - 1, 2, 3, - 4, 5, 6, 5, 4, - 3, 2, 1, - 0 - }; - - static const int alf_pattern_7[25] = { - 0, - 1, 2, 3, - 4, 5, 6, 7, 8, - 9, 10, 11, 12, 11, 10, 9, - 8, 7, 6, 5, 4, - 3, 2, 1, - 0 - }; - - int clip_top_row = -4; - int clip_bot_row = 4; - if (vb_distance >= -3 && vb_distance < 0) - { - clip_bot_row = -vb_distance - 1; - clip_top_row = -clip_bot_row; // symmetric - } - else if (vb_distance >= 0 && vb_distance < 3) - { - clip_top_row = -vb_distance; - clip_bot_row = -clip_top_row; // symmetric - } - - const bool is_luma = channel == CHANNEL_TYPE_LUMA; - const int *filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; - const int half_filter_length = (is_luma ? 
7 : 5) >> 1; - const short* clip = alf_clipping_values[channel]; - const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; - - int k = 0; - - const int16_t curr = rec[0]; - - if (transpose_idx == 0) - { - for (int i = -half_filter_length; i < 0; i++) - { - const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; - const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; - for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); - } - } - } - for (int j = -half_filter_length; j < 0; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); - } - } - } - else if (transpose_idx == 1) - { - for (int j = -half_filter_length; j < 0; j++) - { - const kvz_pixel* rec0 = rec + j; - const kvz_pixel* rec1 = rec - j; - - for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int i = -half_filter_length; i < 0; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); - } - } - } - else if (transpose_idx == 2) - { - for (int i = -half_filter_length; i < 0; i++) - { - const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; - const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; - - for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); - } - } - } - for (int j = -half_filter_length; j < 0; j++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); - } - } - } - else - { - for (int j = -half_filter_length; j < 0; j++) - { - const kvz_pixel* rec0 = rec + j; - const kvz_pixel* rec1 = rec - j; - - for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int i = -half_filter_length; i < 0; i++, k++) - { - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); - } - } - } - for (int b = 0; b < num_bins; b++) - { - e_local[filter_pattern[k]][b] += curr; - } -} - -static void alf_get_blk_stats(encoder_state_t * const state, - channel_type channel, - alf_covariance *alf_covariance, - alf_classifier **g_classifier, - kvz_pixel *org, - int32_t org_stride, - kvz_pixel *rec, - int32_t rec_stride, - const int x_pos, - const int y_pos, - const int x_dst, - const int y_dst, - const int width, - const int height, - int vb_ctu_height, - int vb_pos, - short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) -{ - int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; - - const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; - - int num_coeff = channel == CHANNEL_TYPE_LUMA ? 
13 : 7; - int transpose_idx = 0; - int class_idx = 0; - - for (int i = 0; i < height; i++) - { - int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; - for (int j = 0; j < width; j++) - { - if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) - { - continue; - } - memset(e_local, 0, sizeof(e_local)); - if (g_classifier) - { - alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; - transpose_idx = cl->transpose_idx; - class_idx = cl->class_idx; - } - - double weight = 1.0; - if (0/*m_alfWSSD*/) - { - //weight = g_luma_level_to_weight_plut[org[j]]; - } - int16_t y_local = org[j] - rec[j]; - alf_calc_covariance(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); - for (int k = 0; k < num_coeff; k++) - { - for (int l = k; l < num_coeff; l++) - { - for (int b0 = 0; b0 < num_bins; b0++) - { - for (int b1 = 0; b1 < num_bins; b1++) - { - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); - } - else - { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; - } - } - } - } - for (int b = 0; b < num_bins; b++) - { - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); - } - else - { - alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; - } - } - } - if (0/*m_alfWSSD*/) - { - alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local); - } - else - { - alf_covariance[class_idx].pix_acc += y_local * (double)y_local; - } - } - org += org_stride; - rec += rec_stride; - } - - int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1; - for (class_idx = 0; class_idx < num_classes; class_idx++) - { - for (int k = 1; k < num_coeff; k++) - { - for (int l = 0; l < k; l++) - { - for (int b0 = 0; b0 < num_bins; b0++) - { - for (int b1 = 0; b1 < num_bins; b1++) - { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; - } - } - } - } - } -} static void alf_derive_stats_for_filtering(encoder_state_t * const state, short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) @@ -4619,7 +4361,7 @@ static void alf_derive_stats_for_filtering(encoder_state_t * const state, const int num_classes = is_luma ? MAX_NUM_ALF_CLASSES : 1; const int cov_index = ctu_rs_addr * num_classes; - alf_get_blk_stats(state, ch_type, + kvz_alf_get_blk_stats(state, ch_type, &alf_cov[cov_index], comp_idx ? NULL : alf_info->classifier, org, org_stride, rec, rec_stride, pos_x, pos_y, pos_x, pos_y, blk_w, blk_h, diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c new file mode 100644 index 00000000..70e5afe5 --- /dev/null +++ b/src/strategies/avx2/alf-avx2.c @@ -0,0 +1,393 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2021 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. 
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "global.h"
+
+#include "strategies/avx2/alf-avx2.h"
+
+#if COMPILE_INTEL_AVX2
+#include "kvazaar.h"
+#if KVZ_BIT_DEPTH == 8
+
+#include <immintrin.h>
+#include <stdlib.h>
+
+#include "strategyselector.h"
+
+// Scalar reference for the vectorized min/max clipping below; not called here.
+static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1)
+{
+  return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref);
+}
+
+
+// Accumulates the clipped sample differences for one pixel into e_local. All
+// four clipping bins (MAX_ALF_NUM_CLIPPING_VALUES) of a tap are handled with
+// one 64-bit vector of int16, so 128-bit instructions suffice here.
+static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
+                                     const kvz_pixel* rec,
+                                     const int stride,
+                                     const channel_type channel,
+                                     const int transpose_idx,
+                                     int vb_distance,
+                                     short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
+{
+  static const int alf_pattern_5[13] = {
+    0,
+    1, 2, 3,
+    4, 5, 6, 5, 4,
+    3, 2, 1,
+    0
+  };
+
+  static const int alf_pattern_7[25] = {
+    0,
+    1, 2, 3,
+    4, 5, 6, 7, 8,
+    9, 10, 11, 12, 11, 10, 9,
+    8, 7, 6, 5, 4,
+    3, 2, 1,
+    0
+  };
+
+  int clip_top_row = -4;
+  int clip_bot_row = 4;
+  if (vb_distance >= -3 && vb_distance < 0)
+  {
+    clip_bot_row = -vb_distance - 1;
+    clip_top_row = -clip_bot_row; // symmetric
+  }
+  else if (vb_distance >= 0 && vb_distance < 3)
+  {
+    clip_top_row = -vb_distance;
+    clip_bot_row = -clip_top_row; // symmetric
+  }
+
+  const bool is_luma = channel == CHANNEL_TYPE_LUMA;
+  const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
+  const int half_filter_length = (is_luma ? 
7 : 5) >> 1; + const short* clip = alf_clipping_values[channel]; + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int k = 0; + + const int16_t curr = rec[0]; + + const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1); + + if (transpose_idx == 0) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + else if (transpose_idx == 1) + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = 
_mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + else if (transpose_idx == 2) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + + for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result); + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[j] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-j] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + else + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, val0); + __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips); + + __m128i min_clips_val1 = _mm_min_epi16(clips, val1); + __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips); + + __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]); + __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1)); + _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result); + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + __m128i clips = _mm_loadl_epi64((__m128i*) clip); + __m128i neg_clips = _mm_sign_epi16(clips, negate); + __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr)); + __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr)); + __m128i min_clips_val0 = _mm_min_epi16(clips, 
val0);
+      __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+      __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+      __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+      __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]);
+      __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+      _mm_storel_epi64((__m128i*) &e_local[filter_pattern[k]][0], result);
+    }
+  }
+
+  // Center tap: add the current sample itself to every clipping bin.
+  __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]);
+  __m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr));
+  _mm_storel_epi64((__m128i*) &e_local[filter_pattern[k]][0], result);
+
+}
+
+static void alf_get_blk_stats_avx2(encoder_state_t* const state,
+  channel_type channel,
+  alf_covariance* alf_covariance,
+  alf_classifier** g_classifier,
+  kvz_pixel* org,
+  int32_t org_stride,
+  kvz_pixel* rec,
+  int32_t rec_stride,
+  const int x_pos,
+  const int y_pos,
+  const int x_dst,
+  const int y_dst,
+  const int width,
+  const int height,
+  int vb_ctu_height,
+  int vb_pos,
+  short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
+{
+  int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
+
+  const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
+
+  int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
+  int transpose_idx = 0;
+  int class_idx = 0;
+
+  for (int i = 0; i < height; i++)
+  {
+    int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
+    for (int j = 0; j < width; j++)
+    {
+      if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
+      {
+        continue;
+      }
+      memset(e_local, 0, sizeof(e_local));
+      if (g_classifier)
+      {
+        alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
+        transpose_idx = cl->transpose_idx;
+        class_idx = cl->class_idx;
+      }
+
+      int16_t y_local = org[j] - rec[j];
+
+      __m256d y_local_d = _mm256_set1_pd((double)y_local);
+      alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
+      for (int k = 0; k < num_coeff; k++)
+      {
+        for (int l = k; l < num_coeff; l++)
+        {
+          for (int b0 = 0; b0 < 4; b0++)
+          {
+            __m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]);
+            // The b1 loop is vectorized: all four clipping bins at once.
+            {
+              __m128i e_local_1 = _mm_loadl_epi64((__m128i*) &e_local[l][0]);
+              __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
+              __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
+
+              __m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d);
+              double data[4];
+              // data is not guaranteed 32-byte alignment, so use the
+              // unaligned store.
+              _mm256_storeu_pd(data, multiplied);
+
+              //alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
+              alf_covariance[class_idx].ee[b0][0][k][l] += data[0];
+              alf_covariance[class_idx].ee[b0][1][k][l] += data[1];
+              alf_covariance[class_idx].ee[b0][2][k][l] += data[2];
+              alf_covariance[class_idx].ee[b0][3][k][l] += data[3];
+            }
+          }
+        }
+        // The b loop is vectorized as well; the int16*int16 products fit
+        // exactly in int32, so the double-to-int32 conversion is lossless.
+        {
+          __m128i e_local_1 = _mm_loadl_epi64((__m128i*) &e_local[k][0]);
+          __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
+          __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
+
+          __m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d);
+
+          __m128i output = _mm256_cvtpd_epi32(multiplied);
+
+          int32_t data[4];
+          _mm_storeu_si128((__m128i*)data, output);
+          //alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
+          alf_covariance[class_idx].y[0][k] += data[0];
+          alf_covariance[class_idx].y[1][k] += data[1];
+          alf_covariance[class_idx].y[2][k] += data[2];
+          alf_covariance[class_idx].y[3][k] += data[3];
+        }
+      }
+      alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
+    }
+    org += org_stride;
+    rec += rec_stride;
+  }
+
+  // Fill in the lower-triangular half of ee from the symmetric upper half.
+  int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
+  for (class_idx = 0; class_idx < num_classes; class_idx++)
+  {
+    for (int k = 1; k < num_coeff; k++)
+    {
+      for (int l = 0; l < k; l++)
+      {
+        for (int b0 = 0; b0 < 4; b0++)
+        {
+          for (int b1 = 0; b1 < 4; b1++)
+          {
+            alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
+          }
+        }
+      }
+    }
+  }
+}
+
+#endif // KVZ_BIT_DEPTH == 8
+#endif //COMPILE_INTEL_AVX2
+
+
+int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) {
+  bool success = true;
+#if COMPILE_INTEL_AVX2
+#if KVZ_BIT_DEPTH == 8
+  if (bitdepth == 8) {
+    success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "avx2", 40, &alf_get_blk_stats_avx2);
+  }
+#endif // KVZ_BIT_DEPTH == 8
+#endif
+  return success;
+}
diff --git a/src/strategies/avx2/alf-avx2.h b/src/strategies/avx2/alf-avx2.h
new file mode 100644
index 00000000..a791b6f8
--- /dev/null
+++ b/src/strategies/avx2/alf-avx2.h
@@ -0,0 +1,32 @@
+#pragma once
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Optimizations for AVX2.
+ */ + +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" + +int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth); + diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index d9aa3b95..e37acbb6 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -724,6 +724,269 @@ static void alf_filter_7x7_block_generic(encoder_state_t* const state, alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height); } + + + +static void alf_calc_covariance_generic(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES], + const kvz_pixel* rec, + const int stride, + const channel_type channel, + const int transpose_idx, + int vb_distance, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + static const int alf_pattern_5[13] = { + 0, + 1, 2, 3, + 4, 5, 6, 5, 4, + 3, 2, 1, + 0 + }; + + static const int alf_pattern_7[25] = { + 0, + 1, 2, 3, + 4, 5, 6, 7, 8, + 9, 10, 11, 12, 11, 10, 9, + 8, 7, 6, 5, 4, + 3, 2, 1, + 0 + }; + + int clip_top_row = -4; + int clip_bot_row = 4; + if (vb_distance >= -3 && vb_distance < 0) + { + clip_bot_row = -vb_distance - 1; + clip_top_row = -clip_bot_row; // symmetric + } + else if (vb_distance >= 0 && vb_distance < 3) + { + clip_top_row = -vb_distance; + clip_bot_row = -clip_top_row; // symmetric + } + + const bool is_luma = channel == CHANNEL_TYPE_LUMA; + const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5; + const int half_filter_length = (is_luma ? 7 : 5) >> 1; + const short* clip = alf_clipping_values[channel]; + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int k = 0; + + const int16_t curr = rec[0]; + + if (transpose_idx == 0) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]); + } + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); + } + } + } + else if (transpose_idx == 1) + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); + } + } + } + else if (transpose_idx == 2) + { + for (int i = -half_filter_length; i < 0; i++) + { + const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride; + const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride; + + for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], 
rec1[-j]); + } + } + } + for (int j = -half_filter_length; j < 0; j++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]); + } + } + } + else + { + for (int j = -half_filter_length; j < 0; j++) + { + const kvz_pixel* rec0 = rec + j; + const kvz_pixel* rec1 = rec - j; + + for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int i = -half_filter_length; i < 0; i++, k++) + { + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]); + } + } + } + for (int b = 0; b < num_bins; b++) + { + e_local[filter_pattern[k]][b] += curr; + } +} + +static void alf_get_blk_stats_generic(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]) +{ + int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES]; + + const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES; + + int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7; + int transpose_idx = 0; + int class_idx = 0; + + for (int i = 0; i < height; i++) + { + int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos; + for (int j = 0; j < width; j++) + { + if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX) + { + continue; + } + memset(e_local, 0, sizeof(e_local)); + if (g_classifier) + { + alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j]; + transpose_idx = cl->transpose_idx; + class_idx = cl->class_idx; + } + + double weight = 1.0; + if (0/*m_alfWSSD*/) + { + //weight = g_luma_level_to_weight_plut[org[j]]; + } + int16_t y_local = org[j] - rec[j]; + alf_calc_covariance_generic(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values); + for (int k = 0; k < num_coeff; k++) + { + for (int l = k; l < num_coeff; l++) + { + for (int b0 = 0; b0 < num_bins; b0++) + { + for (int b1 = 0; b1 < num_bins; b1++) + { + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + } + else + { + alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + } + } + } + } + for (int b = 0; b < num_bins; b++) + { + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local); + } + else + { + alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local; + } + } + } + if (0/*m_alfWSSD*/) + { + alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local); + } + else + { + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; + } + } + org += org_stride; + rec += rec_stride; + } + + int num_classes = g_classifier ? 
MAX_NUM_ALF_CLASSES : 1; + for (class_idx = 0; class_idx < num_classes; class_idx++) + { + for (int k = 1; k < num_coeff; k++) + { + for (int l = 0; l < k; l++) + { + for (int b0 = 0; b0 < num_bins; b0++) + { + for (int b1 = 0; b1 < num_bins; b1++) + { + alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + } + } + } + } + } +} + + int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -731,6 +994,8 @@ int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic); success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic); success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic); + success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_generic); + return success; } diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c index d5d99936..666e8cb5 100644 --- a/src/strategies/strategies-alf.c +++ b/src/strategies/strategies-alf.c @@ -20,6 +20,7 @@ #include "strategies/strategies-alf.h" #include "strategies/sse41/alf-sse41.h" +#include "strategies/avx2/alf-avx2.h" #include "strategies/generic/alf-generic.h" #include "strategyselector.h" @@ -28,6 +29,7 @@ alf_derive_classification_blk_func* kvz_alf_derive_classification_blk; alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; +alf_get_blk_stats_func* kvz_alf_get_blk_stats; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { bool success = true; @@ -37,6 +39,9 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) { if (kvz_g_hardware_flags.intel_flags.sse41) { success &= kvz_strategy_register_alf_sse41(opaque, bitdepth); } + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_alf_avx2(opaque, bitdepth); + } return success; } \ No newline at end of file diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h index 8ad15384..32a650f7 100644 --- a/src/strategies/strategies-alf.h +++ b/src/strategies/strategies-alf.h @@ -78,10 +78,29 @@ typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state, int vb_pos, const int vb_ctu_height); +typedef void (alf_get_blk_stats_func)(encoder_state_t* const state, + channel_type channel, + alf_covariance* alf_covariance, + alf_classifier** g_classifier, + kvz_pixel* org, + int32_t org_stride, + kvz_pixel* rec, + int32_t rec_stride, + const int x_pos, + const int y_pos, + const int x_dst, + const int y_dst, + const int width, + const int height, + int vb_ctu_height, + int vb_pos, + short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]); + // Declare function pointers. 
extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk; extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk; extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk; +extern alf_get_blk_stats_func* kvz_alf_get_blk_stats; int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); @@ -90,5 +109,6 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth); {"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \ {"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \ {"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \ + {"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats}, \
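
Note for reviewers: the AVX2 strategy vectorizes the two innermost clipping-bin
loops of alf_get_blk_stats over all MAX_ALF_NUM_CLIPPING_VALUES == 4 bins at
once. The per-tap clipping rests on the identity
CLIP(-c, +c, x) == MAX(MIN(x, c), -c), which maps directly onto the
_mm_min_epi16/_mm_max_epi16 pair in alf_calc_covariance_avx2. Below is a
minimal, self-contained scalar sketch of that equivalence; it is not part of
the patch, and clip_alf_scalar/clip_alf_minmax are illustrative names only.

#include <assert.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/* Same shape as kvazaar's CLIP(low, high, value) macro. */
#define CLIP(low, high, value) MAX((low), MIN((high), (value)))

/* Scalar form, as used by the generic strategy. */
static int16_t clip_alf_scalar(int16_t clip, int16_t ref, int16_t val0, int16_t val1)
{
  return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref);
}

/* Min/max form: one MIN against +clip and one MAX against -clip per tap.
 * This is the operation the SSE min/max pair performs, for four clipping
 * bins in parallel. */
static int16_t clip_alf_minmax(int16_t clip, int16_t ref, int16_t val0, int16_t val1)
{
  int16_t d0 = MAX(MIN(val0 - ref, clip), -clip);
  int16_t d1 = MAX(MIN(val1 - ref, clip), -clip);
  return d0 + d1;
}

int main(void)
{
  /* Check the equivalence over a representative range of clip strengths
   * and sample differences. */
  for (int clip = 1; clip <= 256; clip *= 4) {
    for (int v0 = -300; v0 <= 300; v0++) {
      for (int v1 = -300; v1 <= 300; v1 += 7) {
        assert(clip_alf_scalar((int16_t)clip, 100, (int16_t)v0, (int16_t)v1) ==
               clip_alf_minmax((int16_t)clip, 100, (int16_t)v0, (int16_t)v1));
      }
    }
  }
  return 0;
}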