diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj
index 2ac2e406..67ee5ac4 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj
@@ -170,6 +170,12 @@
+ <ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
+ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+ </ClCompile>
 <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
 <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
@@ -264,6 +270,7 @@
+ <ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h" />
diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
index 87f212e0..d547dfb4 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
@@ -263,6 +263,9 @@
 <Filter>Optimization\strategies\sse41</Filter>
 </ClCompile>
+ <ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
+ <Filter>Optimization\strategies\avx2</Filter>
+ </ClCompile>
@@ -491,6 +494,9 @@
 <Filter>Optimization\strategies\sse41</Filter>
 </ClInclude>
+ <ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h">
+ <Filter>Optimization\strategies\avx2</Filter>
+ </ClInclude>
diff --git a/src/Makefile.am b/src/Makefile.am
index af098c39..706dc9aa 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -196,7 +196,9 @@ libavx2_la_SOURCES = \
strategies/avx2/quant-avx2.h \
strategies/avx2/reg_sad_pow2_widths-avx2.h \
strategies/avx2/sao-avx2.c \
- strategies/avx2/sao-avx2.h
+ strategies/avx2/sao-avx2.h \
+ strategies/avx2/alf-avx2.c \
+ strategies/avx2/alf-avx2.h
# strategies/avx2/encode_coding_tree-avx2.c \
# strategies/avx2/encode_coding_tree-avx2.h
diff --git a/src/alf.c b/src/alf.c
index d591aaed..fa5d7aa3 100644
--- a/src/alf.c
+++ b/src/alf.c
@@ -4276,264 +4276,6 @@ static void alf_get_avai_aps_ids_luma(encoder_state_t * const state,
assert(*new_aps_id < (int)ALF_CTB_MAX_NUM_APS); //Wrong APS index assignment in getAvaiApsIdsLuma
}
-static void alf_calc_covariance(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
- const kvz_pixel *rec,
- const int stride,
- const channel_type channel,
- const int transpose_idx,
- int vb_distance,
- short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
-{
- static const int alf_pattern_5[13] = {
- 0,
- 1, 2, 3,
- 4, 5, 6, 5, 4,
- 3, 2, 1,
- 0
- };
-
- static const int alf_pattern_7[25] = {
- 0,
- 1, 2, 3,
- 4, 5, 6, 7, 8,
- 9, 10, 11, 12, 11, 10, 9,
- 8, 7, 6, 5, 4,
- 3, 2, 1,
- 0
- };
-
- int clip_top_row = -4;
- int clip_bot_row = 4;
- if (vb_distance >= -3 && vb_distance < 0)
- {
- clip_bot_row = -vb_distance - 1;
- clip_top_row = -clip_bot_row; // symmetric
- }
- else if (vb_distance >= 0 && vb_distance < 3)
- {
- clip_top_row = -vb_distance;
- clip_bot_row = -clip_top_row; // symmetric
- }
-
- const bool is_luma = channel == CHANNEL_TYPE_LUMA;
- const int *filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
- const int half_filter_length = (is_luma ? 7 : 5) >> 1;
- const short* clip = alf_clipping_values[channel];
- const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
-
- int k = 0;
-
- const int16_t curr = rec[0];
-
- if (transpose_idx == 0)
- {
- for (int i = -half_filter_length; i < 0; i++)
- {
- const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
- const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
- for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
- }
- }
- }
- for (int j = -half_filter_length; j < 0; j++, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
- }
- }
- }
- else if (transpose_idx == 1)
- {
- for (int j = -half_filter_length; j < 0; j++)
- {
- const kvz_pixel* rec0 = rec + j;
- const kvz_pixel* rec1 = rec - j;
-
- for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
- }
- }
- }
- for (int i = -half_filter_length; i < 0; i++, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
- }
- }
- }
- else if (transpose_idx == 2)
- {
- for (int i = -half_filter_length; i < 0; i++)
- {
- const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
- const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
-
- for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
- }
- }
- }
- for (int j = -half_filter_length; j < 0; j++, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
- }
- }
- }
- else
- {
- for (int j = -half_filter_length; j < 0; j++)
- {
- const kvz_pixel* rec0 = rec + j;
- const kvz_pixel* rec1 = rec - j;
-
- for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
- }
- }
- }
- for (int i = -half_filter_length; i < 0; i++, k++)
- {
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
- }
- }
- }
- for (int b = 0; b < num_bins; b++)
- {
- e_local[filter_pattern[k]][b] += curr;
- }
-}
-
-static void alf_get_blk_stats(encoder_state_t * const state,
- channel_type channel,
- alf_covariance *alf_covariance,
- alf_classifier **g_classifier,
- kvz_pixel *org,
- int32_t org_stride,
- kvz_pixel *rec,
- int32_t rec_stride,
- const int x_pos,
- const int y_pos,
- const int x_dst,
- const int y_dst,
- const int width,
- const int height,
- int vb_ctu_height,
- int vb_pos,
- short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
-{
- int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
-
- const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
-
- int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
- int transpose_idx = 0;
- int class_idx = 0;
-
- for (int i = 0; i < height; i++)
- {
- int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
- for (int j = 0; j < width; j++)
- {
- if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
- {
- continue;
- }
- memset(e_local, 0, sizeof(e_local));
- if (g_classifier)
- {
- alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
- transpose_idx = cl->transpose_idx;
- class_idx = cl->class_idx;
- }
-
- double weight = 1.0;
- if (0/*m_alfWSSD*/)
- {
- //weight = g_luma_level_to_weight_plut[org[j]];
- }
- int16_t y_local = org[j] - rec[j];
- alf_calc_covariance(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
- for (int k = 0; k < num_coeff; k++)
- {
- for (int l = k; l < num_coeff; l++)
- {
- for (int b0 = 0; b0 < num_bins; b0++)
- {
- for (int b1 = 0; b1 < num_bins; b1++)
- {
- if (0/*m_alfWSSD*/)
- {
- alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
- }
- else
- {
- alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
- }
- }
- }
- }
- for (int b = 0; b < num_bins; b++)
- {
- if (0/*m_alfWSSD*/)
- {
- alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local);
- }
- else
- {
- alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
- }
- }
- }
- if (0/*m_alfWSSD*/)
- {
- alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local);
- }
- else
- {
- alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
- }
- }
- org += org_stride;
- rec += rec_stride;
- }
-
- int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
- for (class_idx = 0; class_idx < num_classes; class_idx++)
- {
- for (int k = 1; k < num_coeff; k++)
- {
- for (int l = 0; l < k; l++)
- {
- for (int b0 = 0; b0 < num_bins; b0++)
- {
- for (int b1 = 0; b1 < num_bins; b1++)
- {
- alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
- }
- }
- }
- }
- }
-}
static void alf_derive_stats_for_filtering(encoder_state_t * const state,
short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
@@ -4619,7 +4361,7 @@ static void alf_derive_stats_for_filtering(encoder_state_t * const state,
const int num_classes = is_luma ? MAX_NUM_ALF_CLASSES : 1;
const int cov_index = ctu_rs_addr * num_classes;
- alf_get_blk_stats(state, ch_type,
+ kvz_alf_get_blk_stats(state, ch_type,
&alf_cov[cov_index],
comp_idx ? NULL : alf_info->classifier,
org, org_stride, rec, rec_stride, pos_x, pos_y, pos_x, pos_y, blk_w, blk_h,
diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c
new file mode 100644
index 00000000..70e5afe5
--- /dev/null
+++ b/src/strategies/avx2/alf-avx2.c
@@ -0,0 +1,393 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "global.h"
+
+#include "strategies/avx2/alf-avx2.h"
+
+#if COMPILE_INTEL_AVX2
+#include "kvazaar.h"
+#if KVZ_BIT_DEPTH == 8
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "strategyselector.h"
+
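+// Scalar form of the clipping that the SIMD code below performs four
+// clipping values at a time.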
+static int16_t clip_alf(const int16_t clip, const int16_t ref, const int16_t val0, const int16_t val1)
+{
+ return CLIP(-clip, +clip, val0 - ref) + CLIP(-clip, +clip, val1 - ref);
+}
+
+
+static void alf_calc_covariance_avx2(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
+ const kvz_pixel* rec,
+ const int stride,
+ const channel_type channel,
+ const int transpose_idx,
+ int vb_distance,
+ short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
+{
+ static const int alf_pattern_5[13] = {
+ 0,
+ 1, 2, 3,
+ 4, 5, 6, 5, 4,
+ 3, 2, 1,
+ 0
+ };
+
+ static const int alf_pattern_7[25] = {
+ 0,
+ 1, 2, 3,
+ 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4,
+ 3, 2, 1,
+ 0
+ };
+
+ int clip_top_row = -4;
+ int clip_bot_row = 4;
+ if (vb_distance >= -3 && vb_distance < 0)
+ {
+ clip_bot_row = -vb_distance - 1;
+ clip_top_row = -clip_bot_row; // symmetric
+ }
+ else if (vb_distance >= 0 && vb_distance < 3)
+ {
+ clip_top_row = -vb_distance;
+ clip_bot_row = -clip_top_row; // symmetric
+ }
+
+ const bool is_luma = channel == CHANNEL_TYPE_LUMA;
+ const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
+ const int half_filter_length = (is_luma ? 7 : 5) >> 1;
+ const short* clip = alf_clipping_values[channel];
+ const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
+
+ int k = 0;
+
+ const int16_t curr = rec[0];
+
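+ // Each row of e_local holds the four 16-bit clipping bins
+ // (MAX_ALF_NUM_CLIPPING_VALUES), so one 64-bit load/store covers a whole
+ // row. CLIP(-c, +c, x) is evaluated for all four clipping values at once
+ // with min/max; _mm_sign_epi16 with 'negate' yields the -clip lower bounds.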
+ const __m128i negate = _mm_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1);
+
+ if (transpose_idx == 0)
+ {
+ for (int i = -half_filter_length; i < 0; i++)
+ {
+ const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
+ const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
+ for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
+ {
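+ // Clip both symmetric taps against all four clipping values and
+ // accumulate the pairwise sums into the e_local row picked by the pattern.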
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec0[j] - curr));
+ __m128i val1 = _mm_set1_epi16((rec1[-j] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) &e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
+ }
+ }
+ for (int j = -half_filter_length; j < 0; j++, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec[j] - curr));
+ __m128i val1 = _mm_set1_epi16((rec[-j] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
+ }
+ }
+ else if (transpose_idx == 1)
+ {
+ for (int j = -half_filter_length; j < 0; j++)
+ {
+ const kvz_pixel* rec0 = rec + j;
+ const kvz_pixel* rec1 = rec - j;
+
+ for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr));
+ __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
+ }
+ }
+ for (int i = -half_filter_length; i < 0; i++, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr));
+ __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
+ }
+ }
+ else if (transpose_idx == 2)
+ {
+ for (int i = -half_filter_length; i < 0; i++)
+ {
+ const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
+ const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
+
+ for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec0[j] - curr));
+ __m128i val1 = _mm_set1_epi16((rec1[-j] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*) & e_local[filter_pattern[k]][0], result);
+ }
+ }
+ for (int j = -half_filter_length; j < 0; j++, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec[j] - curr));
+ __m128i val1 = _mm_set1_epi16((rec[-j] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
+ }
+ }
+ else
+ {
+ for (int j = -half_filter_length; j < 0; j++)
+ {
+ const kvz_pixel* rec0 = rec + j;
+ const kvz_pixel* rec1 = rec - j;
+
+ for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec0[MAX(i, clip_top_row) * stride] - curr));
+ __m128i val1 = _mm_set1_epi16((rec1[-MAX(i, -clip_bot_row) * stride] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
+ }
+ }
+ for (int i = -half_filter_length; i < 0; i++, k++)
+ {
+ __m128i clips = _mm_loadl_epi64((__m128i*) clip);
+ __m128i neg_clips = _mm_sign_epi16(clips, negate);
+ __m128i val0 = _mm_set1_epi16((rec[MAX(i, clip_top_row) * stride] - curr));
+ __m128i val1 = _mm_set1_epi16((rec[-MAX(i, -clip_bot_row) * stride] - curr));
+ __m128i min_clips_val0 = _mm_min_epi16(clips, val0);
+ __m128i max_clips_val0 = _mm_max_epi16(min_clips_val0, neg_clips);
+
+ __m128i min_clips_val1 = _mm_min_epi16(clips, val1);
+ __m128i max_clips_val1 = _mm_max_epi16(min_clips_val1, neg_clips);
+
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_add_epi16(max_clips_val0, max_clips_val1));
+ _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
+ }
+ }
+
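+ // Center tap: add curr to all four bins of the last pattern entry.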
+ __m128i e_local_original = _mm_loadl_epi64((__m128i*) & e_local[filter_pattern[k]][0]);
+ __m128i result = _mm_add_epi16(e_local_original, _mm_set1_epi16(curr));
+ _mm_storel_epi64((__m128i*)& e_local[filter_pattern[k]][0], result);
+
+}
+
+static void alf_get_blk_stats_avx2(encoder_state_t* const state,
+ channel_type channel,
+ alf_covariance* alf_covariance,
+ alf_classifier** g_classifier,
+ kvz_pixel* org,
+ int32_t org_stride,
+ kvz_pixel* rec,
+ int32_t rec_stride,
+ const int x_pos,
+ const int y_pos,
+ const int x_dst,
+ const int y_dst,
+ const int width,
+ const int height,
+ int vb_ctu_height,
+ int vb_pos,
+ short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
+{
+ int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
+
+ const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
+
+ int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
+ int transpose_idx = 0;
+ int class_idx = 0;
+
+ for (int i = 0; i < height; i++)
+ {
+ int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
+ for (int j = 0; j < width; j++)
+ {
+ if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
+ {
+ continue;
+ }
+ memset(e_local, 0, sizeof(e_local));
+ if (g_classifier)
+ {
+ alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
+ transpose_idx = cl->transpose_idx;
+ class_idx = cl->class_idx;
+ }
+
+ int16_t y_local = org[j] - rec[j];
+
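+ // Covariance sums are accumulated in double; broadcast the residual once.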
+ __m256d y_local_d = _mm256_set1_pd((double)y_local);
+ alf_calc_covariance_avx2(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
+ for (int k = 0; k < num_coeff; k++)
+ {
+ for (int l = k; l < num_coeff; l++)
+ {
+ for (int b0 = 0; b0 < 4; b0++)
+ {
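+ // Broadcast e_local[k][b0] and widen the four b1 bins of e_local[l]
+ // to double, so a single multiply replaces the whole b1 loop.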
+ __m256d e_local_b0_d = _mm256_set1_pd((double)e_local[k][b0]);
+ //for (int b1 = 0; b1 < 4; b1++)
+ {
+
+ __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]);
+ __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
+ __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
+
+ __m256d multiplied = _mm256_mul_pd(e_local_b0_d, e_local_b1_d);
+ double data[4];
+ _mm256_storeu_pd(data, multiplied); // unaligned store: 'data' has no 32-byte alignment guarantee
+
+ //alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
+ alf_covariance[class_idx].ee[b0][0][k][l] += data[0];
+ alf_covariance[class_idx].ee[b0][1][k][l] += data[1];
+ alf_covariance[class_idx].ee[b0][2][k][l] += data[2];
+ alf_covariance[class_idx].ee[b0][3][k][l] += data[3];
+ }
+ }
+ }
+ //for (int b = 0; b < 4; b++)
+ {
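+ // Same idea for the y vector: all four bins of e_local[k] times y_local
+ // in one multiply. The products are exact integers, so the round trip
+ // through _mm256_cvtpd_epi32 is lossless.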
+ __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]);
+ __m128i e_local_32 = _mm_cvtepi16_epi32(e_local_1);
+ __m256d e_local_b1_d = _mm256_cvtepi32_pd(e_local_32);
+
+ __m256d multiplied = _mm256_mul_pd(y_local_d, e_local_b1_d);
+
+ __m128i output = _mm256_cvtpd_epi32(multiplied);
+
+ int32_t data[4];
+ _mm_storeu_si128((__m128i*)data, output);
+ //alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
+ alf_covariance[class_idx].y[0][k] += data[0];
+ alf_covariance[class_idx].y[1][k] += data[1];
+ alf_covariance[class_idx].y[2][k] += data[2];
+ alf_covariance[class_idx].y[3][k] += data[3];
+ }
+ }
+ alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
+ }
+ org += org_stride;
+ rec += rec_stride;
+ }
+
+ int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
+ for (class_idx = 0; class_idx < num_classes; class_idx++)
+ {
+ for (int k = 1; k < num_coeff; k++)
+ {
+ for (int l = 0; l < k; l++)
+ {
+ for (int b0 = 0; b0 < 4; b0++)
+ {
+ for (int b1 = 0; b1 < 4; b1++)
+ {
+ alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif // KVZ_BIT_DEPTH == 8
+#endif //COMPILE_INTEL_AVX2
+
+
+int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth) {
+ bool success = true;
+#if COMPILE_INTEL_AVX2
+#if KVZ_BIT_DEPTH == 8
+ if (bitdepth == 8){
+ success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "avx2", 40, &alf_get_blk_stats_avx2);
+ }
+#endif // KVZ_BIT_DEPTH == 8
+#endif
+ return success;
+}
diff --git a/src/strategies/avx2/alf-avx2.h b/src/strategies/avx2/alf-avx2.h
new file mode 100644
index 00000000..a791b6f8
--- /dev/null
+++ b/src/strategies/avx2/alf-avx2.h
@@ -0,0 +1,32 @@
+#pragma once
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2021 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Optimizations for AVX2.
+ */
+
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+
+int kvz_strategy_register_alf_avx2(void* opaque, uint8_t bitdepth);
+
diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c
index d9aa3b95..e37acbb6 100644
--- a/src/strategies/generic/alf-generic.c
+++ b/src/strategies/generic/alf-generic.c
@@ -724,6 +724,269 @@ static void alf_filter_7x7_block_generic(encoder_state_t* const state,
alf_filter_block_generic(state, src_pixels, dst_pixels, src_stride, dst_stride, filter_set, fClipSet, clp_rng, COMPONENT_Y, width, height, x_pos, y_pos, blk_dst_x, blk_dst_y, vb_pos, vb_ctu_height);
}
+
+
+
+static void alf_calc_covariance_generic(int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES],
+ const kvz_pixel* rec,
+ const int stride,
+ const channel_type channel,
+ const int transpose_idx,
+ int vb_distance,
+ short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
+{
+ static const int alf_pattern_5[13] = {
+ 0,
+ 1, 2, 3,
+ 4, 5, 6, 5, 4,
+ 3, 2, 1,
+ 0
+ };
+
+ static const int alf_pattern_7[25] = {
+ 0,
+ 1, 2, 3,
+ 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 11, 10, 9,
+ 8, 7, 6, 5, 4,
+ 3, 2, 1,
+ 0
+ };
+
+ int clip_top_row = -4;
+ int clip_bot_row = 4;
+ if (vb_distance >= -3 && vb_distance < 0)
+ {
+ clip_bot_row = -vb_distance - 1;
+ clip_top_row = -clip_bot_row; // symmetric
+ }
+ else if (vb_distance >= 0 && vb_distance < 3)
+ {
+ clip_top_row = -vb_distance;
+ clip_bot_row = -clip_top_row; // symmetric
+ }
+
+ const bool is_luma = channel == CHANNEL_TYPE_LUMA;
+ const int* filter_pattern = is_luma ? alf_pattern_7 : alf_pattern_5;
+ const int half_filter_length = (is_luma ? 7 : 5) >> 1;
+ const short* clip = alf_clipping_values[channel];
+ const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
+
+ int k = 0;
+
+ const int16_t curr = rec[0];
+
+ if (transpose_idx == 0)
+ {
+ for (int i = -half_filter_length; i < 0; i++)
+ {
+ const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
+ const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
+ for (int j = -half_filter_length - i; j <= half_filter_length + i; j++, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
+ }
+ }
+ }
+ for (int j = -half_filter_length; j < 0; j++, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
+ }
+ }
+ }
+ else if (transpose_idx == 1)
+ {
+ for (int j = -half_filter_length; j < 0; j++)
+ {
+ const kvz_pixel* rec0 = rec + j;
+ const kvz_pixel* rec1 = rec - j;
+
+ for (int i = -half_filter_length - j; i <= half_filter_length + j; i++, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
+ }
+ }
+ }
+ for (int i = -half_filter_length; i < 0; i++, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
+ }
+ }
+ }
+ else if (transpose_idx == 2)
+ {
+ for (int i = -half_filter_length; i < 0; i++)
+ {
+ const kvz_pixel* rec0 = rec + MAX(i, clip_top_row) * stride;
+ const kvz_pixel* rec1 = rec - MAX(i, -clip_bot_row) * stride;
+
+ for (int j = half_filter_length + i; j >= -half_filter_length - i; j--, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[j], rec1[-j]);
+ }
+ }
+ }
+ for (int j = -half_filter_length; j < 0; j++, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[j], rec[-j]);
+ }
+ }
+ }
+ else
+ {
+ for (int j = -half_filter_length; j < 0; j++)
+ {
+ const kvz_pixel* rec0 = rec + j;
+ const kvz_pixel* rec1 = rec - j;
+
+ for (int i = half_filter_length + j; i >= -half_filter_length - j; i--, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec0[MAX(i, clip_top_row) * stride], rec1[-MAX(i, -clip_bot_row) * stride]);
+ }
+ }
+ }
+ for (int i = -half_filter_length; i < 0; i++, k++)
+ {
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += clip_alf(clip[b], curr, rec[MAX(i, clip_top_row) * stride], rec[-MAX(i, -clip_bot_row) * stride]);
+ }
+ }
+ }
+ for (int b = 0; b < num_bins; b++)
+ {
+ e_local[filter_pattern[k]][b] += curr;
+ }
+}
+
+static void alf_get_blk_stats_generic(encoder_state_t* const state,
+ channel_type channel,
+ alf_covariance* alf_covariance,
+ alf_classifier** g_classifier,
+ kvz_pixel* org,
+ int32_t org_stride,
+ kvz_pixel* rec,
+ int32_t rec_stride,
+ const int x_pos,
+ const int y_pos,
+ const int x_dst,
+ const int y_dst,
+ const int width,
+ const int height,
+ int vb_ctu_height,
+ int vb_pos,
+ short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES])
+{
+ int16_t e_local[MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES];
+
+ const int num_bins = MAX_ALF_NUM_CLIPPING_VALUES;
+
+ int num_coeff = channel == CHANNEL_TYPE_LUMA ? 13 : 7;
+ int transpose_idx = 0;
+ int class_idx = 0;
+
+ for (int i = 0; i < height; i++)
+ {
+ int vb_distance = ((y_dst + i) % vb_ctu_height) - vb_pos;
+ for (int j = 0; j < width; j++)
+ {
+ if (g_classifier && g_classifier[y_dst + i][x_dst + j].class_idx == ALF_UNUSED_CLASS_IDX && g_classifier[y_dst + i][x_dst + j].transpose_idx == ALF_UNUSED_TRANSPOSE_IDX)
+ {
+ continue;
+ }
+ memset(e_local, 0, sizeof(e_local));
+ if (g_classifier)
+ {
+ alf_classifier* cl = &g_classifier[y_dst + i][x_dst + j];
+ transpose_idx = cl->transpose_idx;
+ class_idx = cl->class_idx;
+ }
+
+ double weight = 1.0;
+ if (0/*m_alfWSSD*/)
+ {
+ //weight = g_luma_level_to_weight_plut[org[j]];
+ }
+ int16_t y_local = org[j] - rec[j];
+ alf_calc_covariance_generic(e_local, rec + j, rec_stride, channel, transpose_idx, vb_distance, alf_clipping_values);
+ for (int k = 0; k < num_coeff; k++)
+ {
+ for (int l = k; l < num_coeff; l++)
+ {
+ for (int b0 = 0; b0 < num_bins; b0++)
+ {
+ for (int b1 = 0; b1 < num_bins; b1++)
+ {
+ if (0/*m_alfWSSD*/)
+ {
+ alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]);
+ }
+ else
+ {
+ alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1];
+ }
+ }
+ }
+ }
+ for (int b = 0; b < num_bins; b++)
+ {
+ if (0/*m_alfWSSD*/)
+ {
+ alf_covariance[class_idx].y[b][k] += weight * (e_local[k][b] * (double)y_local);
+ }
+ else
+ {
+ alf_covariance[class_idx].y[b][k] += e_local[k][b] * (double)y_local;
+ }
+ }
+ }
+ if (0/*m_alfWSSD*/)
+ {
+ alf_covariance[class_idx].pix_acc += weight * (y_local * (double)y_local);
+ }
+ else
+ {
+ alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
+ }
+ }
+ org += org_stride;
+ rec += rec_stride;
+ }
+
+ int num_classes = g_classifier ? MAX_NUM_ALF_CLASSES : 1;
+ for (class_idx = 0; class_idx < num_classes; class_idx++)
+ {
+ for (int k = 1; k < num_coeff; k++)
+ {
+ for (int l = 0; l < k; l++)
+ {
+ for (int b0 = 0; b0 < num_bins; b0++)
+ {
+ for (int b1 = 0; b1 < num_bins; b1++)
+ {
+ alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k];
+ }
+ }
+ }
+ }
+ }
+}
+
+
int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth)
{
bool success = true;
@@ -731,6 +994,8 @@ int kvz_strategy_register_alf_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "alf_derive_classification_blk", "generic", 0, &alf_derive_classification_blk_generic);
success &= kvz_strategyselector_register(opaque, "alf_filter_5x5_blk", "generic", 0, &alf_filter_5x5_block_generic);
success &= kvz_strategyselector_register(opaque, "alf_filter_7x7_blk", "generic", 0, &alf_filter_7x7_block_generic);
+ success &= kvz_strategyselector_register(opaque, "alf_get_blk_stats", "generic", 0, &alf_get_blk_stats_generic);
+
return success;
}
diff --git a/src/strategies/strategies-alf.c b/src/strategies/strategies-alf.c
index d5d99936..666e8cb5 100644
--- a/src/strategies/strategies-alf.c
+++ b/src/strategies/strategies-alf.c
@@ -20,6 +20,7 @@
#include "strategies/strategies-alf.h"
#include "strategies/sse41/alf-sse41.h"
+#include "strategies/avx2/alf-avx2.h"
#include "strategies/generic/alf-generic.h"
#include "strategyselector.h"
@@ -28,6 +29,7 @@
alf_derive_classification_blk_func* kvz_alf_derive_classification_blk;
alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
+alf_get_blk_stats_func* kvz_alf_get_blk_stats;
int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
bool success = true;
@@ -37,6 +39,9 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth) {
if (kvz_g_hardware_flags.intel_flags.sse41) {
success &= kvz_strategy_register_alf_sse41(opaque, bitdepth);
}
+ if (kvz_g_hardware_flags.intel_flags.avx2) {
+ success &= kvz_strategy_register_alf_avx2(opaque, bitdepth);
+ }
return success;
}
\ No newline at end of file
diff --git a/src/strategies/strategies-alf.h b/src/strategies/strategies-alf.h
index 8ad15384..32a650f7 100644
--- a/src/strategies/strategies-alf.h
+++ b/src/strategies/strategies-alf.h
@@ -78,10 +78,29 @@ typedef void (alf_filter_7x7_blk_func)(encoder_state_t* const state,
int vb_pos,
const int vb_ctu_height);
+typedef void (alf_get_blk_stats_func)(encoder_state_t* const state,
+ channel_type channel,
+ alf_covariance* alf_covariance,
+ alf_classifier** g_classifier,
+ kvz_pixel* org,
+ int32_t org_stride,
+ kvz_pixel* rec,
+ int32_t rec_stride,
+ const int x_pos,
+ const int y_pos,
+ const int x_dst,
+ const int y_dst,
+ const int width,
+ const int height,
+ int vb_ctu_height,
+ int vb_pos,
+ short alf_clipping_values[MAX_NUM_CHANNEL_TYPE][MAX_ALF_NUM_CLIPPING_VALUES]);
+
// Declare function pointers.
extern alf_derive_classification_blk_func * kvz_alf_derive_classification_blk;
extern alf_filter_5x5_blk_func* kvz_alf_filter_5x5_blk;
extern alf_filter_7x7_blk_func* kvz_alf_filter_7x7_blk;
+extern alf_get_blk_stats_func* kvz_alf_get_blk_stats;
int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
@@ -90,5 +109,6 @@ int kvz_strategy_register_alf(void* opaque, uint8_t bitdepth);
{"alf_derive_classification_blk", (void**) &kvz_alf_derive_classification_blk}, \
{"alf_filter_5x5_blk", (void**) &kvz_alf_filter_5x5_blk}, \
{"alf_filter_7x7_blk", (void**) &kvz_alf_filter_7x7_blk}, \
+ {"alf_get_blk_stats", (void**) &kvz_alf_get_blk_stats}, \