From be9527cf1d1c4a7f219d3d7bf58077939826ae68 Mon Sep 17 00:00:00 2001 From: Marko Viitanen Date: Thu, 26 Aug 2021 11:07:13 +0300 Subject: [PATCH] [alf] Change the order of alf_covariance.ee values to get better optimized solution for alf_get_blk_stats_avx2() --- src/alf.c | 42 ++++++++++++++-------------- src/alf.h | 2 +- src/strategies/avx2/alf-avx2.c | 17 ++++------- src/strategies/generic/alf-generic.c | 6 ++-- 4 files changed, 30 insertions(+), 37 deletions(-) diff --git a/src/alf.c b/src/alf.c index fa5d7aa3..6f1a3939 100644 --- a/src/alf.c +++ b/src/alf.c @@ -119,7 +119,7 @@ static void get_clip_max(const alf_covariance *cov, int *clip_max) { for (int l = 0; inc && l < num_coeff; ++l) { - if (cov->ee[clip_max[k]][0][k][l] != cov->ee[clip_max[k] + 1][0][k][l]) + if (cov->ee[k][l][clip_max[k]][0] != cov->ee[k][l][clip_max[k] + 1][0]) { inc = false; } @@ -142,7 +142,7 @@ static void reduce_clip_cost(const alf_covariance *cov, int *clip) { for (int l = 0; dec && l < cov->num_coeff; ++l) { - if (cov->ee[clip[k]][clip[l]][k][l] != cov->ee[clip[k] - 1][clip[l]][k][l]) + if (cov->ee[k][l][clip[k]][clip[l]] != cov->ee[k][l][clip[k] - 1][clip[l]]) { dec = false; } @@ -162,7 +162,7 @@ static void set_ey_from_clip(const alf_covariance *cov, const int* clip, double y[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ee[k][l] = cov->ee[clip[k]][clip[l]][k][l]; + ee[k][l] = cov->ee[k][l][clip[k]][clip[l]]; } } } @@ -352,8 +352,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } gns_solve_by_chol(ke, ky, f, size); @@ -373,8 +373,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } gns_solve_by_chol(ke, ky, f, size); @@ -392,8 +392,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[k] = cov->y[clip[k]][k]; for (int l = 0; l < size; l++) { - ke[k][l] = cov->ee[clip[k]][clip[l]][k][l]; - ke[l][k] = cov->ee[clip[l]][clip[k]][l][k]; + ke[k][l] = cov->ee[k][l][clip[k]][clip[l]]; + ke[l][k] = cov->ee[l][k][clip[l]][clip[k]]; } } @@ -404,8 +404,8 @@ static double optimize_filter(const alf_covariance *cov, int* clip, double *f, b ky[idx_min] = cov->y[clip[idx_min]][idx_min]; for (int l = 0; l < size; l++) { - ke[idx_min][l] = cov->ee[clip[idx_min]][clip[l]][idx_min][l]; - ke[l][idx_min] = cov->ee[clip[l]][clip[idx_min]][l][idx_min]; + ke[idx_min][l] = cov->ee[idx_min][l][clip[idx_min]][clip[l]]; + ke[l][idx_min] = cov->ee[l][idx_min][clip[l]][clip[idx_min]]; } } else @@ -469,9 +469,9 @@ static double calc_error_for_coeffs(const alf_covariance *cov, const int *clip, double sum = 0; for (int j = i + 1; j < num_coeff; j++) { - sum += cov->ee[clip[i]][clip[j]][i][j] * coeff[j]; + sum += cov->ee[i][j][clip[i]][clip[j]] * coeff[j]; } - error += ((cov->ee[clip[i]][clip[i]][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i]; + error += ((cov->ee[i][i][clip[i]][clip[i]] * coeff[i] + sum * 2) / factor - 2 * cov->y[clip[i]][i]) * coeff[i]; } return error / factor; @@ -488,9 +488,9 @@ static double calc_error_for_cc_alf_coeffs(const alf_covariance *cov, const int1 for (int j = i + 1; j < num_coeff; j++) { // E[j][i] = E[i][j], sum will be multiplied by 2 later - sum += cov->ee[0][0][i][j] * coeff[j]; + sum += cov->ee[i][j][0][0] * coeff[j]; } - error += ((cov->ee[0][0][i][i] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; + error += ((cov->ee[i][i][0][0] * coeff[i] + sum * 2) / factor - 2 * cov->y[0][i]) * coeff[i]; } return error / factor; @@ -753,7 +753,7 @@ static void add_alf_cov(alf_covariance *dst, alf_covariance *src) { for (int i = 0; i < num_coeff; i++) { - dst->ee[b0][b1][j][i] += src->ee[b0][b1][j][i]; + dst->ee[j][i][b0][b1] += src->ee[j][i][b0][b1]; } } } @@ -780,7 +780,7 @@ static void add_alf_cov_lhs_rhs(alf_covariance *dst, alf_covariance *lhs, alf_co { for (int i = 0; i < num_coeff; i++) { - dst->ee[b0][b1][j][i] = lhs->ee[b0][b1][j][i] + rhs->ee[b0][b1][j][i]; + dst->ee[j][i][b0][b1] = lhs->ee[j][i][b0][b1] + rhs->ee[j][i][b0][b1]; } } } @@ -1972,7 +1972,7 @@ static void derive_cc_alf_filter_coeff(alf_covariance *alf_covariance_frame_cc_a ky[k] = alf_covariance_frame_cc_alf[filter_idx].y[0][k]; for (int l = 0; l < size; l++) { - k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[0][0][k][l]; + k_e[k][l] = alf_covariance_frame_cc_alf[filter_idx].ee[k][l][0][0]; } } @@ -2766,11 +2766,11 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { if (0 /*g_alf_wssd*/) { - alf_covariance->ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + alf_covariance->ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]); } else { - alf_covariance->ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance->ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; } } } @@ -2826,7 +2826,7 @@ static void get_blk_stats_cc_alf(encoder_state_t * const state, { for (int b1 = 0; b1 < num_bins; b1++) { - alf_covariance->ee[b0][b1][k][l] = alf_covariance->ee[b1][b0][l][k]; + alf_covariance->ee[k][l][b0][b1] = alf_covariance->ee[l][k][b1][b0]; } } } diff --git a/src/alf.h b/src/alf.h index 862b284d..793102ac 100644 --- a/src/alf.h +++ b/src/alf.h @@ -176,7 +176,7 @@ typedef enum { PACK( typedef struct alf_covariance { double pix_acc; - int64_t ee[MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF]; + int64_t ee[MAX_NUM_ALF_LUMA_COEFF][MAX_NUM_ALF_LUMA_COEFF][MAX_ALF_NUM_CLIPPING_VALUES][MAX_ALF_NUM_CLIPPING_VALUES]; int32_t y[MAX_ALF_NUM_CLIPPING_VALUES][MAX_NUM_ALF_LUMA_COEFF]; int num_coeff; int num_bins; diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 7fd625b4..91b79287 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -234,22 +234,15 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, __m256i e_local_b0_d = _mm256_set1_epi32((int32_t)e_local[k][b0]); /*for (int b1 = 0; b1 < 4; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; }*/ - //__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c) __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[l][0]); __m256i e_local_32 = _mm256_cvtepi16_epi64(e_local_1); __m256i multiplied = _mm256_mul_epi32(e_local_b0_d, e_local_32); - int64_t data[4]; - _mm256_storeu_si256((__m256i*)data, multiplied); - - - alf_covariance[class_idx].ee[b0][0][k][l] += data[0]; - alf_covariance[class_idx].ee[b0][1][k][l] += data[1]; - alf_covariance[class_idx].ee[b0][2][k][l] += data[2]; - alf_covariance[class_idx].ee[b0][3][k][l] += data[3]; - + __m256i orig = _mm256_lddqu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0]); + _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].ee[k][l][b0], _mm256_add_epi64(multiplied, orig)); + } } /* @@ -288,7 +281,7 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, { for (int b1 = 0; b1 < 4; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0]; } } } diff --git a/src/strategies/generic/alf-generic.c b/src/strategies/generic/alf-generic.c index e37acbb6..99def841 100644 --- a/src/strategies/generic/alf-generic.c +++ b/src/strategies/generic/alf-generic.c @@ -933,11 +933,11 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { if (0/*m_alfWSSD*/) { - alf_covariance[class_idx].ee[b0][b1][k][l] += weight * (e_local[k][b0] * (double)e_local[l][b1]); + alf_covariance[class_idx].ee[k][l][b0][b1] += weight * (e_local[k][b0] * (double)e_local[l][b1]); } else { - alf_covariance[class_idx].ee[b0][b1][k][l] += e_local[k][b0] * (double)e_local[l][b1]; + alf_covariance[class_idx].ee[k][l][b0][b1] += e_local[k][b0] * (double)e_local[l][b1]; } } } @@ -978,7 +978,7 @@ static void alf_get_blk_stats_generic(encoder_state_t* const state, { for (int b1 = 0; b1 < num_bins; b1++) { - alf_covariance[class_idx].ee[b0][b1][k][l] = alf_covariance[class_idx].ee[b1][b0][l][k]; + alf_covariance[class_idx].ee[k][l][b0][b1] = alf_covariance[class_idx].ee[l][k][b1][b0]; } } }