From 8e60bbf6a6536017f264d5829edbe9ed957adec9 Mon Sep 17 00:00:00 2001 From: Pauli Oikkonen Date: Tue, 18 Jun 2019 12:19:28 +0300 Subject: [PATCH] Slightly tune 16x16 forward DCT Use an array of __m256i's to store temporary value, essentially letting the compiler enforce alignment and use aligned loads and stores. --- src/strategies/avx2/dct-avx2.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 74430ab6..3e72104c 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -399,7 +399,7 @@ static void matrix_idct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t */ } -static void matmul_16x16_a_bt_t(const int16_t *a, const int16_t *b_t, int16_t *output, const int8_t shift) +static void matmul_16x16_a_bt_t(const int16_t *a, const int16_t *b_t, __m256i *output, const int8_t shift) { const int32_t add = 1 << (shift - 1); const __m256i debias = _mm256_set1_epi32(add); @@ -458,11 +458,11 @@ static void matmul_16x16_a_bt_t(const int16_t *a, const int16_t *b_t, int16_t *o results_32[fro] = truncate(res, debias, shift); } __m256i final_col = _mm256_packs_epi32(results_32[0], results_32[1]); - _mm256_storeu_si256((__m256i *)output + x, final_col); + output[x] = final_col; } } -static void matmul_16x16_a_bt(const int16_t *a, const int16_t *b_t, int16_t *output, const int8_t shift) +static void matmul_16x16_a_bt(const int16_t *a, const __m256i *b_t, int16_t *output, const int8_t shift) { const int32_t add = 1 << (shift - 1); const __m256i debias = _mm256_set1_epi32(add); @@ -473,14 +473,14 @@ static void matmul_16x16_a_bt(const int16_t *a, const int16_t *b_t, int16_t *out for (int32_t fco = 0; fco < 2; fco++) { // Read first cols 0, 1, 2, 3, 8, 9, 10, 11, and then next 4 - __m256i bt_c0 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 0); - __m256i bt_c1 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 1); - __m256i bt_c2 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 2); - __m256i bt_c3 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 3); - __m256i bt_c8 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 8); - __m256i bt_c9 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 9); - __m256i bt_c10 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 10); - __m256i bt_c11 = _mm256_loadu_si256((const __m256i *)b_t + fco * 4 + 11); + __m256i bt_c0 = b_t[fco * 4 + 0]; + __m256i bt_c1 = b_t[fco * 4 + 1]; + __m256i bt_c2 = b_t[fco * 4 + 2]; + __m256i bt_c3 = b_t[fco * 4 + 3]; + __m256i bt_c8 = b_t[fco * 4 + 8]; + __m256i bt_c9 = b_t[fco * 4 + 9]; + __m256i bt_c10 = b_t[fco * 4 + 10]; + __m256i bt_c11 = b_t[fco * 4 + 11]; __m256i p0 = _mm256_madd_epi16(a_r, bt_c0); __m256i p1 = _mm256_madd_epi16(a_r, bt_c1); @@ -601,7 +601,7 @@ static void matrix_dct_16x16_avx2(int8_t bitdepth, const int16_t *input, int16_t * in the second multiplication. */ - int16_t tmpres[16 * 16]; + __m256i tmpres[16]; matmul_16x16_a_bt_t(input, dct, tmpres, shift_1st); matmul_16x16_a_bt (dct, tmpres, output, shift_2nd); }