From 7ec7ab33610997e7ada48e522c5a8c852ad28c62 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen
Date: Thu, 23 May 2019 16:05:14 +0300
Subject: [PATCH] Implement a tailored AVX2 8x8 DCT

---
 src/strategies/avx2/dct-avx2.c | 132 ++++++++++++++++++++++++++++++++-
 1 file changed, 131 insertions(+), 1 deletion(-)

diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c
index 99dadd19..82c4c086 100644
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@@ -154,6 +154,133 @@ static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right,
   }
 }
 
+static INLINE __m256i swap_lanes(__m256i v)
+{
+  // Swap the low and high 128-bit lanes, i.e. the two 8x16b rows
+  return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2));
+}
+
+static INLINE __m256i truncate(__m256i v, __m256i debias, int32_t shift)
+{
+  // Add the rounding bias, then shift right arithmetically
+  __m256i truncable = _mm256_add_epi32(v, debias);
+  return _mm256_srai_epi32(truncable, shift);
+}
+
+static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *output)
+{
+  int32_t shift_1st = kvz_g_convert_to_bit[8] + 1 + (bitdepth - 8);
+  int32_t shift_2nd = kvz_g_convert_to_bit[8] + 8;
+
+  const int32_t add1 = 1 << (shift_1st - 1);
+  const __m256i debias1 = _mm256_set1_epi32(add1);
+
+  const int32_t add2 = 1 << (shift_2nd - 1);
+  const __m256i debias2 = _mm256_set1_epi32(add2);
+
+  const __m256i *dct = (const __m256i *)&(kvz_g_dct_8[0][0]);
+
+  // Keep the upper row intact and swap neighboring 16-bit words in the
+  // lower row
+  const __m256i shuf_lorow_mask =
+      _mm256_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
+                        8,  9, 10, 11, 12, 13, 14, 15,
+                       18, 19, 16, 17, 22, 23, 20, 21,
+                       26, 27, 24, 25, 30, 31, 28, 29);
+
+  __m256i tmpres[4];
+
+  // Dual rows: each YMM register holds two adjacent 8x16b rows
+  __m256i i_dr_0 = _mm256_loadu_si256((const __m256i *)input + 0);
+  __m256i i_dr_1 = _mm256_loadu_si256((const __m256i *)input + 1);
+  __m256i i_dr_2 = _mm256_loadu_si256((const __m256i *)input + 2);
+  __m256i i_dr_3 = _mm256_loadu_si256((const __m256i *)input + 3);
+
+  __m256i i_dr_0_swp = swap_lanes(i_dr_0);
+  __m256i i_dr_1_swp = swap_lanes(i_dr_1);
+  __m256i i_dr_2_swp = swap_lanes(i_dr_2);
+  __m256i i_dr_3_swp = swap_lanes(i_dr_3);
+
+  /*
+   * Multiply input by the transpose of the DCT matrix into tmpres, then
+   * multiply the DCT matrix by tmpres; the result is our output matrix.
+   *
+   * It's easier to implement an AVX2 matrix multiplication if you can
+   * multiply the left term with the transpose of the right term. Here
+   * things are stored row-wise, not column-wise, so we can effectively
+   * read DCT_T column-wise into YMM registers by reading DCT row-wise.
+   * Also because of this, the first multiplication is hacked to produce
+   * the transpose of the result instead, since it will be used in a
+   * similar fashion as the right operand in the second multiplication.
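+   *
+   * In matrix terms, with truncate() denoting the add-bias-and-shift
+   * rounding defined above, the two passes compute:
+   *
+   *   tmpres = truncate((input * DCT^T)^T) = truncate(DCT * input^T)
+   *   output = truncate(DCT * tmpres^T)    = truncate(DCT * input * DCT^T)
+   *
+   * which is the standard row-column 2-D DCT with intermediate rounding.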
+   */
+  for (int dry = 0; dry < 4; dry++) {
+    // Read columns of the DCT matrix's transpose by reading rows of the
+    // DCT matrix itself
+    __m256i d_dr = _mm256_loadu_si256(dct + dry);
+
+    __m256i prod0 = _mm256_madd_epi16(d_dr, i_dr_0);
+    __m256i prod0_swp = _mm256_madd_epi16(d_dr, i_dr_0_swp);
+    __m256i prod1 = _mm256_madd_epi16(d_dr, i_dr_1);
+    __m256i prod1_swp = _mm256_madd_epi16(d_dr, i_dr_1_swp);
+    __m256i prod2 = _mm256_madd_epi16(d_dr, i_dr_2);
+    __m256i prod2_swp = _mm256_madd_epi16(d_dr, i_dr_2_swp);
+    __m256i prod3 = _mm256_madd_epi16(d_dr, i_dr_3);
+    __m256i prod3_swp = _mm256_madd_epi16(d_dr, i_dr_3_swp);
+
+    __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp);
+    __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp);
+    __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp);
+    __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp);
+
+    __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1);
+    __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3);
+
+    __m256i hsum2c_0_tr = truncate(hsum2c_0, debias1, shift_1st);
+    __m256i hsum2c_1_tr = truncate(hsum2c_1, debias1, shift_1st);
+
+    __m256i tmp_dr = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr);
+
+    tmpres[dry] = _mm256_shuffle_epi8(tmp_dr, shuf_lorow_mask);
+  }
+
+  __m256i t_dr_0 = tmpres[0];
+  __m256i t_dr_1 = tmpres[1];
+  __m256i t_dr_2 = tmpres[2];
+  __m256i t_dr_3 = tmpres[3];
+
+  __m256i t_dr_0_swp = swap_lanes(t_dr_0);
+  __m256i t_dr_1_swp = swap_lanes(t_dr_1);
+  __m256i t_dr_2_swp = swap_lanes(t_dr_2);
+  __m256i t_dr_3_swp = swap_lanes(t_dr_3);
+
+  for (int dry = 0; dry < 4; dry++) {
+    __m256i d_dr = _mm256_loadu_si256(dct + dry);
+
+    __m256i prod0 = _mm256_madd_epi16(d_dr, t_dr_0);
+    __m256i prod0_swp = _mm256_madd_epi16(d_dr, t_dr_0_swp);
+    __m256i prod1 = _mm256_madd_epi16(d_dr, t_dr_1);
+    __m256i prod1_swp = _mm256_madd_epi16(d_dr, t_dr_1_swp);
+    __m256i prod2 = _mm256_madd_epi16(d_dr, t_dr_2);
+    __m256i prod2_swp = _mm256_madd_epi16(d_dr, t_dr_2_swp);
+    __m256i prod3 = _mm256_madd_epi16(d_dr, t_dr_3);
+    __m256i prod3_swp = _mm256_madd_epi16(d_dr, t_dr_3_swp);
+
+    __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp);
+    __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp);
+    __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp);
+    __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp);
+
+    __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1);
+    __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3);
+
+    __m256i hsum2c_0_tr = truncate(hsum2c_0, debias2, shift_2nd);
+    __m256i hsum2c_1_tr = truncate(hsum2c_1, debias2, shift_2nd);
+
+    __m256i tmp_dr = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr);
+
+    __m256i final_dr = _mm256_shuffle_epi8(tmp_dr, shuf_lorow_mask);
+
+    _mm256_storeu_si256(((__m256i *)output) + dry, final_dr);
+  }
+}
+
 // 16x16 matrix multiplication with value clipping.
 // Parameters: Two 16x16 matrices containing 16-bit values in consecutive addresses,
 // destination for the result and the shift value for clipping.
@@ -340,7 +467,10 @@ static void matrix_i ## type ## _## n ## x ## n ## _avx2(int8_t bitdepth, const
 // Generate all the transform functions
 TRANSFORM(dst, 4);
 TRANSFORM(dct, 4);
-TRANSFORM(dct, 8);
+
+// Ha, we've got a tailored implementation for this
+// TRANSFORM(dct, 8);
+
 TRANSFORM(dct, 16);
 TRANSFORM(dct, 32);
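
For reference, the arithmetic that the vectorized function implements can be
written as a plain scalar routine. The sketch below is illustrative only and
not part of the patch: it assumes kvz_g_dct_8 is the 8x8 forward DCT
coefficient matrix used above, and the helper name dct_8x8_scalar_ref is
hypothetical. Note that the AVX2 code saturates when packing the 32-bit sums
back to 16 bits (_mm256_packs_epi32), while this sketch assumes the shifted
values already fit in int16_t.

static void dct_8x8_scalar_ref(int8_t bitdepth, const int16_t *input, int16_t *output)
{
  const int32_t shift_1st = kvz_g_convert_to_bit[8] + 1 + (bitdepth - 8);
  const int32_t shift_2nd = kvz_g_convert_to_bit[8] + 8;
  int16_t tmp[8 * 8];

  // First pass: tmp[d][i] = truncate(dot(DCT row d, input row i)),
  // i.e. tmp = truncate(DCT * input^T). Accumulation is 32-bit, matching
  // the madd/hadd arithmetic of the AVX2 path.
  for (int d = 0; d < 8; d++) {
    for (int i = 0; i < 8; i++) {
      int32_t acc = 0;
      for (int k = 0; k < 8; k++)
        acc += (int32_t)kvz_g_dct_8[d][k] * (int32_t)input[i * 8 + k];
      tmp[d * 8 + i] = (int16_t)((acc + (1 << (shift_1st - 1))) >> shift_1st);
    }
  }

  // Second pass: output[d][t] = truncate(dot(DCT row d, tmp row t)),
  // i.e. output = truncate(DCT * input * DCT^T)
  for (int d = 0; d < 8; d++) {
    for (int t = 0; t < 8; t++) {
      int32_t acc = 0;
      for (int k = 0; k < 8; k++)
        acc += (int32_t)kvz_g_dct_8[d][k] * (int32_t)tmp[t * 8 + k];
      output[d * 8 + t] = (int16_t)((acc + (1 << (shift_2nd - 1))) >> shift_2nd);
    }
  }
}

Comparing this reference element-by-element against matrix_dct_8x8_avx2 in a
small test harness is a straightforward way to sanity-check the lane-swap,
hadd ordering, and the shuf_lorow_mask fixup.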