From 13d4313e02562d6ee53bfb2a8cb906baf7879bb3 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 26 Jul 2023 14:05:04 +0300 Subject: [PATCH] [avx2] Mostly working --- src/strategies/avx2/dct-avx2.c | 41 +++++++-- src/strategies/avx2/dct_avx2_tables.h | 121 ++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 6 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 71361feb..d82d6415 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -2174,6 +2174,9 @@ void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t const int32_t shift_2nd = INVERSE_SHIFT_2ND; const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_8x2_coeff_hor; + } const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -2359,6 +2362,9 @@ void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int32_t shift_2nd = INVERSE_SHIFT_2ND; const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename + if (ver == DST7) { + ver_coeff = fi_dst7_16x2_coeff_hor; + } const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size @@ -3607,7 +3613,7 @@ static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const _mm256_store_si256((__m256i*)dst, v_result); } -void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type) +void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) { const int width = 8; const int height = 2; @@ -3620,6 +3626,9 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_ty const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x8_coeff_ver; + } // Only dct2 transform is defined for this block size __m256i v_ver_pass_out; @@ -4810,6 +4819,9 @@ void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename + if (hor == DST7) { + hor_coeff = fi_dst7_2x16_coeff_ver; + } // DST7 and DCT8 are not defined for this block size __m256i v_ver_pass_out[2]; @@ -5455,7 +5467,7 @@ static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, con // dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); //} - for (int j = 0; j < 16; ++j) { + for (int j = 0; j < line; ++j) { __m256i res_0 = _mm256_setzero_si256(); __m256i res_1 = _mm256_setzero_si256(); @@ -7219,7 +7231,7 @@ void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, } __m256i v_ver_pass_out[16]; - if(ver == DCT2) { + if(ver == DCT2 || hor == DCT2) { fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); } else { @@ -8128,9 +8140,26 @@ static void mts_idct_avx2( else { const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - - dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; - idct_func(input, output, type_hor, type_ver); + if (height == 1) { + if (width == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); + _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); + } else if (width == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0); + } + } + else if (width == 1){ + if (height == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0); + _mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0))); + } else if (height == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0); + } + } + else { + dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; + idct_func(input, output, type_hor, type_ver); + } } } diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h index f56cb2cc..946ab6b8 100644 --- a/src/strategies/avx2/dct_avx2_tables.h +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -818,6 +818,41 @@ ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, }; +ALIGNED(32) const int16_t fi_dst7_2x16_coeff_ver[512] = { + 8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85, + 88, 77, 55, 25, 88, 77, 55, 25, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, -88, -68, -25, 25, -88, -68, -25, 25, + 48, 0, -48, -81, 48, 0, -48, -81, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 17, 73, 88, 55, 17, 73, 88, 55, +-25, -77, -87, -48, -25, -77, -87, -48, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 87, 33, -48, -88, 87, 33, -48, -88, // 8 +-81, -77, -8, 68, -81, -77, -8, 68, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, -25, -88, -48, 48, -25, -88, -48, 48, +-81, 0, 81, 68, -81, 0, 81, 68, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, -85, 8, 88, 33, -85, 8, 88, 33, +-25, 77, 62, -48, -25, 77, 62, -48, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 33, 85, -25, -87, 33, 85, -25, -87, + 48, 77, -40, -81, 48, 77, -40, -81, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 81, -48, -68, 68, 81, -48, -68, 68, // 16 + 88, 0, -88, 25, 88, 0, -88, 25, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, -40, -62, 81, 8, -40, -62, 81, 8, + 68, -77, -17, 88, 68, -77, -17, 88, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, -77, 77, 0, -77, -77, 77, 0, -77, + 0, -77, 77, 0, 0, -77, 77, 0, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 48, 25, -81, 81, 48, 25, -81, 81, +-68, 0, 68, -88, -68, 0, 68, -88, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 73, -88, 68, -17, 73, -88, 68, -17, // 24 +-88, 77, -33, -25, -88, 77, -33, -25, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, -55, 17, 25, -62, -55, 17, 25, -62, +-48, 77, -88, 81, -48, 77, -88, 81, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, -68, 81, -88, 88, -68, 81, -88, 88, + 25, 0, -25, 48, 25, 0, -25, 48, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 62, -55, 48, -40, 62, -55, 48, -40, + 81, -77, 73, -68, 81, -77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8, +}; + ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = { 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, @@ -1346,6 +1381,17 @@ ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = { 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18, }; +ALIGNED(32) const int16_t fi_dst7_8x2_coeff_hor[128] = { + 17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32, + 32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60, + 46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78, + 60, 71, -46, -78, 32, 85, -17, -86, 60, 71, -46, -78, 32, 85, -17, -86, + 71, 32, -86, 17, 78, -60, -46, 85, 71, 32, -86, 17, 78, -60, -46, 85, + 78, -17, -60, 86, -46, -32, 85, -71, 78, -17, -60, 86, -46, -32, 85, -71, + 85, -60, 17, 32, -71, 86, -78, 46, 85, -60, 17, 32, -71, 86, -78, 46, + 86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17, +}; + const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table @@ -2381,6 +2427,43 @@ ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = { const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + +ALIGNED(32) const int16_t fi_dst7_16x2_coeff_hor[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + + ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, @@ -2881,6 +2964,44 @@ ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = { -25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, }; +ALIGNED(32) const int16_t fi_dct2_16x1_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_16x1_coeff_hor[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver;