From 4dccbcc30d8989effde34c6311e15c637313d43a Mon Sep 17 00:00:00 2001
From: siivonek
Date: Mon, 24 Jul 2023 15:32:53 +0300
Subject: [PATCH] [avx2] Forward transforms seem to be working

---
 src/strategies/avx2/dct-avx2.c        | 6831 ++++++++++++++++++++++++-
 src/strategies/avx2/dct_avx2_tables.h | 4785 +++++++++++++++++
 2 files changed, 11441 insertions(+), 175 deletions(-)
 create mode 100644 src/strategies/avx2/dct_avx2_tables.h

diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c
index bb8a92bc..f875a581 100644
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@@ -56,6 +56,11 @@ extern const int16_t uvg_g_dct_32_t[32][32];
 #include "uvg266.h"
 #if UVG_BIT_DEPTH == 8
 #include <immintrin.h>
+#include "strategies/avx2/dct_avx2_tables.h"
+#define MAX_LOG2_TR_DYNAMIC_RANGE 15
+#define TRANSFORM_MATRIX_SHIFT 6
+#define INVERSE_SHIFT_1ST (TRANSFORM_MATRIX_SHIFT + 1)
+#define INVERSE_SHIFT_2ND (TRANSFORM_MATRIX_SHIFT + MAX_LOG2_TR_DYNAMIC_RANGE - 1 - UVG_BIT_DEPTH)

 /*
 * \file
@@ -73,6 +78,583 @@ static INLINE __m256i truncate_avx2(__m256i v, __m256i debias, int32_t shift)
   return _mm256_srai_epi32(truncable, shift);
 }

+
+// TODO: find an AVX2 solution for the transpose
+// TODO: attempt to make a generic transpose for AVX2. Needs some extra logic for different widths and heights.
+// TODO: make a few solutions for exact sizes and see if some pattern emerges...
+void transpose_matrix(const int16_t* src, int16_t* dst, const int width, const int height) {
+  const int sample_num = width * height;
+  const int vectors = sample_num / 16;
+
+  if (vectors == 0) {
+    return;
+  }
+  else if (vectors == 1) {
+    // TODO: single-vector case not implemented yet
+  }
+  else {
+    // Reserve enough storage for max transform size 32x32
+    __m256i v_16b_result[64];
+    __m256i v_32b_result[64];
+    __m256i v_64b_result[64];
+    __m256i v_128b_result[64];
+
+    // Handle two source vectors at a time
+    for (int i = 0; i < vectors; i += 2) {
+      __m256i v_src_0 = _mm256_load_si256((const __m256i*)src);
+      __m256i v_src_1 = _mm256_load_si256((const __m256i*)(src + 16));
+
+      v_16b_result[i] = _mm256_unpacklo_epi16(v_src_0, v_src_1);
+      v_16b_result[i + 1] = _mm256_unpackhi_epi16(v_src_0, v_src_1);
+
+      src += 32;
+    }
+
+    // 32 bit shuffle pass
+    int loop_idx = 0;
+    for (int i = 0; i < vectors; i += 2) {
+      const int idx_a = loop_idx;
+      const int idx_b = loop_idx + 2;
+
+      v_32b_result[i] = _mm256_unpacklo_epi32(v_16b_result[idx_a], v_16b_result[idx_b]);
+      v_32b_result[i + 1] = _mm256_unpackhi_epi32(v_16b_result[idx_a], v_16b_result[idx_b]);
+      loop_idx++;
+    }
+
+    // 64 bit shuffle pass
+    loop_idx = 0;
+    for (int i = 0; i < vectors; i += 2) {
+      const int idx_a = loop_idx;
+      const int idx_b = loop_idx + 4;
+
+      v_64b_result[i] = _mm256_unpacklo_epi64(v_32b_result[idx_a], v_32b_result[idx_b]);
+      v_64b_result[i + 1] = _mm256_unpackhi_epi64(v_32b_result[idx_a], v_32b_result[idx_b]);
+      loop_idx++;
+    }
+
+    // Final 128 bit shuffle pass. The index stride doubles each pass (2, 4, 8).
+    loop_idx = 0;
+    for (int i = 0; i < vectors; i += 2) {
+      const int idx_a = loop_idx;
+      const int idx_b = loop_idx + 8;
+
+      v_128b_result[i] = _mm256_permute2x128_si256(v_64b_result[idx_a], v_64b_result[idx_b], 0x20);
+      v_128b_result[i + 1] = _mm256_permute2x128_si256(v_64b_result[idx_a], v_64b_result[idx_b], 0x31);
+      loop_idx++;
+    }
+
+    // Store loop
+    for (int i = 0; i < vectors; ++i) {
+      _mm256_store_si256((__m256i*)dst, v_128b_result[i]);
+      dst += 16;
+    }
+  }
+}
+
+void transpose_generic(const int16_t* src, int16_t* dst, const int width, const int height)
+{
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      dst[x * height + y] = src[y * width + x];
+    }
+  }
+}
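+
+// A minimal self-check sketch (illustration only, not part of the patch): any
+// per-size AVX2 transpose dispatched by transpose_avx2() further below can be
+// validated against transpose_generic() on the same block. The
+// TRANSPOSE_SELF_TEST guard and the helper name are hypothetical; assumes
+// <string.h> for memcmp, buffers sized for the 32x32 maximum used above, and
+// is only meaningful for sizes whose table entry is actually implemented.
+#ifdef TRANSPOSE_SELF_TEST
+#include <string.h>
+void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height);
+
+static int transpose_matches_generic(const int16_t* src, const int width, const int height)
+{
+  ALIGNED(32) int16_t ref[32 * 32];
+  ALIGNED(32) int16_t out[32 * 32];
+  transpose_generic(src, ref, width, height);
+  transpose_avx2((const __m256i*)src, (__m256i*)out, width, height);
+  return memcmp(ref, out, width * height * sizeof(int16_t)) == 0;
+}
+#endif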
+ + +typedef void (transpose_func)(const __m256i* src, __m256i* dst); + + +static void transpose_2x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x16_avx2(const __m256i* src, __m256i* dst){} +static void transpose_2x32_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_2x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_4x16_avx2(const __m256i* src, __m256i* dst) +{ + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + + // const __m256i v_shuffle = _mm256_set_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + // 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + + __m256i v_src_tmp[4]; + v_src_tmp[0] = _mm256_shuffle_epi8(src[0], v_shuffle); + v_src_tmp[1] = _mm256_shuffle_epi8(src[1], v_shuffle); + v_src_tmp[2] = _mm256_shuffle_epi8(src[2], v_shuffle); + v_src_tmp[3] = _mm256_shuffle_epi8(src[3], v_shuffle); + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + dst[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); +} +static void transpose_4x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp[8]; + const __m256i v_shuffle = 
_mm256_load_si256((const __m256i*)shuffle_16b_0415); + for (int i = 0; i < 8; ++i) { + v_tmp[i] = _mm256_shuffle_epi8(src[i], v_shuffle); + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[i] = _mm256_shuffle_epi32(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp[6], v_tmp[7]); + + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp[0], v_tmp[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp[2], v_tmp[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp[4], v_tmp[5]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp[6], v_tmp[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); +} +static void transpose_4x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x4_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x8_avx2(const __m256i* src, __m256i* dst){} +static void transpose_8x16_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + __m256i v_tmp128[8]; + + v_tmp128[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_tmp128[1] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_tmp128[2] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_tmp128[3] = _mm256_permute2x128_si256(src[1], src[5], 0x31); + v_tmp128[4] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_tmp128[5] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + v_tmp128[6] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_tmp128[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(v_tmp128[6], v_tmp128[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(v_tmp128[0], v_tmp128[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(v_tmp128[2], v_tmp128[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(v_tmp128[4], v_tmp128[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(v_tmp128[6], v_tmp128[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = 
_mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + dst[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[1] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + dst[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[3] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + dst[4] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[5] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + dst[6] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + dst[7] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); +} +static void transpose_8x32_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + + const __m256i v_shuffle = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31); + for (int i = 0; i < 8; ++i) { + const int offset = i * 2; + v_tmp16_lo[i] = _mm256_unpacklo_epi16(src[offset], src[offset + 1]); + v_tmp16_hi[i] = _mm256_unpackhi_epi16(src[offset], src[offset + 1]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp32_lo[i + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_lo[i + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_lo[i + 2] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_lo[i + 3] = _mm256_unpacklo_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + + v_tmp32_hi[i + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 0], v_tmp16_lo[i + 1]); + v_tmp32_hi[i + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[i + 2], v_tmp16_lo[i + 3]); + v_tmp32_hi[i + 2] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 0], v_tmp16_hi[i + 1]); + v_tmp32_hi[i + 3] = _mm256_unpackhi_epi32(v_tmp16_hi[i + 2], v_tmp16_hi[i + 3]); + } + + for (int i = 0; i < 8; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 1]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 2], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 1]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 2], v_tmp32_hi[i + 3]); + } + + for (int i = 0; i < 8; ++i) { + v_tmp64_lo[i] = _mm256_permute4x64_epi64(v_tmp64_lo[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp64_hi[i] = _mm256_permute4x64_epi64(v_tmp64_hi[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + dst[0] = _mm256_shuffle_epi8(v_tmp64_lo[0], v_shuffle); + dst[1] = _mm256_shuffle_epi8(v_tmp64_lo[4], v_shuffle); + dst[2] = _mm256_shuffle_epi8(v_tmp64_hi[0], v_shuffle); + dst[3] = _mm256_shuffle_epi8(v_tmp64_hi[4], v_shuffle); + + dst[4] = _mm256_shuffle_epi8(v_tmp64_lo[2], v_shuffle); + dst[5] = _mm256_shuffle_epi8(v_tmp64_lo[6], v_shuffle); + dst[6] = _mm256_shuffle_epi8(v_tmp64_hi[2], v_shuffle); + dst[7] = _mm256_shuffle_epi8(v_tmp64_hi[6], v_shuffle); + + dst[8] = _mm256_shuffle_epi8(v_tmp64_lo[1], v_shuffle); + dst[9] = _mm256_shuffle_epi8(v_tmp64_lo[5], v_shuffle); + dst[10] = _mm256_shuffle_epi8(v_tmp64_hi[1], v_shuffle); + dst[11] = _mm256_shuffle_epi8(v_tmp64_hi[5], v_shuffle); + + dst[12] = _mm256_shuffle_epi8(v_tmp64_lo[3], v_shuffle); + 
dst[13] = _mm256_shuffle_epi8(v_tmp64_lo[7], v_shuffle); + dst[14] = _mm256_shuffle_epi8(v_tmp64_hi[3], v_shuffle); + dst[15] = _mm256_shuffle_epi8(v_tmp64_hi[7], v_shuffle); +} +static void transpose_8x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x2_avx2(const __m256i* src, __m256i* dst){} +static void transpose_16x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + __m256i v_tmp32_lo[2]; + __m256i v_tmp32_hi[2]; + + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); +} +static void transpose_16x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + __m256i v_tmp64_lo[4]; + __m256i v_tmp64_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[2], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[5]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[6], src[7]); + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[2], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[5]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[6], src[7]); + + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[1]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[2], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[1]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[2], v_tmp16_hi[3]); + + v_tmp64_lo[0] = _mm256_unpacklo_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_lo[1] = _mm256_unpacklo_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_lo[2] = _mm256_unpacklo_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_lo[3] = _mm256_unpacklo_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + v_tmp64_hi[0] = _mm256_unpackhi_epi64(v_tmp32_lo[0], v_tmp32_lo[1]); + v_tmp64_hi[1] = _mm256_unpackhi_epi64(v_tmp32_lo[2], v_tmp32_lo[3]); + v_tmp64_hi[2] = _mm256_unpackhi_epi64(v_tmp32_hi[0], v_tmp32_hi[1]); + v_tmp64_hi[3] = _mm256_unpackhi_epi64(v_tmp32_hi[2], v_tmp32_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[2], 
v_tmp64_hi[2], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); +} + +static void transpose_16x16_avx2_stride(const int16_t* src, int16_t* dst, const int src_stride, const int dst_stride) { + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(*(__m256i*)(src + s * src_stride), *(__m256i*)(src + (s + 1) * src_stride)); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 2) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 1]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 1]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 8; d += 4, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + + v_tmp64_lo[d + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + v_tmp64_hi[d + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + _mm256_storeu_si256((__m256i*)(dst + 0 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 1 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 4 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 5 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 6 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x20)); + _mm256_storeu_si256((__m256i*)(dst + 7 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x20)); + + _mm256_storeu_si256((__m256i*)(dst + 8 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 9 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[4], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 10 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 11 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[6], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 12 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_lo[5], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 13 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[1], v_tmp64_hi[5], 0x31)); + 
_mm256_storeu_si256((__m256i*)(dst + 14 * dst_stride), _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_lo[7], 0x31)); + _mm256_storeu_si256((__m256i*)(dst + 15 * dst_stride), _mm256_permute2x128_si256(v_tmp64_hi[3], v_tmp64_hi[7], 0x31)); +} + +static void transpose_16x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 16); +} + +static void transpose_16x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t*)dst, 16, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 16, (int16_t*)dst + 16, 16, 32); + +} +static void transpose_16x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_32x2_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo0 = _mm256_unpacklo_epi16(src[0], src[2]); + __m256i v_tmp16_lo1 = _mm256_unpacklo_epi16(src[1], src[3]); + __m256i v_tmp16_hi0 = _mm256_unpackhi_epi16(src[0], src[2]); + __m256i v_tmp16_hi1 = _mm256_unpackhi_epi16(src[1], src[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp16_lo0, v_tmp16_hi0, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp16_lo1, v_tmp16_hi1, 0x31); +} +static void transpose_32x4_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[4]; + __m256i v_tmp16_hi[4]; + v_tmp16_lo[0] = _mm256_unpacklo_epi16(src[0], src[2]); + v_tmp16_lo[1] = _mm256_unpacklo_epi16(src[1], src[3]); + v_tmp16_lo[2] = _mm256_unpacklo_epi16(src[4], src[6]); + v_tmp16_lo[3] = _mm256_unpacklo_epi16(src[5], src[7]); + + v_tmp16_hi[0] = _mm256_unpackhi_epi16(src[0], src[2]); + v_tmp16_hi[1] = _mm256_unpackhi_epi16(src[1], src[3]); + v_tmp16_hi[2] = _mm256_unpackhi_epi16(src[4], src[6]); + v_tmp16_hi[3] = _mm256_unpackhi_epi16(src[5], src[7]); + + __m256i v_tmp32_lo[4]; + __m256i v_tmp32_hi[4]; + v_tmp32_lo[0] = _mm256_unpacklo_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_lo[1] = _mm256_unpacklo_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_lo[2] = _mm256_unpacklo_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_lo[3] = _mm256_unpacklo_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + v_tmp32_hi[0] = _mm256_unpackhi_epi32(v_tmp16_lo[0], v_tmp16_lo[2]); + v_tmp32_hi[1] = _mm256_unpackhi_epi32(v_tmp16_lo[1], v_tmp16_lo[3]); + v_tmp32_hi[2] = _mm256_unpackhi_epi32(v_tmp16_hi[0], v_tmp16_hi[2]); + v_tmp32_hi[3] = _mm256_unpackhi_epi32(v_tmp16_hi[1], v_tmp16_hi[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp32_lo[0], v_tmp32_hi[0], 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp32_lo[2], v_tmp32_hi[2], 0x31); + + dst[4] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x20); + dst[5] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x20); + dst[6] = _mm256_permute2x128_si256(v_tmp32_lo[1], v_tmp32_hi[1], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp32_lo[3], v_tmp32_hi[3], 0x31); +} +static void transpose_32x8_avx2(const __m256i* src, __m256i* dst) +{ + __m256i v_tmp16_lo[8]; + __m256i v_tmp16_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_tmp16_lo[d + 0] = _mm256_unpacklo_epi16(src[s + 0], src[s + 2]); + v_tmp16_lo[d + 1] = _mm256_unpacklo_epi16(src[s + 1], src[s + 3]); + + v_tmp16_hi[d + 0] = _mm256_unpackhi_epi16(src[s + 0], src[s + 2]); + v_tmp16_hi[d + 1] = _mm256_unpackhi_epi16(src[s + 1], src[s 
+ 3]); + } + + __m256i v_tmp32_lo[8]; + __m256i v_tmp32_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_lo[d + 4] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_lo[d + 5] = _mm256_unpacklo_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 0], v_tmp16_lo[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(v_tmp16_lo[s + 1], v_tmp16_lo[s + 3]); + v_tmp32_hi[d + 4] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 0], v_tmp16_hi[s + 2]); + v_tmp32_hi[d + 5] = _mm256_unpackhi_epi32(v_tmp16_hi[s + 1], v_tmp16_hi[s + 3]); + } + + __m256i v_tmp64_lo[8]; + __m256i v_tmp64_hi[8]; + for (int d = 0, s = 0; d < 4; d += 2, s += 4) { + v_tmp64_lo[d + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_lo[d + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_lo[d + 4] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_lo[d + 5] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + + v_tmp64_hi[d + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 2]); + v_tmp64_hi[d + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 1], v_tmp32_lo[s + 3]); + v_tmp64_hi[d + 4] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 2]); + v_tmp64_hi[d + 5] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 1], v_tmp32_hi[s + 3]); + } + + dst[0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x20); + + dst[4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_hi[0], 0x31); + dst[5] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_hi[4], 0x31); + dst[6] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_hi[2], 0x31); + dst[7] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_hi[6], 0x31); + + dst[8] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x20); + dst[9] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x20); + dst[10] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x20); + dst[11] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x20); + + dst[12] = _mm256_permute2x128_si256(v_tmp64_lo[1], v_tmp64_hi[1], 0x31); + dst[13] = _mm256_permute2x128_si256(v_tmp64_lo[5], v_tmp64_hi[5], 0x31); + dst[14] = _mm256_permute2x128_si256(v_tmp64_lo[3], v_tmp64_hi[3], 0x31); + dst[15] = _mm256_permute2x128_si256(v_tmp64_lo[7], v_tmp64_hi[7], 0x31); +} +static void transpose_32x16_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 16); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 16, 32, 16); +} +static void transpose_32x32_avx2(const __m256i* src, __m256i* dst) { + transpose_16x16_avx2_stride((int16_t const *)src, (int16_t *)dst, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16, (int16_t *)dst + 16 * 32, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32, (int16_t *)dst + 16, 32, 32); + transpose_16x16_avx2_stride((int16_t const *)src + 16 * 32 + 16, (int16_t *)dst + 16 * 32 + 16, 32, 32); +} +static void transpose_32x64_avx2(const __m256i* src, __m256i* dst){} +static void transpose_64x2_avx2(const 
__m256i* src, __m256i* dst){}
+static void transpose_64x4_avx2(const __m256i* src, __m256i* dst){}
+static void transpose_64x8_avx2(const __m256i* src, __m256i* dst){}
+static void transpose_64x16_avx2(const __m256i* src, __m256i* dst){}
+static void transpose_64x32_avx2(const __m256i* src, __m256i* dst){}
+static void transpose_64x64_avx2(const __m256i* src, __m256i* dst){}
+
+
+// Rows are indexed by block height, columns by block width. Note that several
+// entries are still empty stubs.
+static transpose_func* transpose_func_table[6][6] = {
+  { transpose_2x2_avx2,  transpose_4x2_avx2,  transpose_8x2_avx2,  transpose_16x2_avx2,  transpose_32x2_avx2,  transpose_64x2_avx2},
+  { transpose_2x4_avx2,  transpose_4x4_avx2,  transpose_8x4_avx2,  transpose_16x4_avx2,  transpose_32x4_avx2,  transpose_64x4_avx2},
+  { transpose_2x8_avx2,  transpose_4x8_avx2,  transpose_8x8_avx2,  transpose_16x8_avx2,  transpose_32x8_avx2,  transpose_64x8_avx2},
+  {transpose_2x16_avx2, transpose_4x16_avx2, transpose_8x16_avx2, transpose_16x16_avx2, transpose_32x16_avx2, transpose_64x16_avx2},
+  {transpose_2x32_avx2, transpose_4x32_avx2, transpose_8x32_avx2, transpose_16x32_avx2, transpose_32x32_avx2, transpose_64x32_avx2},
+  {transpose_2x64_avx2, transpose_4x64_avx2, transpose_8x64_avx2, transpose_16x64_avx2, transpose_32x64_avx2, transpose_64x64_avx2},
+};
+
+
+// Dispatcher for the AVX2 transposes. Calls the sub-function matching the block
+// dimensions; e.g. a 16x8 block (width 16, height 8) resolves to
+// transpose_func_table[2][3], i.e. transpose_16x8_avx2.
+void transpose_avx2(const __m256i* src, __m256i* dst, const int width, const int height)
+{
+  // Blocks of width or height 1 are never transposed, so table indices start from size 2
+  const int w_log2_minus1 = uvg_g_convert_to_log2[width] - 1;
+  const int h_log2_minus1 = uvg_g_convert_to_log2[height] - 1;
+
+  transpose_func* func = transpose_func_table[h_log2_minus1][w_log2_minus1];
+  func(src, dst);
+}
+
+
 // 4x4 matrix multiplication with value clipping.
 // Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses,
 // destination for the result and the shift value for clipping.
@@ -945,12 +1527,6 @@ ITRANSFORM(dct, 32); /*****************************************************/ // DST-7 -#define DEFINE_DST7_P4_MATRIX(a,b,c,d) { \ - { a, b, c, d},\ - { c, c, 0, -c},\ - { d, -a, -c, b},\ - { b, -d, c, -a},\ -} #define DEFINE_DST7_P4_MATRIX_T(a,b,c,d) { \ { a, c, d, b},\ @@ -959,17 +1535,6 @@ ITRANSFORM(dct, 32); { d, -c, b, -a},\ } -#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{\ - { a, b, c, d, e, f, g, h},\ - { c, f, h, e, b, -a, -d, -g},\ - { e, g, b, -c, -h, -d, a, f},\ - { g, c, -d, -f, a, h, b, -e},\ - { h, -a, -g, b, f, -c, -e, d},\ - { f, -e, -a, g, -d, -b, h, -c},\ - { d, -h, e, -a, -c, g, -f, b},\ - { b, -d, f, -h, g, -e, c, -a},\ -} #define DEFINE_DST7_P8_MATRIX_T(a,b,c,d,e,f,g,h) \ {\ @@ -983,25 +1548,6 @@ ITRANSFORM(dct, 32); { h, -g, f, -e, d, -c, b, -a,},\ }\ -#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o}, \ - { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n}, \ - { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m}, \ - { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l}, \ - { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k}, \ - { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j}, \ - { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i}, \ - { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h}, \ - { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g}, \ - { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f}, \ - { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e}, \ - { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d}, \ - { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c}, \ - { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b}, \ - { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a}, \ -} #define DEFINE_DST7_P16_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ { \ @@ -1024,43 +1570,6 @@ ITRANSFORM(dct, 32); } - -#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E}, \ - {e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D}, \ - {g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C}, \ - {i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B}, \ - {k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A}, \ - {m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z}, \ - {o, D, t, e, -j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y}, \ - {q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x}, \ - {s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w}, \ - {u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v}, \ - {w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u}, \ - {y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, 
j, -o, -y, 0, y, o, -j, -D, -e, t}, \ - {A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s}, \ - {C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r}, \ - {E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q}, \ - {F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p}, \ - {D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o}, \ - {B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n}, \ - {z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m}, \ - {x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l}, \ - {v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k}, \ - {t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j}, \ - {r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i}, \ - {p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h}, \ - {n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g}, \ - {l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f}, \ - {j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e}, \ - {h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d}, \ - {f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c}, \ - {d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, -a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b}, \ - {b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a}, \ -} - #define DEFINE_DST7_P32_MATRIX_T(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ { \ {a, c, e, g, i, k, m, o, q, s, u, w, y, A, C, E, F, D, B, z, x, v, t, r, p, n, l, j, h, f, d, b,},\ @@ -1097,85 +1606,6 @@ ITRANSFORM(dct, 32); {F, -E, D, -C, B, -A, z, -y, x, -w, v, -u, t, -s, r, -q, p, -o, n, -m, l, -k, j, -i, h, -g, f, -e, d, -c, b, -a,},\ } -// DCT-8 -#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ -{ \ - {a, b, c, d}, \ - {b, 0, -b, -b}, \ - {c, -b, -d, a}, \ - {d, -b, a, -c}, \ -} - -#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ -{ \ - {a, b, c, d, e, f, g, h}, \ - {b, e, h, -g, -d, -a, -c, -f}, \ - {c, h, -e, -a, -f, g, b, d}, \ - {d, -g, -a, -h, c, e, -f, -b}, \ - {e, -d, -f, c, g, -b, -h, a}, \ - {f, -a, g, e, -b, h, d, -c}, \ - {g, -c, b, -f, -h, d, -a, e}, \ - {h, -f, d, -b, a, -c, e, -g}, \ -} - -#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}, \ - {b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n}, \ - {c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l}, \ - {d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j}, \ - {e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h}, \ - {f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f}, \ - 
{g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d}, \ - {h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b}, \ - {i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a}, \ - {j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c}, \ - {k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e}, \ - {l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g}, \ - {m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i}, \ - {n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k}, \ - {o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m}, \ - {p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o}, \ -} - - -#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ -{ \ - {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F}, \ - {b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D}, \ - {c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B}, \ - {d, k, r, y, F, -A, -t, -m, -f, -b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z}, \ - {e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x}, \ - {f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v}, \ - {g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t}, \ - {h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r}, \ - {i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p}, \ - {j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n}, \ - {k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l}, \ - {l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j}, \ - {m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h}, \ - {n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f}, \ - {o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d}, \ - {p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b}, \ - {q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, -D, c, E, -b, -F, a}, \ - {r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c}, \ - {s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, -g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e}, \ - {t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g}, \ - {u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i}, \ - {v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k}, \ - {w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m}, \ - {x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o}, \ - {y, -i, h, -x, 
-z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q}, \ - {z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s}, \ - {A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u}, \ - {B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w}, \ - {C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y}, \ - {D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A}, \ - {E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C}, \ - {F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E}, \ -} - - // DST-7 ALIGNED(64) const int16_t uvg_g_dst7_4[4][4] = DEFINE_DST7_P4_MATRIX(29, 55, 74, 84); ALIGNED(64) const int16_t uvg_g_dst7_8[8][8] = DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86); @@ -1576,6 +2006,6044 @@ static tr_func* idct_table[5] = { mts_idct_4x4_avx2, mts_idct_8x8_avx2, mts_idct_16x16_avx2, mts_idct_32x32_avx2, NULL/*fastInverseDCT2_B64*/ }; +typedef void (dct_full_pass)(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver); + + +// ********************************************** +// New tailored functions for each size combination +// ********************************************** + +static void fast_forward_tr_2xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*)coeff); + __m256i v_coeff_1 = _mm256_load_si256((__m256i*)(coeff + 16)); + __m256i* v_dst_ptr = dst; + + const int reduced_line = line - skip_line; + // Handle 8 lines at a time (16 samples, 2 samples per line) + for (int j = 0; j < reduced_line; j += 8) { + // src vector: [00 01 02 03 04 05 06 07|08 09 10 11 12 13 14 15] + __m256i v_src = _mm256_load_si256((const __m256i*) src); + + // Multiply with a and add together all adjacent elements + // even vector: [a00+a01 a02+a03 a04+a05 a06+a07|a08+a09 a10+a11 a12+a13 a14+a15] + __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0); + // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15] + __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift); + + v_dst_ptr[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + v_dst_ptr++; + } +} + +void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 8; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_2x8_coeff_ver; + if (ver == DST7) { + ver_coeff = ff_dst7_2x8_coeff_ver; + } + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i 
+void fast_forward_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+{
+  const int width = 2;
+  const int height = 8;
+
+  const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+  const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+  const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8;
+  const int32_t shift_2nd = log2_height_minus1 + 7;
+
+  const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
+  const int16_t* ver_coeff = ff_dct2_2x8_coeff_ver;
+  if (ver == DST7) {
+    ver_coeff = ff_dst7_2x8_coeff_ver;
+  }
+  // No DCT8 coeffs and no alternatives for the width-2 horizontal pass; the DST7
+  // vertical case above is the only non-DCT2 transform for this block size
+
+  __m256i v_hor_pass_out;
+  fast_forward_tr_2xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0);
+
+  // Vertical pass
+  const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize the (shift > 0) check away if shift is always > 0
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)ver_coeff;
+
+  // The whole horizontal pass output fits in a single vector
+  // const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x8_shuffle_ver);
+  const __m256i v_src_raw = v_hor_pass_out;
+  // __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle);
+  __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd[8];
+  for (int i = 0; i < 8; ++i) {
+    v_madd[i] = _mm256_madd_epi16(v_src, v_coeff[i]);
+  }
+  __m256i v_hadd_0[4];
+  for (int i = 0; i < 4; ++i) {
+    const int offset = i * 2;
+    v_hadd_0[i] = _mm256_hadd_epi32(v_madd[offset], v_madd[offset + 1]);
+  }
+
+  __m256i v_trunc[2];
+  for (int i = 0; i < 2; ++i) {
+    const int offset = i * 2;
+    v_trunc[i] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[offset], v_hadd_0[offset + 1]), debias, shift_2nd);
+  }
+
+  __m256i v_result = _mm256_packs_epi32(v_trunc[0], v_trunc[1]);
+  const __m256i v_res_shfl = _mm256_load_si256((const __m256i*)ff_dct2_2x8_result_shuffle_ver);
+  // Shuffle values to correct order
+  v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  v_result = _mm256_shuffle_epi32(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  v_result = _mm256_shuffle_epi8(v_result, v_res_shfl);
+  _mm256_store_si256((__m256i*)dst, v_result);
+}
+
+
+static void fast_inverse_tr_2x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_hor);
+
+  const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src);
+
+  __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle);
+  v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0));
+
+  __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]);
+  __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]);
+  __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]);
+  __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]);
+  __m256i v_madd_4 = _mm256_madd_epi16(v_src, v_coeff[4]);
+  __m256i v_madd_5 = _mm256_madd_epi16(v_src, v_coeff[5]);
+  __m256i v_madd_6 = _mm256_madd_epi16(v_src, v_coeff[6]);
+  __m256i v_madd_7 = _mm256_madd_epi16(v_src, v_coeff[7]);
+
+  __m256i v_hadd_00 = _mm256_hadd_epi32(v_madd_0, v_madd_1);
+  __m256i v_hadd_01 = _mm256_hadd_epi32(v_madd_2, v_madd_3);
+  __m256i v_hadd_02 = _mm256_hadd_epi32(v_madd_4, v_madd_5);
+  __m256i v_hadd_03 = _mm256_hadd_epi32(v_madd_6, v_madd_7);
+
+  __m256i v_hadd_10 = _mm256_hadd_epi32(v_hadd_00, v_hadd_01);
+  __m256i v_hadd_11 = _mm256_hadd_epi32(v_hadd_02, v_hadd_03);
+
+  __m256i v_trunc_0 = truncate_avx2(v_hadd_10, debias, shift);
+  __m256i v_trunc_1 = truncate_avx2(v_hadd_11, debias, shift);
+
+  dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+}
+
+static void fast_inverse_tr_2x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]);
+  const 
__m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_shuffle_ver); + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_8x2_res_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi8(v_src, v_shuffle); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out; + fast_inverse_tr_2x8_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x8_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = uvg_g_dct_16; + if (ver == DST7) { + ver_coeff = uvg_g_dst7_16; + } + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_hor_pass_out[2]; + fast_forward_tr_2xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, 0); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Permute hor pass output to correct order + __m256i v_tmp_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_tmp_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0)); + __m256i v_src_0 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_tmp_0, v_tmp_1, 0x31); + + const __m256i* v_coeff_ptr = (const __m256i*)ver_coeff; + + __m256i v_madd[2][16]; + for (int i = 0; i < 16; ++i) { + v_madd[0][i] = _mm256_madd_epi16(v_src_0, v_coeff_ptr[i]); + v_madd[1][i] = _mm256_madd_epi16(v_src_1, v_coeff_ptr[i]); + } + + __m256i v_hadd_0[2][8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_hadd_0[0][dst] = _mm256_hadd_epi32(v_madd[0][src], v_madd[0][src + 1]); + v_hadd_0[1][dst] = _mm256_hadd_epi32(v_madd[1][src], v_madd[1][src + 1]); + } + + __m256i v_hadd_1[2][4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_hadd_1[0][dst] = _mm256_hadd_epi32(v_hadd_0[0][src], v_hadd_0[0][src + 1]); + v_hadd_1[1][dst] = _mm256_hadd_epi32(v_hadd_0[1][src], v_hadd_0[1][src + 1]); + } + + __m256i v_tmp_00 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x20); + __m256i v_tmp_01 = _mm256_permute2x128_si256(v_hadd_1[0][0], v_hadd_1[0][1], 0x31); + __m256i v_tmp_02 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x20); + __m256i v_tmp_03 = _mm256_permute2x128_si256(v_hadd_1[0][2], v_hadd_1[0][3], 0x31); + + __m256i v_tmp_10 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x20); + __m256i v_tmp_11 = _mm256_permute2x128_si256(v_hadd_1[1][0], v_hadd_1[1][1], 0x31); + __m256i v_tmp_12 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x20); + __m256i v_tmp_13 = _mm256_permute2x128_si256(v_hadd_1[1][2], v_hadd_1[1][3], 0x31); + + __m256i v_trunc_00 = truncate_avx2((_mm256_add_epi32(v_tmp_00, v_tmp_01)), debias, shift_2nd); + __m256i v_trunc_01 = truncate_avx2((_mm256_add_epi32(v_tmp_02, v_tmp_03)), debias, shift_2nd); + + __m256i v_trunc_10 = truncate_avx2((_mm256_add_epi32(v_tmp_10, v_tmp_11)), debias, shift_2nd); + __m256i v_trunc_11 = truncate_avx2((_mm256_add_epi32(v_tmp_12, v_tmp_13)), debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_00, v_trunc_10); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_01, v_trunc_11); + + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + _mm256_store_si256((__m256i*)&dst[0], v_result_0); + _mm256_store_si256((__m256i*)&dst[16], v_result_1); +} + + +static void fast_inverse_tr_2x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + + v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + for (int c = 0; c < 16; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, 
v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); +} + +static void fast_inverse_tr_2x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_lo = _mm256_unpacklo_epi16(src[0], src[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(src[0], src[1]); + + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + + __m256i v_trunc_0 = truncate_avx2(v_madd_lo_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_lo_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_madd_hi_0, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_madd_hi_1, debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_2x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor; + const int16_t* ver_coeff = uvg_g_dct_32; + // For result shuffling, can use existing shuffle vector + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle); + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + ALIGNED(32) int16_t v_hor_pass_out[2*32]; + fast_forward_tr_2xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + __m256i temp_out[4]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + for (int j = 0; j < 2; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ff_dct2_32x32_coeff_ver; + const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4); + for (int i = 0; i < 16; ++i) { + + __m256i v_src = _mm256_set1_epi32(*temp_source); + temp_source += i & 1 ? 3 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 2); +} + + +static void fast_inverse_tr_2x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246); + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 4; ++i) { + v_src[i] = 
_mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + const __m256i v_coeff_0 = _mm256_setr_epi64x(c_ptr[0], c_ptr[1], c_ptr[0], c_ptr[1]); + const __m256i v_coeff_1 = _mm256_setr_epi64x(c_ptr[2], c_ptr[3], c_ptr[2], c_ptr[3]); + const __m256i v_coeff_2 = _mm256_setr_epi64x(c_ptr[4], c_ptr[5], c_ptr[4], c_ptr[5]); + const __m256i v_coeff_3 = _mm256_setr_epi64x(c_ptr[6], c_ptr[7], c_ptr[6], c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[c] = _mm256_add_epi32(v_add_00, v_add_01); + c_ptr += 8; + } + + __m256i v_hadd_0[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd_0[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_hadd_1[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd_1[d] = _mm256_hadd_epi32(v_hadd_0[s + 0], v_hadd_0[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_2x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i v_src_lo0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_lo1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + const __m256i v_src_hi0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + const __m256i v_src_hi1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo0, v_coeff[1]), debias, shift); + __m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[0]), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo1, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi0, v_coeff[1]), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[0]), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi1, v_coeff[1]), debias, shift); + + __m256i v_result[4]; + __m256i v_tmp[4]; + v_tmp[0] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01), v_res_shuffle); + v_tmp[1] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_lo_10, 
v_trunc_lo_11), v_res_shuffle); + v_tmp[2] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01), v_res_shuffle); + v_tmp[3] = _mm256_shuffle_epi8(_mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11), v_res_shuffle); + + v_result[0] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[2], 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp[1], v_tmp[3], 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 2; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; // rename + const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename + // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_2x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_2x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + +} + + +void fast_forward_tr_4xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + const int reduced_line = line - skip_line; + // Handle 4 lines at a time (16 samples, 4 samples per line) + for (int j = 0; j < reduced_line; j += 4) { + // line 0 line 1 line 2 line 3 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*) src); + + // Arrange data for column-wise calculation. Data and coeffs are ordered so no further shuffling + // or permutes are needed. 
+ // vec 1 : [s00 s01 s04 s05 s08 s09 s12 s13 | s00 s01 s04 s05 s08 s09 s12 s13] + // vec 2 : [s02 s03 s06 s07 s10 s11 s14 s15 | s02 s03 s06 s07 s10 s11 s14 s15] + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + src += 16; + dst += 1; + } +} + +void fast_forward_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + // TODO: coeffs for DST7 and DCT8 + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = fast_forward_dct2_b4_coeff; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } + else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = fast_forward_dst7_b4_coeff; + } + else if (ver == DCT8) { + ver_coeff = fast_forward_dct8_b4_coeff; + } + + __m256i v_hor_pass_out; + fast_forward_tr_4xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & ver_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & ver_coeff[16]); + const __m256i v_coeff_2 = _mm256_load_si256((const __m256i*) & ver_coeff[32]); + const __m256i v_coeff_3 = _mm256_load_si256((const __m256i*) & ver_coeff[48]); + + const __m256i v_permute_0 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_0); + const __m256i v_permute_1 = _mm256_load_si256((__m256i*)ff_dct2_b4_permute_1); + + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_hor_pass_out, v_permute_1); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_0, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_1, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + _mm256_store_si256((__m256i*)dst, v_result); +} + + +static void fast_inverse_tr_4x4_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_shuffle_hor); + + const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + __m256i v_src = _mm256_shuffle_epi8(v_src_raw, v_shuffle); + v_src = _mm256_permute4x64_epi64(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); +} + +static void fast_inverse_tr_4x4_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x4_result_shuffle_ver); + + __m256i v_src = _mm256_permute4x64_epi64(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src, v_coeff[3]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_4xN_coeff_hor; + const int16_t* ver_coeff = fi_dct2_4xN_coeff_hor; // Can use same table for both passes + if (hor == DST7) { + hor_coeff = fi_dst7_4xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4xN_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4xN_coeff_hor; + } + + __m256i v_hor_pass_out; + fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); +} + + +void fast_forward_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = ff_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = ff_dst7_4x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_4x8_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + __m256i v_madd[2][8]; + for (int i = 0; i < 8; ++i) { + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff[0]); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff[1]); + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + } + + __m256i v_trunc[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + // Order results + v_result[0] = _mm256_permute4x64_epi64(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + v_result[0] = _mm256_shuffle_epi32(v_result[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_shuffle_epi32(v_result[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)&dst[0], v_result[0]); + _mm256_store_si256((__m256i*)&dst[16], v_result[1]); +} + + +static void fast_inverse_tr_4x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + const __m256i v_permute = _mm256_load_si256((const __m256i*)permute_32b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + v_src_0 = _mm256_permutevar8x32_epi32(v_src_0, v_permute); + v_src_1 = _mm256_permutevar8x32_epi32(v_src_1, v_permute); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_madd_04 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_14 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + + __m256i v_madd_05 = _mm256_madd_epi16(v_src_0, v_coeff[10]); + __m256i v_madd_15 = _mm256_madd_epi16(v_src_1, v_coeff[11]); + + __m256i v_madd_06 = _mm256_madd_epi16(v_src_0, v_coeff[12]); + __m256i v_madd_16 = _mm256_madd_epi16(v_src_1, v_coeff[13]); + + __m256i v_madd_07 = _mm256_madd_epi16(v_src_0, v_coeff[14]); + __m256i v_madd_17 = _mm256_madd_epi16(v_src_1, v_coeff[15]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10); + __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11); + __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12); + __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13); + __m256i v_add_4 = _mm256_add_epi32(v_madd_04, v_madd_14); + __m256i v_add_5 = _mm256_add_epi32(v_madd_05, v_madd_15); + __m256i v_add_6 = _mm256_add_epi32(v_madd_06, v_madd_16); + __m256i v_add_7 = _mm256_add_epi32(v_madd_07, v_madd_17); + + __m256i v_hadd_0 = 
_mm256_hadd_epi32(v_add_0, v_add_1); + __m256i v_hadd_1 = _mm256_hadd_epi32(v_add_2, v_add_3); + __m256i v_hadd_2 = _mm256_hadd_epi32(v_add_4, v_add_5); + __m256i v_hadd_3 = _mm256_hadd_epi32(v_add_6, v_add_7); + + __m256i v_trunc_0 = truncate_avx2(v_hadd_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_hadd_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_hadd_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_hadd_3, debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_4x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]); + + __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + + __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle); + v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle); + + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + v_result_0 = _mm256_shuffle_epi32(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_shuffle_epi32(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +void fast_inverse_tr_4x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x4_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x4_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x4_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x4_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x4_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_4x8_avx2_ver(src, v_ver_pass_out, 
ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = fast_forward_dct2_b4_coeff; + const int16_t* ver_coeff = uvg_g_dct_16; + if (hor == DST7) { + hor_coeff = fast_forward_dst7_b4_coeff; + } else if (hor == DCT8) { + hor_coeff = fast_forward_dct8_b4_coeff; + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_16; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_16; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_4xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + const int64_t* coeff_ptr = (const int64_t*)ver_coeff; // Read four coeffs at once by casting into 64 bit integer + + __m256i v_madd[4][16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff_0 = _mm256_set1_epi64x(coeff_ptr[0]); + const __m256i v_coeff_1 = _mm256_set1_epi64x(coeff_ptr[1]); + const __m256i v_coeff_2 = _mm256_set1_epi64x(coeff_ptr[2]); + const __m256i v_coeff_3 = _mm256_set1_epi64x(coeff_ptr[3]); + v_madd[0][i] = _mm256_madd_epi16(v_hor_pass_out[0], v_coeff_0); + v_madd[1][i] = _mm256_madd_epi16(v_hor_pass_out[1], v_coeff_1); + v_madd[2][i] = _mm256_madd_epi16(v_hor_pass_out[2], v_coeff_2); + v_madd[3][i] = _mm256_madd_epi16(v_hor_pass_out[3], v_coeff_3); + coeff_ptr += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_tmp0 = _mm256_add_epi32(v_madd[0][i], v_madd[1][i]); + __m256i v_tmp1 = _mm256_add_epi32(v_madd[2][i], v_madd[3][i]); + + v_add[i] = _mm256_add_epi32(v_tmp0, v_tmp1); + } + + __m256i v_trunc[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]), debias, shift_2nd); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_shuffle_epi32(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_4x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_shuffle_epi8(v_src_raw[0], v_shuffle); + __m256i v_src_1 = _mm256_shuffle_epi8(v_src_raw[1], v_shuffle); + __m256i v_src_2 = _mm256_shuffle_epi8(v_src_raw[2], 
v_shuffle); + __m256i v_src_3 = _mm256_shuffle_epi8(v_src_raw[3], v_shuffle); + + v_src_0 = _mm256_permute4x64_epi64(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_permute4x64_epi64(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_permute4x64_epi64(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_permute4x64_epi64(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + v_src_0 = _mm256_shuffle_epi32(v_src_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_1 = _mm256_shuffle_epi32(v_src_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_2 = _mm256_shuffle_epi32(v_src_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_src_3 = _mm256_shuffle_epi32(v_src_3, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int c = 0; c < 16; c++) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[3]); + v_coeff += 4; + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_hadd[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_4x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[2], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[2], src[3], 0x31); + + __m256i v_madd_0[4]; + __m256i v_madd_1[4]; + __m256i v_madd_2[4]; + __m256i v_madd_3[4]; + for (int c = 0; c < 4; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[c] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + v_coeff += 2; + } + + __m256i v_trunc_0[4]; + __m256i v_trunc_1[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_0[i] = truncate_avx2(_mm256_add_epi32(v_madd_0[i], v_madd_1[i]), debias, shift); + v_trunc_1[i] = truncate_avx2(_mm256_add_epi32(v_madd_2[i], v_madd_3[i]), debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + 
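+ // The 32-bit unpack + 128-bit permute sequence below is the usual AVX2 idiom
+ // for interleaving two vectors, needed because unpacklo/unpackhi operate on
+ // each 128-bit lane separately. A minimal self-contained sketch of the idiom
+ // with illustrative dword values (not the actual transform data):
+ //
+ // __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ // __m256i b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+ // __m256i lo = _mm256_unpacklo_epi32(a, b); // [0 8 1 9 | 4 12 5 13]
+ // __m256i hi = _mm256_unpackhi_epi32(a, b); // [2 10 3 11 | 6 14 7 15]
+ // __m256i r0 = _mm256_permute2x128_si256(lo, hi, 0x20); // [0 8 1 9 2 10 3 11]
+ // __m256i r1 = _mm256_permute2x128_si256(lo, hi, 0x31); // [4 12 5 13 6 14 7 15]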
__m256i v_tmp32_0 = _mm256_unpacklo_epi32(v_tmp0, v_tmp1);
+ __m256i v_tmp32_1 = _mm256_unpackhi_epi32(v_tmp0, v_tmp1);
+ __m256i v_tmp32_2 = _mm256_unpacklo_epi32(v_tmp2, v_tmp3);
+ __m256i v_tmp32_3 = _mm256_unpackhi_epi32(v_tmp2, v_tmp3);
+
+ v_result[0] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x20);
+ v_result[1] = _mm256_permute2x128_si256(v_tmp32_0, v_tmp32_1, 0x31);
+ v_result[2] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x20);
+ v_result[3] = _mm256_permute2x128_si256(v_tmp32_2, v_tmp32_3, 0x31);
+
+ for (int i = 0; i < 4; ++i) {
+ _mm256_store_si256((__m256i*)dst, v_result[i]);
+ dst += 16;
+ }
+}
+
+void fast_inverse_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+{
+ const int width = 4;
+ const int height = 16;
+
+ int skip_width = 0;
+ int skip_height = 0;
+
+ const int32_t shift_1st = INVERSE_SHIFT_1ST;
+ const int32_t shift_2nd = INVERSE_SHIFT_2ND;
+
+ const int16_t* ver_coeff = fi_dct2_16x4_coeff_hor; // TODO: rename coeff tables
+ const int16_t* hor_coeff = fi_dct2_16x4_coeff_ver;
+ if (hor == DST7) {
+ hor_coeff = fi_dst7_16x4_coeff_ver;
+ } else if (hor == DCT8) {
+ hor_coeff = fi_dct8_16x4_coeff_ver;
+ }
+ if (ver == DST7) {
+ ver_coeff = fi_dst7_16x4_coeff_hor;
+ } else if (ver == DCT8) {
+ ver_coeff = fi_dct8_16x4_coeff_hor;
+ }
+
+ __m256i v_ver_pass_out[4];
+ fast_inverse_tr_4x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height);
+
+ fast_inverse_tr_4x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width);
+}
+
+
+void fast_forward_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+{
+ const int width = 4;
+ const int height = 32;
+
+ int skip_width = 0;
+ int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0);
+
+ const int reduced_line = width - skip_width;
+ const int cutoff = height - skip_height;
+ int16_t* p_dst = dst;
+
+ const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+ const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+ const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8;
+ const int32_t shift_2nd = log2_height_minus1 + 7;
+
+ const int16_t* hor_coeff = fast_forward_dct2_b4_coeff;
+ const int16_t* ver_coeff = ff_dct2_32xN_coeff_hor;
+ if (hor == DST7) {
+ hor_coeff = fast_forward_dst7_b4_coeff;
+ } else if (hor == DCT8) {
+ hor_coeff = fast_forward_dct8_b4_coeff;
+ }
+ if (ver == DST7) {
+ ver_coeff = ff_dst7_4x32_coeff_ver;
+ } else if (ver == DCT8) {
+ ver_coeff = ff_dct8_4x32_coeff_ver;
+ }
+
+ ALIGNED(32) int16_t v_hor_pass_out[4*32]; // Must be aligned, the horizontal pass uses aligned 256-bit stores
+ fast_forward_tr_4xN_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+
+
+ __m256i temp_out[8];
+ // Vertical pass
+ const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+ const __m256i debias = _mm256_set1_epi32(add);
+
+ for (int j = 0; j < 4; ++j) {
+ __m256i res_0 = _mm256_setzero_si256();
+ __m256i res_1 = _mm256_setzero_si256();
+ __m256i res_2 = _mm256_setzero_si256();
+ __m256i res_3 = _mm256_setzero_si256();
+ const int16_t* coeff_start = ver_coeff;
+ const int32_t* temp_source = (int32_t*)(v_hor_pass_out + j * 4);
+ for (int i = 0; i < 16; ++i) {
+
+ __m256i v_src = _mm256_set1_epi32(*temp_source);
+ temp_source += i & 1 ?
7 : 1; + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 4); + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_4x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Handle as 64 bit integer to load four coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi8(v_src_raw[i], v_shuffle); + } + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_permute4x64_epi64(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[i] = _mm256_shuffle_epi32(v_src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int c = 0; c < 32; c++) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + const __m256i v_coeff = _mm256_set1_epi64x(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_add[c] = _mm256_add_epi32(v_add_10, v_add_11); + } + + __m256i v_hadd[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_hadd[d] = _mm256_hadd_epi32(v_add[s + 0], v_add[s + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } + // TODO: cutoff for dct8 and dst7 +} + +static void fast_inverse_tr_4x32_avx2_hor(const __m256i* src, 
int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = src; + + __m256i v_src[8]; + __m256i v_tmp[8]; + v_src[0] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_src_raw[0], v_src_raw[1], 0x31); + v_src[2] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_src_raw[2], v_src_raw[3], 0x31); + v_src[4] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(v_src_raw[4], v_src_raw[5], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x20); + v_src[7] = _mm256_permute2x128_si256(v_src_raw[6], v_src_raw[7], 0x31); + + for (int d = 0, c = 0; c < 4; ++c, d += 2) { + __m256i v_madd_00 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_01 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_10 = _mm256_madd_epi16(v_src[2], v_coeff[0]); + __m256i v_madd_11 = _mm256_madd_epi16(v_src[3], v_coeff[1]); + __m256i v_madd_20 = _mm256_madd_epi16(v_src[4], v_coeff[0]); + __m256i v_madd_21 = _mm256_madd_epi16(v_src[5], v_coeff[1]); + __m256i v_madd_30 = _mm256_madd_epi16(v_src[6], v_coeff[0]); + __m256i v_madd_31 = _mm256_madd_epi16(v_src[7], v_coeff[1]); + v_coeff += 2; + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_01), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_10, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_20, v_madd_21), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_30, v_madd_31), debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + v_tmp[d + 0] = _mm256_permute4x64_epi64(v_tmp[d + 0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[d + 1] = _mm256_permute4x64_epi64(v_tmp[d + 1], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[8]; + transpose_avx2(v_tmp, v_result, 32, 4); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 4; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename + if (hor == DST7) { + hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_4x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_4x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_tr_8xN_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+ const __m256i debias = _mm256_set1_epi32(add);
+
+ const __m256i* v_coeff = (const __m256i*)coeff;
+
+ const int reduced_line = line - skip_line;
+ // Handle 2 lines at a time (16 samples, 8 samples per line)
+ for (int j = 0; j < reduced_line; j += 2) {
+ // line 0 line 1
+ // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7]
+ __m256i v_src = _mm256_load_si256((const __m256i*)src);
+
+ // Rearrange the source so that samples can be added together column-wise
+ // with plain adds after the first round of madd operations.
+ // Need 4 source vectors arranged as follows. The high 128-bit lanes are the same as the low:
+ // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...]
+ // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...]
+ // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...]
+ // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...]
+
+ __m256i v_src_0 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(0, 0, 0, 0));
+ __m256i v_src_1 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(1, 1, 1, 1));
+ __m256i v_src_2 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(2, 2, 2, 2));
+ __m256i v_src_3 = _mm256_shuffle_epi32(v_src, _MM_SHUFFLE(3, 3, 3, 3));
+
+ // Lane 1
+ __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]);
+ __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]);
+ __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]);
+ __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]);
+
+ __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1);
+ __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3);
+
+ __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01);
+
+ // Lane 2
+ __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]);
+ __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]);
+ __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]);
+ __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]);
+
+ __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5);
+ __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7);
+
+ __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03);
+
+ // Truncate results from both lanes
+ __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift);
+ __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift);
+
+ dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+
+ src += 16;
+ dst += 1;
+ }
+}
+
+void fast_forward_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+{
+ const int width = 8;
+ const int height = 2;
+
+ int skip_width = 0;
+ int skip_height = 0;
+
+ const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+ const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+ const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8;
+ const int32_t shift_2nd = log2_height_minus1 + 7;
+
+ const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor;
+ const int16_t* ver_coeff = ff_dct2_8x2_coeff_ver;
+ // Only DCT2 is defined for the 8x2 block size
+ if (hor == DST7) {
+ hor_coeff = ff_dst7_8xN_coeff_hor;
+ } else if (hor == DCT8) {
+ hor_coeff = ff_dct8_8xN_coeff_hor;
+ }
+
+ __m256i v_hor_pass_out;
+ fast_forward_tr_8xN_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+
+ // Vertical pass
+ const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+ const __m256i debias = _mm256_set1_epi32(add);
+
+ // TODO: coeffs for DST7 and DCT8 transforms
+ const __m256i* v_coeff = (const __m256i*)ver_coeff;
+ const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x2_ver_pass_shuffle);
+
+ // 8x2, only 16 samples, handle all at once
+ __m256i v_src_per = _mm256_permute4x64_epi64(v_hor_pass_out, _MM_SHUFFLE(3, 1, 2, 0));
+ // Weave the lo and hi halves of each 128-bit lane
+ __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle);
+ // v_src = _mm256_unpackhi_epi16(v_src_raw, v_src_swp);
+
+ __m256i v_madd_0 = _mm256_madd_epi16(v_src, v_coeff[0]);
+ __m256i v_madd_1 = _mm256_madd_epi16(v_src, v_coeff[1]);
+
+ __m256i v_trunc_0 = truncate_avx2(v_madd_0, debias, shift_2nd);
+ __m256i v_trunc_1 = truncate_avx2(v_madd_1, debias, shift_2nd);
+
+ __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+ v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); // TODO: this permute can probably be optimized away
+
+ _mm256_store_si256((__m256i*)dst, v_result);
+}
+
+
+static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+ const int32_t add = 1 << (shift - 1);
+ const __m256i debias = _mm256_set1_epi32(add);
+
+ const __m256i v_shuffle = _mm256_load_si256((const __m256i*)fi_tr_2x8_shuffle_hor);
+ const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]);
+ const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]);
+
+ // The whole 8x2 block fits in one vector
+ const __m256i v_src_raw = _mm256_load_si256((const __m256i*)src);
+
+ __m256i v_src = _mm256_permute4x64_epi64(v_src_raw, _MM_SHUFFLE(3, 1, 2, 0));
+ v_src = _mm256_shuffle_epi8(v_src, v_shuffle);
+
+ __m256i v_even = _mm256_madd_epi16(v_src, v_coeff_0);
+ // odd vector : [a00-a01 a02-a03 a04-a05 a06-a07|a08-a09 a10-a11 a12-a13 a14-a15]
+ __m256i v_odd = _mm256_madd_epi16(v_src, v_coeff_1);
+
+ __m256i v_trunc_0 = truncate_avx2(v_even, debias, shift);
+ __m256i v_trunc_1 = truncate_avx2(v_odd, debias, shift);
+
+ dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+}
+
+static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+ const int32_t add = (shift > 0) ?
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle1 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle1_ver); + const __m256i v_shuffle2 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle2_ver); + + __m256i v_madd_0 = _mm256_madd_epi16(src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(src[0], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(src[0], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(src[0], v_coeff[3]); + __m256i v_madd_4 = _mm256_madd_epi16(src[0], v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(src[0], v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(src[0], v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(src[0], v_coeff[7]); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_2 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_3 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_result = _mm256_shuffle_epi8(v_result, v_shuffle1); + v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0)); + v_result = _mm256_shuffle_epi8(v_result, v_shuffle2); + + _mm256_store_si256((__m256i*)dst, v_result); +} + +void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // rename + // Only dct2 transform is defined for this block size + + __m256i v_ver_pass_out; + fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x4_coeff_ver; + } + + __m256i v_hor_pass_out[2]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+ const __m256i debias = _mm256_set1_epi32(add);
+
+ const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle);
+ const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle);
+ const __m256i* v_coeff = (const __m256i*)ver_coeff;
+
+ // 32 samples, process in two steps
+ __m256i v_src_per_0 = _mm256_permute4x64_epi64(v_hor_pass_out[0], _MM_SHUFFLE(3, 1, 2, 0));
+ __m256i v_src_per_1 = _mm256_permute4x64_epi64(v_hor_pass_out[1], _MM_SHUFFLE(3, 1, 2, 0));
+ // Weave the lo and hi halves of each 128-bit lane
+ __m256i v_src_0 = _mm256_shuffle_epi8(v_src_per_0, v_shuffle);
+ __m256i v_src_1 = _mm256_shuffle_epi8(v_src_per_1, v_shuffle);
+
+ __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]);
+ __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[2]);
+ __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[4]);
+ __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[6]);
+
+ __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[1]);
+ __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]);
+ __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[5]);
+ __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]);
+
+ __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_10);
+ __m256i v_add_1 = _mm256_add_epi32(v_madd_01, v_madd_11);
+ __m256i v_add_2 = _mm256_add_epi32(v_madd_02, v_madd_12);
+ __m256i v_add_3 = _mm256_add_epi32(v_madd_03, v_madd_13);
+
+ __m256i v_trunc_0 = truncate_avx2(v_add_0, debias, shift_2nd);
+ __m256i v_trunc_1 = truncate_avx2(v_add_1, debias, shift_2nd);
+ __m256i v_trunc_2 = truncate_avx2(v_add_2, debias, shift_2nd);
+ __m256i v_trunc_3 = truncate_avx2(v_add_3, debias, shift_2nd);
+
+ __m256i v_result_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+ __m256i v_result_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3);
+
+ // Swap the two middle 64-bit chunks of each vector
+ v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0));
+ v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ // Swap the two middle 16-bit values in each 64-bit chunk
+ v_result_0 = _mm256_shuffle_epi8(v_result_0, v_res_shuffle);
+ v_result_1 = _mm256_shuffle_epi8(v_result_1, v_res_shuffle);
+
+ _mm256_store_si256((__m256i*)dst, v_result_0);
+ _mm256_store_si256((__m256i*)(dst + 16), v_result_1);
+}
+
+
+static void fast_inverse_tr_8x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+ const int32_t add = 1 << (shift - 1);
+ const __m256i debias = _mm256_set1_epi32(add);
+
+ const __m256i* v_coeff = (const __m256i*)coeff;
+
+ const __m256i v_src_raw_0 = _mm256_load_si256((const __m256i*) & src[0]);
+ const __m256i v_src_raw_1 = _mm256_load_si256((const __m256i*) & src[16]);
+
+ __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw_0, v_src_raw_1);
+ __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw_0, v_src_raw_1);
+
+ __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x20);
+ __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo, v_src_hi, 0x31);
+
+ __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]);
+ __m256i v_madd_01 = _mm256_madd_epi16(v_src_0, v_coeff[1]);
+ __m256i v_madd_02 = _mm256_madd_epi16(v_src_0, v_coeff[2]);
+ __m256i v_madd_03 = _mm256_madd_epi16(v_src_0, v_coeff[3]);
+
+ __m256i v_madd_10 = _mm256_madd_epi16(v_src_1, v_coeff[4]);
+ __m256i v_madd_11 =
_mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_12 = _mm256_madd_epi16(v_src_1, v_coeff[6]); + __m256i v_madd_13 = _mm256_madd_epi16(v_src_1, v_coeff[7]); + + __m256i v_trunc_0 = truncate_avx2(_mm256_add_epi32(v_madd_00, v_madd_10), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_add_epi32(v_madd_01, v_madd_11), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_add_epi32(v_madd_02, v_madd_12), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_add_epi32(v_madd_03, v_madd_13), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_8x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[1], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[1], 0x31); + + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + } + + __m256i v_hadd[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int i = 0; i < 4; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[2]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_result[0], v_result[1], 0x31); + + v_result[0] = _mm256_permute4x64_epi64(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result[1] = _mm256_permute4x64_epi64(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*) & dst[0], v_result[0]); + _mm256_store_si256((__m256i*) & dst[16], v_result[1]); +} + +void fast_inverse_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x8_coeff_hor; + } + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_8x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 8; + 
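+ // Shift derivation shared by the forward transforms in this file, shown here
+ // as a worked example (assuming UVG_BIT_DEPTH == 8 as in the enclosing #if,
+ // and uvg_g_convert_to_log2[8] == 3):
+ // shift_1st = log2(width) - 1 + UVG_BIT_DEPTH - 8 = (3 - 1) + (8 - 8) = 2
+ // shift_2nd = log2(height) - 1 + 7 = (3 - 1) + 7 = 9
+ // so for 8x8 the rounding offsets below are 1 << 1 and 1 << 8 respectively.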
+ int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x8_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + const int32_t* coeff_ptr = (const int32_t*)ver_coeff; // Cast into 32 bit integer to read two coeffs at a time + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_trunc[8]; + + __m256i v_src_0 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(v_src_lo_0, v_src_hi_0, 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(v_src_lo_1, v_src_hi_1, 0x31); + + for (int i = 0; i < 8; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi32(coeff_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi32(coeff_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi32(coeff_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi32(coeff_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff_3); + + __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_trunc[i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift_2nd); + coeff_ptr += 4; + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + for (int i = 0; i < 4; ++i) { + v_result[i] = _mm256_permute4x64_epi64(v_result[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + + +static void fast_inverse_tr_8x8_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[4]; + v_src[0] = _mm256_permute4x64_epi64(v_src_raw[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = 
_mm256_permute4x64_epi64(v_src_raw[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_permute4x64_epi64(v_src_raw[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_permute4x64_epi64(v_src_raw[3], _MM_SHUFFLE(3, 1, 2, 0)); + + v_src[0] = _mm256_shuffle_epi8(v_src[0], v_shuffle); + v_src[1] = _mm256_shuffle_epi8(v_src[1], v_shuffle); + v_src[2] = _mm256_shuffle_epi8(v_src[2], v_shuffle); + v_src[3] = _mm256_shuffle_epi8(v_src[3], v_shuffle); + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + dst[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + dst[2] = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + dst[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); +} + +static void fast_inverse_tr_8x8_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[4]; + v_src[0] = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[1] = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[2] = _mm256_shuffle_epi32(src[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_src[3] = _mm256_shuffle_epi32(src[3], _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i v_tmp0 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + __m256i v_tmp1 = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + __m256i v_tmp2 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + __m256i v_tmp3 = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + + v_src[0] = _mm256_unpacklo_epi64(v_tmp0, v_tmp2); + v_src[1] = _mm256_unpackhi_epi64(v_tmp0, v_tmp2); + v_src[2] = _mm256_unpacklo_epi64(v_tmp1, v_tmp3); + v_src[3] = _mm256_unpackhi_epi64(v_tmp1, v_tmp3); + + + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int i = 0; i < 8; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src[0], v_c_ptr[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src[1], v_c_ptr[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src[2], v_c_ptr[2]); + v_madd_3[i] = _mm256_madd_epi16(v_src[3], v_c_ptr[3]); + v_c_ptr += 4; + } + + __m256i v_add[8]; + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_add[i] = _mm256_add_epi32(v_add_0, v_add_1); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_add[i], debias, shift); + } + + __m256i v_result[4]; + v_result[0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + v_result[1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + v_result[2] = 
_mm256_packs_epi32(v_trunc[4], v_trunc[5]); + v_result[3] = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_shuffle_epi8(v_result[0], v_res_shuffle); + v_result[1] = _mm256_shuffle_epi8(v_result[1], v_res_shuffle); + v_result[2] = _mm256_shuffle_epi8(v_result[2], v_res_shuffle); + v_result[3] = _mm256_shuffle_epi8(v_result[3], v_res_shuffle); + + __m256i v_rtmp0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp1 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp2 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp3 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + + __m256i v_tmp20 = _mm256_unpacklo_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp21 = _mm256_unpackhi_epi64(v_rtmp0, v_rtmp2); + __m256i v_tmp22 = _mm256_unpacklo_epi64(v_rtmp1, v_rtmp3); + __m256i v_tmp23 = _mm256_unpackhi_epi64(v_rtmp1, v_rtmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp20, v_tmp21, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp22, v_tmp23, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_8x8_coeff_hor; + const int16_t* ver_coeff = fi_dct2_8x8_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_8x8_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x8_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x8_coeff_hor; + } + + __m256i v_hor_pass_out[4]; + fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); +} + + +void fast_forward_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_8xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_8xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_8x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_8x16_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_tr_8xN_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  // Can use same shuffles as 8x4
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_shuffle);
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_8x4_ver_pass_result_shuffle);
+  //const __m256i* v_coeff = (const __m256i*)ver_coeff;
+  const int32_t *line_coeff = (const int32_t*)ver_coeff;
+
+  // Multiply+add all source vectors with coeff vectors
+  __m256i v_madd[8][16];
+  __m256i* v_src_ptr = v_hor_pass_out;
+  for (int i = 0; i < 8; ++i) {
+    __m256i v_src_per = _mm256_permute4x64_epi64(v_src_ptr[0], _MM_SHUFFLE(3, 1, 2, 0));
+    // Weave lo and hi halves of each 128 bit lane
+    __m256i v_src = _mm256_shuffle_epi8(v_src_per, v_shuffle);
+
+    for (int ii = 0; ii < 16; ++ii) {
+      //int coeff_row = ii * 8 + i;
+      const int32_t coeff = line_coeff[ii];
+      const __m256i v_coeff = _mm256_set1_epi32(coeff);
+      v_madd[i][ii] = _mm256_madd_epi16(v_src, v_coeff);
+    }
+    line_coeff += 16;
+    v_src_ptr += 1;
+  }
+
+  // Add vectors
+  __m256i v_add_0[4][16];
+  for (int i = 0; i < 4; ++i) {
+    for (int ii = 0; ii < 16; ++ii) {
+      int offset = i * 2;
+      v_add_0[i][ii] = _mm256_add_epi32(v_madd[offset][ii], v_madd[offset + 1][ii]);
+    }
+  }
+  // Second round of additions
+  __m256i v_add_1[2][16];
+  for (int i = 0; i < 2; ++i) {
+    for (int ii = 0; ii < 16; ++ii) {
+      int offset = i * 2;
+      v_add_1[i][ii] = _mm256_add_epi32(v_add_0[offset][ii], v_add_0[offset + 1][ii]);
+    }
+  }
+  // Third round of additions
+  __m256i v_trunc[16];
+  for (int ii = 0; ii < 16; ++ii) {
+    v_trunc[ii] = _mm256_add_epi32(v_add_1[0][ii], v_add_1[1][ii]);
+    v_trunc[ii] = truncate_avx2(v_trunc[ii], debias, shift_2nd);
+  }
+
+
+  for (int i = 0; i < 16; i += 2) {
+    __m256i v_result = _mm256_packs_epi32(v_trunc[i], v_trunc[i + 1]);
+
+    // Swap each middle 64 bit chunk in both 128 bit lanes
+    v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+    // Swap each middle 16 bit value in each 64 bit chunk
+    v_result = _mm256_shuffle_epi8(v_result, v_res_shuffle);
+
+    _mm256_store_si256((__m256i*)dst, v_result);
+    dst += 16;
+  }
+}
+
+
+static void fast_inverse_tr_8x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i* v_src_raw = (const __m256i*)src;
+  const __m256i v_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415);
+
+  __m256i v_tmp[8];
+  for (int i = 0; i < 8; ++i) {
+    v_tmp[i] = _mm256_permute4x64_epi64(v_src_raw[i], _MM_SHUFFLE(3, 1, 2, 0));
+  }
+
+  __m256i v_src[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src[i] = _mm256_shuffle_epi8(v_tmp[i], v_shuffle);
+  }
+
+  __m256i v_trunc[16];
+  for (int c = 0; c < 16; c++) {
+    __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]);
+    __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]);
+    __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]);
+    __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]);
+    __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff[4]);
+    __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff[5]);
+    __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff[6]);
+    __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff[7]);
+
+    v_coeff += 8;
+
+    __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1);
+    __m256i v_add_01 =
_mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + } +} + +static void fast_inverse_tr_8x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + __m256i v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = _mm256_shuffle_epi32(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_tmp[8]; + v_tmp[0] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x20); + v_tmp[2] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x20); + v_tmp[4] = _mm256_permute2x128_si256(v_src[0], v_src[1], 0x31); + v_tmp[5] = _mm256_permute2x128_si256(v_src[2], v_src[3], 0x31); + v_tmp[6] = _mm256_permute2x128_si256(v_src[4], v_src[5], 0x31); + v_tmp[7] = _mm256_permute2x128_si256(v_src[6], v_src[7], 0x31); + + v_src[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_src[1] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_src[2] = _mm256_unpacklo_epi32(v_tmp[4], v_tmp[5]); + v_src[3] = _mm256_unpackhi_epi32(v_tmp[4], v_tmp[5]); + v_src[4] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_src[5] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + v_src[6] = _mm256_unpacklo_epi32(v_tmp[6], v_tmp[7]); + v_src[7] = _mm256_unpackhi_epi32(v_tmp[6], v_tmp[7]); + + __m256i v_trunc[2][8]; + for (int d = 0, s = 0; d < 2; ++d, s += 4) { + const __m256i* v_c_ptr = v_coeff; + __m256i v_madd_0[8]; + __m256i v_madd_1[8]; + __m256i v_madd_2[8]; + __m256i v_madd_3[8]; + for (int c = 0; c < 8; ++c) { + v_madd_0[c] = _mm256_madd_epi16(v_src[s + 0], v_c_ptr[0]); + v_madd_1[c] = _mm256_madd_epi16(v_src[s + 1], v_c_ptr[1]); + v_madd_2[c] = _mm256_madd_epi16(v_src[s + 2], v_c_ptr[2]); + v_madd_3[c] = _mm256_madd_epi16(v_src[s + 3], v_c_ptr[3]); + v_c_ptr += 4; + } + + for (int i = 0; i < 8; ++i) { + __m256i v_add_0 = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + __m256i v_add_1 = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + v_trunc[d][i] = truncate_avx2(_mm256_add_epi32(v_add_0, v_add_1), debias, shift); + } + } + + __m256i v_rtmp[8]; + v_rtmp[0] = _mm256_packs_epi32(v_trunc[0][0], v_trunc[0][1]); + v_rtmp[1] = _mm256_packs_epi32(v_trunc[0][2], v_trunc[0][3]); + v_rtmp[2] = _mm256_packs_epi32(v_trunc[0][4], v_trunc[0][5]); + v_rtmp[3] = _mm256_packs_epi32(v_trunc[0][6], v_trunc[0][7]); + v_rtmp[4] = _mm256_packs_epi32(v_trunc[1][0], v_trunc[1][1]); + v_rtmp[5] = _mm256_packs_epi32(v_trunc[1][2], v_trunc[1][3]); + v_rtmp[6] = _mm256_packs_epi32(v_trunc[1][4], v_trunc[1][5]); + v_rtmp[7] = _mm256_packs_epi32(v_trunc[1][6], v_trunc[1][7]); + + for (int i = 0; i < 8; ++i) { + v_rtmp[i] = _mm256_shuffle_epi8(v_rtmp[i], v_res_shuffle); + } + + __m256i v_tmp32_lo0 = _mm256_unpacklo_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_lo1 = _mm256_unpacklo_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_lo2 = 
_mm256_unpacklo_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_lo3 = _mm256_unpacklo_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp32_hi0 = _mm256_unpackhi_epi32(v_rtmp[0], v_rtmp[1]); + __m256i v_tmp32_hi1 = _mm256_unpackhi_epi32(v_rtmp[2], v_rtmp[3]); + __m256i v_tmp32_hi2 = _mm256_unpackhi_epi32(v_rtmp[4], v_rtmp[5]); + __m256i v_tmp32_hi3 = _mm256_unpackhi_epi32(v_rtmp[6], v_rtmp[7]); + + __m256i v_tmp64_lo0 = _mm256_unpacklo_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_lo1 = _mm256_unpacklo_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_lo2 = _mm256_unpacklo_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_lo3 = _mm256_unpacklo_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_tmp64_hi0 = _mm256_unpackhi_epi64(v_tmp32_lo0, v_tmp32_lo1); + __m256i v_tmp64_hi1 = _mm256_unpackhi_epi64(v_tmp32_hi0, v_tmp32_hi1); + __m256i v_tmp64_hi2 = _mm256_unpackhi_epi64(v_tmp32_lo2, v_tmp32_lo3); + __m256i v_tmp64_hi3 = _mm256_unpackhi_epi64(v_tmp32_hi2, v_tmp32_hi3); + + __m256i v_result[8]; + v_result[0] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp64_lo0, v_tmp64_lo1, 0x31); + v_result[2] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp64_hi0, v_tmp64_hi1, 0x31); + v_result[4] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp64_lo2, v_tmp64_lo3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x20); + v_result[7] = _mm256_permute2x128_si256(v_tmp64_hi2, v_tmp64_hi3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_8x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_16x8_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x8_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x8_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x8_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x8_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_8x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x16_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0);
+
+  const int reduced_line = width - skip_width;
+  const int cutoff = height - skip_height;
+  int16_t* p_dst = dst;
+
+  const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
+  const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
+  const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8;
+  const int32_t shift_2nd = log2_height_minus1 + 7;
+
+  const int16_t* hor_coeff = ff_dct2_8xN_coeff_hor;
+  const int16_t* ver_coeff = ff_dct2_8x32_coeff_ver;
+  if (hor == DST7) {
+    hor_coeff = ff_dst7_8xN_coeff_hor;
+  } else if (hor == DCT8) {
+    hor_coeff = ff_dct8_8xN_coeff_hor;
+  }
+  if (ver == DST7) {
+    ver_coeff = ff_dst7_8x32_coeff_ver;
+  } else if (ver == DCT8) {
+    ver_coeff = ff_dct8_8x32_coeff_ver;
+  }
+
+  ALIGNED(32) int16_t v_hor_pass_out[8 * 32];
+  fast_forward_tr_8xN_avx2_hor(src, (__m256i *)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+
+  __m256i temp_out[16];
+  // Vertical pass
+  const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  for (int j = 0; j < 8; ++j) {
+    __m256i res_0 = _mm256_setzero_si256();
+    __m256i res_1 = _mm256_setzero_si256();
+    __m256i res_2 = _mm256_setzero_si256();
+    __m256i res_3 = _mm256_setzero_si256();
+    const int16_t* coeff_start = ver_coeff;
+    for (int i = 0; i < 16; ++i) {
+      int16_t source[2];
+      source[0] = v_hor_pass_out[j + i * 16];
+      source[1] = v_hor_pass_out[j + i * 16 + 8];
+      int32_t paired_source;
+      memcpy(&paired_source, source, sizeof(int32_t));
+
+      __m256i v_src = _mm256_set1_epi32(paired_source);
+      __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start);
+      coeff_start += 16;
+      __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start);
+      coeff_start += 16;
+      __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start);
+      coeff_start += 16;
+      __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start);
+      coeff_start += 16;
+
+      __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0);
+      __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1);
+      __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2);
+      __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3);
+
+      res_0 = _mm256_add_epi32(res_0, madd_0);
+      res_1 = _mm256_add_epi32(res_1, madd_1);
+      res_2 = _mm256_add_epi32(res_2, madd_2);
+      res_3 = _mm256_add_epi32(res_3, madd_3);
+    }
+    __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd);
+    __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd);
+    __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd);
+    __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd);
+
+    v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
+    v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3);
+    v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0));
+    v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0));
+    _mm256_store_si256(temp_out + 2 * j, v_trunc_0);
+    _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1);
+  }
+  transpose_avx2(temp_out, (__m256i*) dst, 32, 8);
+  if (skip_width) {
+    dst = p_dst + reduced_line;
+    for (int j = 0; j < cutoff; j++)
+    {
+      memset(dst, 0, sizeof(int16_t) * skip_width);
+      dst += width;
+    }
+  }
+
+  if (skip_height) {
+    dst = p_dst + width * cutoff;
+    memset(dst, 0, sizeof(int16_t) * width * skip_height);
+  }
+}
+
+
+static void fast_inverse_tr_8x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int
skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time
+  const __m256i* v_src_raw = (const __m256i*)src;
+
+  __m256i v_tmp[16];
+  for (int i = 0; i < 16; i += 2) {
+    v_tmp[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20);
+    v_tmp[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31);
+  }
+
+  __m256i v_tmp16_lo[8];
+  __m256i v_tmp16_hi[8];
+  for (int d = 0, s = 0; d < 8; ++d, s += 2) {
+    v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_tmp[s + 0], v_tmp[s + 1]);
+    v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_tmp[s + 0], v_tmp[s + 1]);
+  }
+
+  __m256i v_src[16];
+  for (int d = 0, s = 0; d < 16; d += 2, ++s) {
+    v_src[d + 0] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x20);
+    v_src[d + 1] = _mm256_permute2x128_si256(v_tmp16_lo[s], v_tmp16_hi[s], 0x31);
+  }
+
+  __m256i v_trunc[32];
+
+  for (int row = 0; row < 32; ++row) {
+    __m256i v_res = _mm256_setzero_si256();
+    for (int i = 0; i < 16; ++i) {
+      __m256i v_coeff = _mm256_set1_epi32(*c_ptr);
+      __m256i v_madd = _mm256_madd_epi16(v_src[i], v_coeff);
+      v_res = _mm256_add_epi32(v_res, v_madd);
+      c_ptr++;
+    }
+
+    v_trunc[row] = truncate_avx2(v_res, debias, shift);
+  }
+
+  for (int d = 0, s = 0; d < 16; ++d, s += 2) {
+    dst[d] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]);
+  }
+}
+
+static void fast_inverse_tr_8x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff; // Coeffs are handled as full 256 bit vectors here, one vector per madd
+  const __m256i* v_src_raw = src;
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0246);
+
+  __m256i v_src[16];
+  for (int i = 0; i < 16; i += 2) {
+    v_src[i + 0] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x20);
+    v_src[i + 1] = _mm256_permute2x128_si256(v_src_raw[i + 0], v_src_raw[i + 1], 0x31);
+  }
+
+  __m256i v_tmp[16];
+  for (int s = 0; s < 16; s += 2) {
+    __m256i v_add[8];
+    for (int d = 0, c = 0; d < 8; ++d, c += 2) {
+      __m256i v_madd_0 = _mm256_madd_epi16(v_src[s + 0], v_coeff[c + 0]);
+      __m256i v_madd_1 = _mm256_madd_epi16(v_src[s + 1], v_coeff[c + 1]);
+
+      v_add[d] = _mm256_add_epi32(v_madd_0, v_madd_1);
+    }
+
+    __m256i v_hadd[4];
+    v_hadd[0] = _mm256_hadd_epi32(v_add[0], v_add[1]);
+    v_hadd[1] = _mm256_hadd_epi32(v_add[2], v_add[3]);
+    v_hadd[2] = _mm256_hadd_epi32(v_add[4], v_add[5]);
+    v_hadd[3] = _mm256_hadd_epi32(v_add[6], v_add[7]);
+
+    __m256i v_trunc[4];
+    v_trunc[0] = truncate_avx2(v_hadd[0], debias, shift);
+    v_trunc[1] = truncate_avx2(v_hadd[1], debias, shift);
+    v_trunc[2] = truncate_avx2(v_hadd[2], debias, shift);
+    v_trunc[3] = truncate_avx2(v_hadd[3], debias, shift);
+
+    v_tmp[s + 0] = _mm256_packs_epi32(v_trunc[0], v_trunc[1]);
+    v_tmp[s + 1] = _mm256_packs_epi32(v_trunc[2], v_trunc[3]);
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle);
+  }
+
+  __m256i v_tmp64_lo[8];
+  __m256i v_tmp64_hi[8];
+  for (int d = 0, s = 0; d < 8; ++d, s += 2) {
+    v_tmp64_lo[d] = _mm256_unpacklo_epi64(v_tmp[s + 0], v_tmp[s + 1]);
+    v_tmp64_hi[d] = _mm256_unpackhi_epi64(v_tmp[s + 0], v_tmp[s + 1]);
+  }
+
+  __m256i v_result[16];
+  for (int d = 0, s = 0; d < 16; d += 2, ++s) {
+
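+    // Lane recombination note: in _mm256_permute2x128_si256, control 0x20
+    // concatenates the low 128-bit lanes of the two inputs ([lo(a), lo(b)])
+    // and 0x31 the high lanes ([hi(a), hi(b)]), rebuilding full result rows
+    // from the lane-split temporaries.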
v_result[d + 0] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x20); + v_result[d + 1] = _mm256_permute2x128_si256(v_tmp64_lo[s], v_tmp64_hi[s], 0x31); + } + + for (int i = 0; i < 16; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: mts cutoff +} + +void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 8; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table + if (hor == DST7) { + hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[16]; + fast_inverse_tr_8x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_8x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // ISP_TODO: might be faster to load these from arrays + const __m256i v_permute_0 = _mm256_set1_epi32(0); + const __m256i v_permute_1 = _mm256_set1_epi32(1); + const __m256i v_permute_2 = _mm256_set1_epi32(2); + const __m256i v_permute_3 = _mm256_set1_epi32(3); + const __m256i v_permute_4 = _mm256_set1_epi32(4); + const __m256i v_permute_5 = _mm256_set1_epi32(5); + const __m256i v_permute_6 = _mm256_set1_epi32(6); + const __m256i v_permute_7 = _mm256_set1_epi32(7); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const int reduced_line = line - skip_line; + // Handle 1 line at a time, 16 samples per line + for (int j = 0; j < reduced_line; ++j) { + // line 1 + // src vector: [s00 s01 s02 s03 s04 s05 s06 s07 | s08 s09 s10 s11 s12 s13 s14 s15] + __m256i v_src_raw = _mm256_load_si256((const __m256i*)src); + + // Arrange data so calculations can be done column-wise (to avoid using hadds). + // Need 8 source vectors. First will be filled with s00 and s01 pairs. 
Second with s02 and s03 pairs and so on + __m256i v_src_0 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_0); + __m256i v_src_1 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_1); + __m256i v_src_2 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_2); + __m256i v_src_3 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_3); + __m256i v_src_4 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_4); + __m256i v_src_5 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_5); + __m256i v_src_6 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_6); + __m256i v_src_7 = _mm256_permutevar8x32_epi32(v_src_raw, v_permute_7); + + __m256i v_madd_0_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_0_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_0_02 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_0_03 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + __m256i v_madd_0_04 = _mm256_madd_epi16(v_src_4, v_coeff[4]); + __m256i v_madd_0_05 = _mm256_madd_epi16(v_src_5, v_coeff[5]); + __m256i v_madd_0_06 = _mm256_madd_epi16(v_src_6, v_coeff[6]); + __m256i v_madd_0_07 = _mm256_madd_epi16(v_src_7, v_coeff[7]); + + __m256i v_madd_0_08 = _mm256_madd_epi16(v_src_0, v_coeff[8]); + __m256i v_madd_0_09 = _mm256_madd_epi16(v_src_1, v_coeff[9]); + __m256i v_madd_0_10 = _mm256_madd_epi16(v_src_2, v_coeff[10]); + __m256i v_madd_0_11 = _mm256_madd_epi16(v_src_3, v_coeff[11]); + __m256i v_madd_0_12 = _mm256_madd_epi16(v_src_4, v_coeff[12]); + __m256i v_madd_0_13 = _mm256_madd_epi16(v_src_5, v_coeff[13]); + __m256i v_madd_0_14 = _mm256_madd_epi16(v_src_6, v_coeff[14]); + __m256i v_madd_0_15 = _mm256_madd_epi16(v_src_7, v_coeff[15]); + + __m256i v_madd_1_0 = _mm256_add_epi32(v_madd_0_00, v_madd_0_01); + __m256i v_madd_1_1 = _mm256_add_epi32(v_madd_0_02, v_madd_0_03); + __m256i v_madd_1_2 = _mm256_add_epi32(v_madd_0_04, v_madd_0_05); + __m256i v_madd_1_3 = _mm256_add_epi32(v_madd_0_06, v_madd_0_07); + __m256i v_madd_1_4 = _mm256_add_epi32(v_madd_0_08, v_madd_0_09); + __m256i v_madd_1_5 = _mm256_add_epi32(v_madd_0_10, v_madd_0_11); + __m256i v_madd_1_6 = _mm256_add_epi32(v_madd_0_12, v_madd_0_13); + __m256i v_madd_1_7 = _mm256_add_epi32(v_madd_0_14, v_madd_0_15); + + __m256i v_madd_2_0 = _mm256_add_epi32(v_madd_1_0, v_madd_1_1); + __m256i v_madd_2_1 = _mm256_add_epi32(v_madd_1_2, v_madd_1_3); + __m256i v_madd_2_2 = _mm256_add_epi32(v_madd_1_4, v_madd_1_5); + __m256i v_madd_2_3 = _mm256_add_epi32(v_madd_1_6, v_madd_1_7); + + __m256i v_madd_3_0 = _mm256_add_epi32(v_madd_2_0, v_madd_2_1); + __m256i v_madd_3_1 = _mm256_add_epi32(v_madd_2_2, v_madd_2_3); + + __m256i v_trunc_0 = truncate_avx2(v_madd_3_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_madd_3_1, debias, shift); + + __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + dst[0] = v_result; + + src += 16; + dst++; + } +} + +void fast_forward_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x2_coeff_ver; + if (hor == DST7) { + hor_coeff = 
ff_dst7_16xN_coeff_hor; + } + + __m256i v_hor_pass_out[2]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 2 source vectors + // Unpack -> samples to be added are adjacent + __m256i v_src_hi = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + + __m256i v_madd_hi_0 = _mm256_madd_epi16(v_src_hi, v_coeff[0]); + __m256i v_madd_hi_1 = _mm256_madd_epi16(v_src_hi, v_coeff[1]); + __m256i v_madd_lo_0 = _mm256_madd_epi16(v_src_lo, v_coeff[0]); + __m256i v_madd_lo_1 = _mm256_madd_epi16(v_src_lo, v_coeff[1]); + + __m256i v_trunc_hi_0 = truncate_avx2(v_madd_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_madd_hi_1, debias, shift_2nd); + __m256i v_trunc_lo_0 = truncate_avx2(v_madd_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_madd_lo_1, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); +} + + +static void fast_inverse_tr_16x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i v_src_0 = _mm256_load_si256((const __m256i*) & src[0]); + const __m256i v_src_1 = _mm256_load_si256((const __m256i*) & src[16]); + + const __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_0, v_src_1); + const __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_0, v_src_1); + + __m256i v_trunc_0 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_0), debias, shift); + __m256i v_trunc_1 = truncate_avx2(_mm256_madd_epi16(v_src_lo, v_coeff_1), debias, shift); + __m256i v_trunc_2 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_0), debias, shift); + __m256i v_trunc_3 = truncate_avx2(_mm256_madd_epi16(v_src_hi, v_coeff_1), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); +} + +static void fast_inverse_tr_16x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_madd_e[16]; + __m256i v_madd_o[16]; + for (int i = 0, c = 0; i < 16; ++i, c += 2) { + v_madd_e[i] = _mm256_madd_epi16(src[0], v_coeff[c + 0]); + v_madd_o[i] = _mm256_madd_epi16(src[1], v_coeff[c + 1]); + } + + __m256i v_add[16]; + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_add_epi32(v_madd_e[i], v_madd_o[i]); + } + + for (int i = 0; i < 16; ++i) { + v_add[i] = _mm256_permute4x64_epi64(v_add[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_hadd_0[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[4]; + for (int src = 0, dst = 0; dst < 4; ++dst, src += 2) { + v_trunc[dst] = truncate_avx2(_mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]), debias, shift); + } + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + + __m256i v_result_0 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + __m256i v_result_1 = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + + _mm256_store_si256((__m256i*) & dst[0], v_result_0); + _mm256_store_si256((__m256i*) & dst[16], v_result_1); +} + +void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 2; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename + // DST7 and DCT8 are not defined for this block size + + __m256i v_ver_pass_out[2]; + fast_inverse_tr_16x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + // TODO: might be able to get rid of skips in these tailored solutions + int skip_width = 0; + int skip_height = 0; // This is not used anywhere + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x4_coeff_ver; + } + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 vectors + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[2], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[2]); + __m256i v_madd_hi_02 = _mm256_madd_epi16(v_src_hi_0, v_coeff[4]); + __m256i v_madd_hi_03 = _mm256_madd_epi16(v_src_hi_0, v_coeff[6]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[3]); + __m256i v_madd_hi_12 = _mm256_madd_epi16(v_src_hi_1, v_coeff[5]); + __m256i v_madd_hi_13 = _mm256_madd_epi16(v_src_hi_1, v_coeff[7]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[2]); + __m256i v_madd_lo_02 = _mm256_madd_epi16(v_src_lo_0, v_coeff[4]); + __m256i v_madd_lo_03 = _mm256_madd_epi16(v_src_lo_0, v_coeff[6]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[3]); + __m256i v_madd_lo_12 = _mm256_madd_epi16(v_src_lo_1, v_coeff[5]); + __m256i v_madd_lo_13 = _mm256_madd_epi16(v_src_lo_1, v_coeff[7]); + + __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi_00, v_madd_hi_10); + __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi_01, v_madd_hi_11); + __m256i v_add_hi_2 = _mm256_add_epi32(v_madd_hi_02, v_madd_hi_12); + __m256i v_add_hi_3 = _mm256_add_epi32(v_madd_hi_03, v_madd_hi_13); + + __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo_00, v_madd_lo_10); + __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo_01, v_madd_lo_11); + __m256i v_add_lo_2 = _mm256_add_epi32(v_madd_lo_02, v_madd_lo_12); + __m256i v_add_lo_3 = _mm256_add_epi32(v_madd_lo_03, v_madd_lo_13); + + __m256i v_trunc_hi_0 = truncate_avx2(v_add_hi_0, debias, shift_2nd); + __m256i v_trunc_hi_1 = truncate_avx2(v_add_hi_1, debias, shift_2nd); + __m256i v_trunc_hi_2 = truncate_avx2(v_add_hi_2, debias, shift_2nd); + __m256i v_trunc_hi_3 = truncate_avx2(v_add_hi_3, debias, shift_2nd); + + __m256i v_trunc_lo_0 = truncate_avx2(v_add_lo_0, debias, shift_2nd); + __m256i v_trunc_lo_1 = truncate_avx2(v_add_lo_1, debias, shift_2nd); + __m256i v_trunc_lo_2 = truncate_avx2(v_add_lo_2, debias, shift_2nd); + __m256i v_trunc_lo_3 = truncate_avx2(v_add_lo_3, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_2, v_trunc_hi_2); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_3, v_trunc_hi_3); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_16x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + 
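+  // Layout reminder for the unpacks below: _mm256_unpacklo_epi16(a, b) yields
+  // a0 b0 a1 b1 a2 b2 a3 b3 within each 128-bit lane (unpackhi the upper
+  // halves), so every madd sees adjacent sample pairs from two source rows
+  // against matching coefficient pairs and no horizontal adds are needed.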
const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[2], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[2], v_src_raw[3]); + + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int i = 0; i < 4; i++) { + v_madd_lo_0[i] = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + v_madd_lo_1[i] = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + v_madd_hi_0[i] = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + v_madd_hi_1[i] = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int i = 0; i < 4; ++i) { + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[i], v_madd_lo_1[i]), debias, shift); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[i], v_madd_hi_1[i]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_hi[0]); + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_hi[1]); + dst[2] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_hi[2]); + dst[3] = _mm256_packs_epi32(v_trunc_lo[3], v_trunc_hi[3]); +} + +static void fast_inverse_tr_16x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src_0 = _mm256_permute2x128_si256(src[0], src[2], 0x20); + __m256i v_src_1 = _mm256_permute2x128_si256(src[0], src[2], 0x31); + __m256i v_src_2 = _mm256_permute2x128_si256(src[1], src[3], 0x20); + __m256i v_src_3 = _mm256_permute2x128_si256(src[1], src[3], 0x31); + + __m256i v_madd_0[16]; + __m256i v_madd_1[16]; + __m256i v_madd_2[16]; + __m256i v_madd_3[16]; + for (int i = 0; i < 16; ++i) { + v_madd_0[i] = _mm256_madd_epi16(v_src_0, v_coeff[0]); + v_madd_1[i] = _mm256_madd_epi16(v_src_1, v_coeff[1]); + v_madd_2[i] = _mm256_madd_epi16(v_src_2, v_coeff[0]); + v_madd_3[i] = _mm256_madd_epi16(v_src_3, v_coeff[1]); + + v_coeff += 2; + } + + __m256i v_add_0[16]; + __m256i v_add_1[16]; + for (int i = 0; i < 16; ++i) { + v_add_0[i] = _mm256_add_epi32(v_madd_0[i], v_madd_1[i]); + v_add_1[i] = _mm256_add_epi32(v_madd_2[i], v_madd_3[i]); + + } + + __m256i v_hadd_0[16]; + for (int i = 0; i < 16; ++i) { + v_hadd_0[i] = _mm256_hadd_epi32(v_add_0[i], v_add_1[i]); + } + + __m256i v_hadd_1[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + + __m256i 
v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_16x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 4; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x16_coeff_hor; // TODO: rename coeff tables + const int16_t* hor_coeff = fi_dct2_4x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_4x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_4x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x16_coeff_hor; + } + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_16x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x8_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* line_coeff = (const int32_t*)ver_coeff; + + // Got 8 lines of samples. 
Handle two lines at a time (because of unpack)
+  __m256i v_madd_hi[4][8];
+  __m256i v_madd_lo[4][8];
+  __m256i* v_src_ptr = v_hor_pass_out;
+  for (int i = 0; i < 4; ++i) {
+    __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]);
+    __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]);
+
+    // Apply coefficients
+    for (int ii = 0; ii < 8; ++ii) {
+      const int32_t coeff = line_coeff[ii];
+      const __m256i v_coeff = _mm256_set1_epi32(coeff);
+      v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff);
+      v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff);
+    }
+
+    line_coeff += 8;
+    v_src_ptr += 2;
+  }
+
+  // First round of additions
+  __m256i v_add_hi[2][8];
+  __m256i v_add_lo[2][8];
+  for (int i = 0; i < 2; ++i) {
+    for (int ii = 0; ii < 8; ++ii) {
+      const int offset = i * 2;
+      v_add_hi[i][ii] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]);
+      v_add_lo[i][ii] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]);
+    }
+  }
+
+  // Final round of additions, truncation and store
+  for (int ii = 0; ii < 8; ++ii) {
+    __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi[0][ii], v_add_hi[1][ii]), debias, shift_2nd);
+    __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo[0][ii], v_add_lo[1][ii]), debias, shift_2nd);
+    __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi);
+
+    _mm256_store_si256((__m256i*)dst, v_result);
+    dst += 16;
+  }
+}
+
+
+static void fast_inverse_tr_16x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i* v_src_raw = (const __m256i*)src;
+
+  __m256i v_src_lo[4];
+  __m256i v_src_hi[4];
+  for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) {
+    v_src_lo[dst] = _mm256_unpacklo_epi16(v_src_raw[src + 0], v_src_raw[src + 1]);
+    v_src_hi[dst] = _mm256_unpackhi_epi16(v_src_raw[src + 0], v_src_raw[src + 1]);
+  }
+
+  __m256i v_trunc_lo[8];
+  __m256i v_trunc_hi[8];
+
+  for (int c = 0; c < 8; c++) {
+    __m256i v_madd_lo[4];
+    __m256i v_madd_hi[4];
+    for (int i = 0; i < 4; ++i) {
+      v_madd_lo[i] = _mm256_madd_epi16(v_src_lo[i], v_coeff[i]);
+      v_madd_hi[i] = _mm256_madd_epi16(v_src_hi[i], v_coeff[i]);
+    }
+    v_coeff += 4;
+
+    __m256i v_add_lo_0 = _mm256_add_epi32(v_madd_lo[0], v_madd_lo[1]);
+    __m256i v_add_lo_1 = _mm256_add_epi32(v_madd_lo[2], v_madd_lo[3]);
+
+    __m256i v_add_hi_0 = _mm256_add_epi32(v_madd_hi[0], v_madd_hi[1]);
+    __m256i v_add_hi_1 = _mm256_add_epi32(v_madd_hi[2], v_madd_hi[3]);
+
+    v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_0, v_add_lo_1), debias, shift);
+    v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_0, v_add_hi_1), debias, shift);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]);
+  }
+}
+
+static void fast_inverse_tr_16x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i* v_coeff = (const __m256i*)coeff;
+  const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415);
+
+  __m256i v_tmp32_lo_0 = _mm256_unpacklo_epi32(src[0], src[1]);
+  __m256i v_tmp32_lo_1 = _mm256_unpacklo_epi32(src[2], src[3]);
+  __m256i v_tmp32_lo_2 = _mm256_unpacklo_epi32(src[4], src[5]);
+  __m256i v_tmp32_lo_3 =
_mm256_unpacklo_epi32(src[6], src[7]); + + __m256i v_tmp32_hi_0 = _mm256_unpackhi_epi32(src[0], src[1]); + __m256i v_tmp32_hi_1 = _mm256_unpackhi_epi32(src[2], src[3]); + __m256i v_tmp32_hi_2 = _mm256_unpackhi_epi32(src[4], src[5]); + __m256i v_tmp32_hi_3 = _mm256_unpackhi_epi32(src[6], src[7]); + + __m256i v_tmp64_lo_0 = _mm256_unpacklo_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_lo_1 = _mm256_unpacklo_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_lo_2 = _mm256_unpacklo_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_lo_3 = _mm256_unpacklo_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_tmp64_hi_0 = _mm256_unpackhi_epi64(v_tmp32_lo_0, v_tmp32_lo_1); + __m256i v_tmp64_hi_1 = _mm256_unpackhi_epi64(v_tmp32_lo_2, v_tmp32_lo_3); + __m256i v_tmp64_hi_2 = _mm256_unpackhi_epi64(v_tmp32_hi_0, v_tmp32_hi_1); + __m256i v_tmp64_hi_3 = _mm256_unpackhi_epi64(v_tmp32_hi_2, v_tmp32_hi_3); + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x20); + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_0, v_tmp64_lo_1, 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_0, v_tmp64_hi_1, 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_2, v_tmp64_lo_3, 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_2, v_tmp64_hi_3, 0x31); + + + __m256i v_trunc[16]; + for (int c = 0; c < 16; ++c) { + __m256i v_madd[8]; + for (int i = 0; i < 8; ++i) { + v_madd[i] = _mm256_madd_epi16(v_src[i], v_coeff[i]); + } + v_coeff += 8; + + __m256i v_add_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_0[dst] = _mm256_add_epi32(v_madd[src + 0], v_madd[src + 1]); + } + + __m256i v_add_10 = _mm256_add_epi32(v_add_0[0], v_add_0[1]); + __m256i v_add_11 = _mm256_add_epi32(v_add_0[2], v_add_0[3]); + + v_trunc[c] = truncate_avx2(_mm256_add_epi32(v_add_10, v_add_11), debias, shift); + } + + __m256i v_result[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_result[dst] = _mm256_packs_epi32(v_trunc[src + 0], v_trunc[src + 1]); + } + + for (int i = 0; i < 8; ++i) { + v_result[i] = _mm256_shuffle_epi8(v_result[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo_0 = _mm256_unpacklo_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_lo_1 = _mm256_unpacklo_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_lo_2 = _mm256_unpacklo_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_lo_3 = _mm256_unpacklo_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp32_hi_0 = _mm256_unpackhi_epi32(v_result[0], v_result[1]); + __m256i v_rtmp32_hi_1 = _mm256_unpackhi_epi32(v_result[2], v_result[3]); + __m256i v_rtmp32_hi_2 = _mm256_unpackhi_epi32(v_result[4], v_result[5]); + __m256i v_rtmp32_hi_3 = _mm256_unpackhi_epi32(v_result[6], v_result[7]); + + __m256i v_rtmp64_lo_0 = _mm256_unpacklo_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_lo_1 = _mm256_unpacklo_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_lo_2 = _mm256_unpacklo_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_lo_3 = _mm256_unpacklo_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + __m256i v_rtmp64_hi_0 = _mm256_unpackhi_epi64(v_rtmp32_lo_0, v_rtmp32_lo_1); + __m256i v_rtmp64_hi_1 = _mm256_unpackhi_epi64(v_rtmp32_lo_2, v_rtmp32_lo_3); + __m256i v_rtmp64_hi_2 = _mm256_unpackhi_epi64(v_rtmp32_hi_0, v_rtmp32_hi_1); + __m256i v_rtmp64_hi_3 = 
_mm256_unpackhi_epi64(v_rtmp32_hi_2, v_rtmp32_hi_3); + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo_0, v_rtmp64_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_hi_0, v_rtmp64_hi_1, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_lo_2, v_rtmp64_lo_3, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi_2, v_rtmp64_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_16x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 8; + + int skip_width = 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x16_coeff_hor; + const int16_t* hor_coeff = fi_dct2_8x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_8x16_coeff_ver; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_8x16_coeff_ver; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x16_coeff_hor; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x16_coeff_hor; + } + + __m256i v_ver_pass_out[8]; + fast_inverse_tr_16x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_16x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_forward_DCT2_B16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast into 32 bit integer to read two coeffs at a time + const __m256i* v_src_ptr = v_hor_pass_out; + + __m256i v_madd_lo[8][PART_DIMENSION]; + __m256i v_madd_hi[8][PART_DIMENSION]; + for (int i = 0; i < 8; ++i) { + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + + for (int c = 0; c < PART_DIMENSION; ++c) { + const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]); + v_madd_lo[i][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + v_madd_hi[i][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + } + v_src_ptr += 2; + coeff_ptr += 16; + } + + __m256i v_trunc_lo[PART_DIMENSION]; + __m256i v_trunc_hi[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_add_lo_0[4]; + __m256i v_add_hi_0[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_0[dst] = _mm256_add_epi32(v_madd_lo[src + 0][i], v_madd_lo[src + 1][i]); + v_add_hi_0[dst] = _mm256_add_epi32(v_madd_hi[src + 0][i], v_madd_hi[src + 1][i]); + } + + __m256i v_add_lo_1[2]; + __m256i v_add_hi_1[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_1[dst] = _mm256_add_epi32(v_add_lo_0[src + 0], v_add_lo_0[src + 1]); + v_add_hi_1[dst] = _mm256_add_epi32(v_add_hi_0[src + 0], v_add_hi_0[src + 1]); + } + + v_trunc_lo[i] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0], v_add_lo_1[1]), debias, shift_2nd); + v_trunc_hi[i] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0], v_add_hi_1[1]), debias, shift_2nd); + } + __m256i v_result[PART_DIMENSION]; + for (int i = 0; i < PART_DIMENSION; ++i) { + v_result[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + } + + for (int i = 0; i < PART_DIMENSION; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + } + +#undef NUM_PARTS +#undef PART_DIMENSION + +} + + +static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = (const __m256i*)src; + + //__m256i v_madd_lo[8][16]; + //__m256i v_madd_hi[8][16]; + //for (int s = 0; s < 8; ++s) { + // __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[1]); + // __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[1]); + // v_src_raw += 2; + + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_lo[s][c] = _mm256_madd_epi16(v_src_lo, v_coeff); + // v_madd_hi[s][c] = _mm256_madd_epi16(v_src_hi, v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_lo_0[4][16]; + //__m256i v_add_hi_0[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_lo_0[d][c] = _mm256_add_epi32(v_madd_lo[s + 0][c], v_madd_lo[s + 1][c]); + // v_add_hi_0[d][c] = _mm256_add_epi32(v_madd_hi[s + 0][c], v_madd_hi[s + 1][c]); + // } + //} + + //__m256i v_add_lo_1[2][16]; + //__m256i v_add_hi_1[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // 
for (int c = 0; c < 16; ++c) { + // v_add_lo_1[d][c] = _mm256_add_epi32(v_add_lo_0[s + 0][c], v_add_lo_0[s + 1][c]); + // v_add_hi_1[d][c] = _mm256_add_epi32(v_add_hi_0[s + 0][c], v_add_hi_0[s + 1][c]); + // } + //} + + //__m256i v_trunc_lo[16]; + //__m256i v_trunc_hi[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_lo[c] = truncate_avx2(_mm256_add_epi32(v_add_lo_1[0][c], v_add_lo_1[1][c]), debias, shift); + // v_trunc_hi[c] = truncate_avx2(_mm256_add_epi32(v_add_hi_1[0][c], v_add_hi_1[1][c]), debias, shift); + //} + + //for (int i = 0; i < 16; ++i) { + // dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]); + //} + + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 32]; + source[1] = src[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + dst[j] = packed; + } +} + +static void fast_inverse_tr_16x16_avx2_ver(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + __m256i v_result[16]; + int16_t *src_p = (int16_t*)src; + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src_p[j + i * 32]; + source[1] = src_p[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i coeff_0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i coeff_1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, coeff_0); + __m256i madd1 = _mm256_madd_epi16(v_src, coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i *)dst, packed); + dst += 16; + } + //const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + //const __m256i* v_src_raw = src; + + //// Do a 32-bit transpose to arrange result from previous pass + //__m256i v_tmp32_lo[8]; + //__m256i v_tmp32_hi[8]; + //for (int d = 0, s = 0; d < 8; ++d, s += 2) { + // v_tmp32_lo[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + // v_tmp32_hi[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 1]); + //} + + 
//__m256i v_tmp64_lo[8]; + //__m256i v_tmp64_hi[8]; + //for (int d = 0, s = 0; d < 4; ++d, s += 2) { + // v_tmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_lo[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + + // v_tmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo[s + 0], v_tmp32_lo[s + 1]); + // v_tmp64_hi[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi[s + 0], v_tmp32_hi[s + 1]); + //} + // + //__m256i v_src[16]; + //v_src[ 0] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x20); + //v_src[ 1] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x20); + //v_src[ 2] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x20); + //v_src[ 3] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x20); + //v_src[ 4] = _mm256_permute2x128_si256(v_tmp64_lo[0], v_tmp64_lo[1], 0x31); + //v_src[ 5] = _mm256_permute2x128_si256(v_tmp64_hi[0], v_tmp64_hi[1], 0x31); + //v_src[ 6] = _mm256_permute2x128_si256(v_tmp64_lo[4], v_tmp64_lo[5], 0x31); + //v_src[ 7] = _mm256_permute2x128_si256(v_tmp64_hi[4], v_tmp64_hi[5], 0x31); + + //v_src[ 8] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x20); + //v_src[ 9] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x20); + //v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x20); + //v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x20); + //v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo[2], v_tmp64_lo[3], 0x31); + //v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi[2], v_tmp64_hi[3], 0x31); + //v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo[6], v_tmp64_lo[7], 0x31); + //v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi[6], v_tmp64_hi[7], 0x31); + + //__m256i v_madd_0[8][16]; + //__m256i v_madd_1[8][16]; + //for (int s = 0; s < 8; ++s) { + // for (int c = 0; c < 16; ++c) { + // const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + // v_madd_0[s][c] = _mm256_madd_epi16(v_src[0 + s], v_coeff); + // v_madd_1[s][c] = _mm256_madd_epi16(v_src[8 + s], v_coeff); + // c_ptr++; + // } + //} + + //__m256i v_add_00[4][16]; + //__m256i v_add_01[4][16]; + //for (int s = 0, d = 0; d < 4; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_00[d][c] = _mm256_add_epi32(v_madd_0[s + 0][c], v_madd_0[s + 1][c]); + // v_add_01[d][c] = _mm256_add_epi32(v_madd_1[s + 0][c], v_madd_1[s + 1][c]); + // } + //} + + //__m256i v_add_10[2][16]; + //__m256i v_add_11[2][16]; + //for (int s = 0, d = 0; d < 2; ++d, s += 2) { + // for (int c = 0; c < 16; ++c) { + // v_add_10[d][c] = _mm256_add_epi32(v_add_00[s + 0][c], v_add_00[s + 1][c]); + // v_add_11[d][c] = _mm256_add_epi32(v_add_01[s + 0][c], v_add_01[s + 1][c]); + // } + //} + + //__m256i v_trunc_0[16]; + //__m256i v_trunc_1[16]; + //for (int c = 0; c < 16; ++c) { + // v_trunc_0[c] = truncate_avx2(_mm256_add_epi32(v_add_10[0][c], v_add_10[1][c]), debias, shift); + // v_trunc_1[c] = truncate_avx2(_mm256_add_epi32(v_add_11[0][c], v_add_11[1][c]), debias, shift); + //} + + //__m256i v_result[16]; + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_packs_epi32(v_trunc_0[d], v_trunc_1[d]); + //} + //for (int d = 0; d < 16; ++d) { + // v_result[d] = _mm256_permute4x64_epi64(v_result[d], _MM_SHUFFLE(3, 1, 2, 0)); + //} + + //transpose_avx2(v_result, (__m256i*)dst, 16, 16); +} + +void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 16; + + int skip_width = 0; + int skip_height = 0; + + const int32_t 
shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + const int16_t* ver_coeff = fi_dct2_16x16_coeff_ver; + if (hor == DST7) { + hor_coeff = fi_dst7_16x16_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x16_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_16x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_16x16_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height); +} + + +void fast_forward_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_16xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_16x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_16xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_16xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_16x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_16x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[32*16]; + fast_forward_DCT2_B16_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + + __m256i temp_out[32]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + if(ver == DCT2) { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_1); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } + else { + for (int j = 0; j < 16; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 32]; + source[1] = v_hor_pass_out[j + i * 32 + 16]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 16); + } +#if 0 + // Controls into how many parts the vertical pass is split. + // At least in my testing, there seems to be no further gain from splitting into more than 4 parts. 
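 + // A rough sketch of the partitioning arithmetic in this disabled variant, assuming NUM_PARTS divides 32 evenly: + // each part computes PART_DIMENSION = 32 / NUM_PARTS output rows, so with NUM_PARTS = 4, part 1 would cover + // output rows 8..15 and read coefficient pairs 8..15 of every 32-pair coefficient line (line_coeff below + // starts at ver_coeff + PART_DIMENSION * part). 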
+#define NUM_PARTS 4 +#define PART_DIMENSION (32/NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got 32 / NUM_PARTS lines of samples. Handle two lines at a time (because of unpack) + __m256i v_madd_hi[16][PART_DIMENSION]; + __m256i v_madd_lo[16][PART_DIMENSION]; + // Samples are the same between the parts + __m256i* v_src_ptr = v_hor_pass_out; + // However for coefficients, the starting point needs to be adjusted + const int32_t* line_coeff = (const int32_t*)ver_coeff + PART_DIMENSION * part; + for (int i = 0; i < 16; ++i) { + __m256i v_src_hi = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[1]); + __m256i v_src_lo = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[1]); + + // Apply coefficients + // TODO: Here try loading the coefficient directly instead of set1 + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_hi[i][ii] = _mm256_madd_epi16(v_src_hi, v_coeff); + v_madd_lo[i][ii] = _mm256_madd_epi16(v_src_lo, v_coeff); + } + + line_coeff += 32; + v_src_ptr += 2; + } + + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + // First round of additions + __m256i v_add_hi_0[8]; + __m256i v_add_lo_0[8]; + for (int i = 0; i < 8; ++i) { + const int offset = i * 2; + v_add_hi_0[i] = _mm256_add_epi32(v_madd_hi[offset][ii], v_madd_hi[offset + 1][ii]); + v_add_lo_0[i] = _mm256_add_epi32(v_madd_lo[offset][ii], v_madd_lo[offset + 1][ii]); + } + + // Second round of additions + __m256i v_add_hi_1[4]; + __m256i v_add_lo_1[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_add_hi_1[i] = _mm256_add_epi32(v_add_hi_0[offset], v_add_hi_0[offset + 1]); + v_add_lo_1[i] = _mm256_add_epi32(v_add_lo_0[offset], v_add_lo_0[offset + 1]); + } + + // Third round of additions + __m256i v_add_hi_2[2]; + __m256i v_add_lo_2[2]; + for (int i = 0; i < 2; ++i) { + const int offset = i * 2; + v_add_hi_2[i] = _mm256_add_epi32(v_add_hi_1[offset], v_add_hi_1[offset + 1]); + v_add_lo_2[i] = _mm256_add_epi32(v_add_lo_1[offset], v_add_lo_1[offset + 1]); + } + + // Final round of additions, truncate and store + __m256i v_trunc_hi = truncate_avx2(_mm256_add_epi32(v_add_hi_2[0], v_add_hi_2[1]), debias, shift_2nd); + __m256i v_trunc_lo = truncate_avx2(_mm256_add_epi32(v_add_lo_2[0], v_add_lo_2[1]), debias, shift_2nd); + __m256i v_result = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + _mm256_store_si256((__m256i*)dst, v_result); + + dst += 16; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +#endif + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_16x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vectors at a time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_tmp16_lo[16]; + __m256i v_tmp16_hi[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_tmp16_lo[d] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 1]); + v_tmp16_hi[d] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 1]); + } + int row = 0; + for (; row < 32 - skip_line2; ++row) { + __m256i v_res_lo = 
_mm256_setzero_si256(); + __m256i v_res_hi = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_lo = _mm256_madd_epi16(v_tmp16_lo[i], v_coeff); + __m256i v_madd_hi = _mm256_madd_epi16(v_tmp16_hi[i], v_coeff); + c_ptr++; + + v_res_lo = _mm256_add_epi32(v_res_lo, v_madd_lo); + v_res_hi = _mm256_add_epi32(v_res_hi, v_madd_hi); + } + + __m256i v_trunc_lo = truncate_avx2(v_res_lo, debias, shift); + __m256i v_trunc_hi = truncate_avx2(v_res_hi, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_lo, v_trunc_hi); + dst[row] = packed; + } + + for (; row < 32; ++row) { + dst[row] = _mm256_setzero_si256(); + } +} + +static void fast_inverse_tr_16x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + int32_t * src_32 = (int32_t *)src; + for (int row = 0; row < 32; ++row) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i *coeff_start = (__m256i*) coeff; + for (int i = 0; i < 8; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_madd_0 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + __m256i v_madd_1 = _mm256_madd_epi16(v_src, _mm256_load_si256(coeff_start)); + coeff_start++; + + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*) dst + row, packed); + } +} + +void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 16; + const int height = 32; + + int skip_width = 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = (const int16_t*)uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs + } else if (hor == DCT8) { + hor_coeff = fi_dct8_16x32_coeff_hor; + } + if (ver == DST7) { + ver_coeff = (const int16_t*)uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = (const int16_t*)uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_16x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + fast_inverse_tr_16x32_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static void fast_forward_DCT2_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int reduced_line = line - skip_line; + + for(int j = 0; j < reduced_line; ++j) { + int32_t source[16]; + memcpy(source, src, sizeof(int16_t) * 32); + src += 32; + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t *coeff_start = coeff; + for(int i = 0; i < 16; i++) { + __m256i v_src = _mm256_set1_epi32(source[i]); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + __m256i madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_1 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if(line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1 = _mm256_permute4x64_epi64(v_trunc_1, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm256_store_si256(dst, v_trunc_0); + dst++; + _mm256_store_si256(dst, v_trunc_1); + dst++; + } +} + +static void fast_forward_DCT8_B32_avx2_hor(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = (shift > 0) ? 
(1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const int cutoff = 32 - skip_line2; + const int reduced_line = line - skip_line; + + ALIGNED(32) int16_t temp_source[32 * 32]; + __m256i* v_src_p = (__m256i*) src; + for (int i = 0; i < reduced_line / 2; ++i) { + __m256i first_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i first_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4], v_src_p[i * 4 + 2]); + __m256i second_half_lo = _mm256_unpacklo_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + __m256i second_half_hi = _mm256_unpackhi_epi32(v_src_p[i * 4 + 1], v_src_p[i * 4 + 3]); + + _mm256_store_si256((__m256i*)temp_source + i * 4, first_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 1, first_half_hi); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 2, second_half_lo); + _mm256_store_si256((__m256i*)temp_source + i * 4 + 3, second_half_hi); + } + + for (int j = 0; j < reduced_line / 2; ++j) { + + int32_t source[32]; + memcpy(source, temp_source + 64 * j, sizeof(int16_t) * 64); + + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = coeff; + + for (int i = 0; i < 32; i += 2) { + __m256i v_src0 = _mm256_set1_epi32(source[i]); + __m256i v_src1 = _mm256_set1_epi32(source[i + 1]); + + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 48; + + __m256i madd_0 = _mm256_madd_epi16(v_src0, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src0, v_coeff_1); + __m256i madd_2 = _mm256_madd_epi16(v_src1, v_coeff_0); + __m256i madd_3 = _mm256_madd_epi16(v_src1, v_coeff_1); + + res_0 = _mm256_add_epi32(madd_0, res_0); + res_1 = _mm256_add_epi32(madd_1, res_1); + res_2 = _mm256_add_epi32(madd_2, res_2); + res_3 = _mm256_add_epi32(madd_3, res_3); + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(res_3, debias, shift); + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + + if (line == 32) { + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0)); + } + _mm256_store_si256(dst, v_trunc_0); + dst+=2; + _mm256_store_si256(dst, v_trunc_2); + dst+=2; + } +} + + +static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_ptr = src; + + // Prepare coeffs + // TODO: either rename these old coeff tables to be consistent with other new avx2 functions + // or construct them here in place. 
Should be easy to accomplish with set1_epi32, just use an int32_t combined from two int16_t values + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[16]); + + // Got data for 4 vectors, 32 lines with 2 samples each + __m256i v_result_e[4]; + __m256i v_result_o[4]; + for (int j = 0; j < 4; ++j) { + const __m256i v_src = v_src_ptr[0]; + + v_result_e[j] = truncate_avx2(_mm256_madd_epi16(v_src, v_coeff_0), debias, shift); + v_result_o[j] = truncate_avx2(_mm256_madd_epi16(v_src, v_coeff_1), debias, shift); + + v_src_ptr++; + } + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_packs_epi32(v_result_e[0], v_result_e[1]); + v_tmp[1] = _mm256_packs_epi32(v_result_e[2], v_result_e[3]); + v_tmp[2] = _mm256_packs_epi32(v_result_o[0], v_result_o[1]); + v_tmp[3] = _mm256_packs_epi32(v_result_o[2], v_result_o[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp[2], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp[3], _MM_SHUFFLE(3, 1, 2, 0)); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_tmp[i]); + } +} + +static void fast_forward_DCT2_32x4_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Got data for 8 vectors, 32 lines with 4 samples each + + // Prepare coeffs + const int16_t* coeff = (const int16_t*)uvg_g_dct_4; + const int a = coeff[0]; + const int b = coeff[1 * 4 + 0]; + const int c = coeff[1 * 4 + 1]; + + __m256i v_coeff_0 = _mm256_set1_epi16(a); + __m256i v_coeff_1 = _mm256_setr_epi16(b, c, -c, -b, b, c, -c, -b, b, c, -c, -b, b, c, -c, -b); + __m256i v_coeff_2 = _mm256_setr_epi16(a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a); + __m256i v_coeff_3 = _mm256_setr_epi16(c, -b, b, -c, c, -b, b, -c, c, -b, b, -c, c, -b, b, -c); + + const __m256i* v_src_ptr = src; + __m256i v_trunc_0[8]; + __m256i v_trunc_1[8]; + for (int j = 0; j < 8; ++j) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_ptr[0], v_coeff_3); + + v_trunc_0[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_0, v_madd_1), debias, shift); + v_trunc_1[j] = truncate_avx2(_mm256_hadd_epi32(v_madd_2, v_madd_3), debias, shift); + + v_src_ptr++; + } + + __m256i v_result[8]; + __m256i v_tmp[8]; + for (int i = 0; i < 8; ++i) { + v_trunc_0[i] = _mm256_permute4x64_epi64(v_trunc_0[i], _MM_SHUFFLE(3, 1, 2, 0)); + v_trunc_1[i] = _mm256_permute4x64_epi64(v_trunc_1[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + v_tmp[0] = _mm256_packs_epi32(v_trunc_0[0], v_trunc_0[1]); + v_tmp[1] = _mm256_packs_epi32(v_trunc_0[2], v_trunc_0[3]); + v_tmp[2] = _mm256_packs_epi32(v_trunc_0[4], v_trunc_0[5]); + v_tmp[3] = _mm256_packs_epi32(v_trunc_0[6], v_trunc_0[7]); + v_tmp[4] = _mm256_packs_epi32(v_trunc_1[0], v_trunc_1[1]); + v_tmp[5] = _mm256_packs_epi32(v_trunc_1[2], v_trunc_1[3]); + v_tmp[6] = _mm256_packs_epi32(v_trunc_1[4], v_trunc_1[5]); + v_tmp[7] = _mm256_packs_epi32(v_trunc_1[6], v_trunc_1[7]); + + v_result[0] = _mm256_permute2x128_si256(v_tmp[0], 
v_tmp[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31); + + v_result[4] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_tmp[4], v_tmp[5], 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp[6], v_tmp[7], 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)&dst[i * 16], v_result[i]); + } +} + + +static void fast_forward_DCT2_32x8_avx2_ver(const __m256i* src, int16_t* dst, int32_t shift, int line, int skip_line, int skip_line2) +{ + int16_t* const p_dst = dst; + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + // Re-use coeff table + const __m256i* v_coeff = (const __m256i*)ff_dct2_16x8_coeff_ver; + + const int reduced_line = line - skip_line; + const __m256i* v_src_ptr = src; + __m256i v_tmp_result[16]; + // Handle 2 lines at a time (16 samples, 8 samples per line) + for (int i = 0; i < 16; ++i) { + // line 1 line 2 + // src vector: [s0 s1 s2 s3 s4 s5 s6 s7 | s0 s1 s2 s3 s4 s5 s6 s7] + // __m256i v_src = _mm256_load_si256((const __m256i*)src); + + // Rearrange source in a way samples can be added together column-wise using add + // after first round of madd operations. + // Need 4 source vectors arranged as follows. High 128 lanes are the same as low: + // vec_01 = [s0 s1 s0 s1 s0 s1 s0 s1 |...] + // vec_02 = [s2 s3 s2 s3 s2 s3 s2 s3 |...] + // vec_03 = [s4 s5 s4 s5 s4 s5 s4 s5 |...] + // vec_04 = [s6 s7 s6 s7 s6 s7 s6 s7 |...] 
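 + // As an illustration (assuming each v_coeff vector holds the matching coefficient pairs for four output rows, + // duplicated across its 128-bit lanes): madd on vec_01 = [s0 s1 s0 s1 ...] yields s0*c0 + s1*c1 in every + // 32-bit element, and summing the four madd results below builds the full 8-tap dot product + // s0*c0 + s1*c1 + ... + s7*c7 for four output rows per lane, line 1 in the low lane and line 2 in the high lane. 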
+ + __m256i v_src_0 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(0, 0, 0, 0)); + __m256i v_src_1 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(1, 1, 1, 1)); + __m256i v_src_2 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(2, 2, 2, 2)); + __m256i v_src_3 = _mm256_shuffle_epi32(v_src_ptr[0], _MM_SHUFFLE(3, 3, 3, 3)); + + // Lane 1 + __m256i v_madd_0 = _mm256_madd_epi16(v_src_0, v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src_1, v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src_2, v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src_3, v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + // Lane 2 + __m256i v_madd_4 = _mm256_madd_epi16(v_src_0, v_coeff[4]); + __m256i v_madd_5 = _mm256_madd_epi16(v_src_1, v_coeff[5]); + __m256i v_madd_6 = _mm256_madd_epi16(v_src_2, v_coeff[6]); + __m256i v_madd_7 = _mm256_madd_epi16(v_src_3, v_coeff[7]); + + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + // Trunc results from both lanes + __m256i v_trunc_0 = truncate_avx2(v_add_10, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_add_11, debias, shift); + + v_tmp_result[i] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + + v_src_ptr++; + } + + __m256i v_result[16]; + transpose_avx2(v_tmp_result, v_result, 8, 32); + + for (int i = 0; i < 16; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + if (skip_line) + { + dst = p_dst + reduced_line; + for (int j = 0; j < 8; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_line); + dst += line; + } + } +} + + +void fast_forward_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x2_coeff_ver; + + __m256i v_hor_pass_out[4]; + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 4 source vectors, 2 lines 32 samples each + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_hor_pass_out[0], v_hor_pass_out[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_hor_pass_out[1], v_hor_pass_out[3]); + + __m256i v_madd_hi_00 = _mm256_madd_epi16(v_src_hi_0, v_coeff[0]); + __m256i v_madd_hi_01 = _mm256_madd_epi16(v_src_hi_0, v_coeff[1]); + __m256i v_madd_hi_10 = _mm256_madd_epi16(v_src_hi_1, v_coeff[0]); + __m256i v_madd_hi_11 = _mm256_madd_epi16(v_src_hi_1, v_coeff[1]); + + __m256i v_madd_lo_00 = _mm256_madd_epi16(v_src_lo_0, v_coeff[0]); + __m256i v_madd_lo_01 = _mm256_madd_epi16(v_src_lo_0, v_coeff[1]); + __m256i v_madd_lo_10 = _mm256_madd_epi16(v_src_lo_1, v_coeff[0]); + __m256i v_madd_lo_11 = _mm256_madd_epi16(v_src_lo_1, v_coeff[1]); + + __m256i v_trunc_hi_00 = truncate_avx2(v_madd_hi_00, debias, shift_2nd); + __m256i v_trunc_hi_01 = truncate_avx2(v_madd_hi_01, debias, shift_2nd); + __m256i v_trunc_hi_10 = truncate_avx2(v_madd_hi_10, debias, shift_2nd); + __m256i v_trunc_hi_11 = truncate_avx2(v_madd_hi_11, debias, shift_2nd); + + __m256i v_trunc_lo_00 = truncate_avx2(v_madd_lo_00, debias, shift_2nd); + __m256i v_trunc_lo_01 = truncate_avx2(v_madd_lo_01, debias, shift_2nd); + __m256i v_trunc_lo_10 = truncate_avx2(v_madd_lo_10, debias, shift_2nd); + __m256i v_trunc_lo_11 = truncate_avx2(v_madd_lo_11, debias, shift_2nd); + + __m256i v_result_0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_hi_00); + __m256i v_result_1 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_hi_10); // Swap middle hi-lo lanes + __m256i v_result_2 = _mm256_packs_epi32(v_trunc_lo_01, v_trunc_hi_01); + __m256i v_result_3 = _mm256_packs_epi32(v_trunc_lo_11, v_trunc_hi_11); + + // Swap middle 64-bit chunks + v_result_0 = _mm256_permute4x64_epi64(v_result_0, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_1 = _mm256_permute4x64_epi64(v_result_1, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_2 = _mm256_permute4x64_epi64(v_result_2, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_3 = _mm256_permute4x64_epi64(v_result_3, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_0); + _mm256_store_si256((__m256i*)(dst + 16), v_result_1); + _mm256_store_si256((__m256i*)(dst + 32), v_result_2); + _mm256_store_si256((__m256i*)(dst + 48), v_result_3); +} + + +static void fast_inverse_tr_32x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*) & coeff[0]); + const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*) & coeff[16]); + + const __m256i* v_src = (const __m256i*)src; + + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src[0], v_src[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src[1], v_src[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src[0], v_src[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src[1], v_src[3]); + + __m256i v_trunc_lo_00 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_0), debias, shift); + __m256i v_trunc_lo_01 = truncate_avx2(_mm256_madd_epi16(v_src_lo_0, v_coeff_1), debias, shift); + 
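 + // (For reference: the 2-point inverse DCT2 is a butterfly. Assuming the coefficient table holds the rows + // {64, 64} and {64, -64}, v_coeff_0 produces 64 * (row0 + row1) and v_coeff_1 produces 64 * (row0 - row1) + // for each interleaved sample pair.) 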
__m256i v_trunc_lo_10 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_0), debias, shift); + __m256i v_trunc_lo_11 = truncate_avx2(_mm256_madd_epi16(v_src_lo_1, v_coeff_1), debias, shift); + + __m256i v_trunc_hi_00 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_0), debias, shift); + __m256i v_trunc_hi_01 = truncate_avx2(_mm256_madd_epi16(v_src_hi_0, v_coeff_1), debias, shift); + __m256i v_trunc_hi_10 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_0), debias, shift); + __m256i v_trunc_hi_11 = truncate_avx2(_mm256_madd_epi16(v_src_hi_1, v_coeff_1), debias, shift); + + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc_lo_00, v_trunc_lo_01); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc_hi_00, v_trunc_hi_01); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc_lo_10, v_trunc_lo_11); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc_hi_10, v_trunc_hi_11); + + dst[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + dst[1] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + dst[2] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + dst[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); +} + +static void fast_inverse_tr_32x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + __m256i v_src[4]; + for (int i = 0; i < 4; ++i) { + v_src[i] = _mm256_permute4x64_epi64(src[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff[0]); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff[1]); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff[2]); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff[3]); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + v_add[i] = _mm256_add_epi32(v_add_00, v_add_01); + v_coeff += 4; + } + + __m256i v_hadd_0[16]; + for (int src = 0, dst = 0; dst < 16; ++dst, src += 2) { + v_hadd_0[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_hadd_1[8]; + for (int src = 0, dst = 0; dst < 8; ++dst, src += 2) { + v_hadd_1[dst] = _mm256_hadd_epi32(v_hadd_0[src + 0], v_hadd_0[src + 1]); + } + + __m256i v_trunc[8]; + for (int i = 0; i < 8; ++i) { + v_trunc[i] = truncate_avx2(v_hadd_1[i], debias, shift); + } + + __m256i v_result[4]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + + v_result[0] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp0, v_tmp1, 0x31); + v_result[3] = _mm256_permute2x128_si256(v_tmp2, v_tmp3, 0x31); + + for (int i = 0; i < 4; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: cutoff for DCT8 and DST7 +} + +void fast_inverse_tr_32x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 2; + + int skip_width = 0; // DST7 and DCT8 are not defined for this size. Therefore no skip width needed. 
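 + // Note: the inverse transform runs its passes in the reverse order of the forward transform, + // i.e. the vertical (column) pass first and the horizontal pass second. 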
+ int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename + const int16_t* hor_coeff = fi_dct2_2x32_coeff_ver; // rename + // No DST7 and DCT8 tables needed. + + __m256i v_ver_pass_out[4]; + fast_inverse_tr_32x2_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x2_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (ver != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x4_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x4_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x4_coeff_ver; + } + + __m256i v_hor_pass_out[8]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)ver_coeff; + + // Got samples for 8 vectors. 4 lines with 32 samples each. 
Need 2 vectors for each line + // Handle two lines at a time + __m256i v_madd_lo_even[2][4]; + __m256i v_madd_lo_odd[2][4]; + __m256i v_madd_hi_even[2][4]; + __m256i v_madd_hi_odd[2][4]; + __m256i* v_src_ptr = v_hor_pass_out; + for (int i = 0; i < 2; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < 4; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 4; + v_src_ptr += 4; + } + + // Final add and truncate + __m256i v_trunc_lo_even[4]; + __m256i v_trunc_hi_even[4]; + __m256i v_trunc_lo_odd[4]; + __m256i v_trunc_hi_odd[4]; + for (int ii = 0; ii < 4; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_lo_even[0][ii], v_madd_lo_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_lo_odd[0][ii], v_madd_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_madd_hi_even[0][ii], v_madd_hi_even[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32( v_madd_hi_odd[0][ii], v_madd_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < 4; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x4_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_lo_1[2][4]; + __m256i v_madd_hi_0[2][4]; + __m256i v_madd_hi_1[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_raw[1], v_src_raw[3]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_raw[1], v_src_raw[3]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_lo_1[src][i] = _mm256_madd_epi16(v_src_lo_1, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + v_madd_hi_1[src][i] = 
_mm256_madd_epi16(v_src_hi_1, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[8]; + __m256i v_trunc_hi[8]; + for (int dst = 0, src = 0; src < 4; ++src, dst += 2) { + v_trunc_lo[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), debias, shift); + v_trunc_lo[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_lo_1[0][src], v_madd_lo_1[1][src]), debias, shift); + v_trunc_hi[dst + 0] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + v_trunc_hi[dst + 1] = truncate_avx2(_mm256_add_epi32(v_madd_hi_1[0][src], v_madd_hi_1[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[2]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[2]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[4], v_trunc_lo[6]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[4], v_trunc_hi[6]); + + if(skip_line == 0) { + dst[1] = _mm256_packs_epi32(v_trunc_lo[1], v_trunc_lo[3]); + dst[3] = _mm256_packs_epi32(v_trunc_hi[1], v_trunc_hi[3]); + dst[5] = _mm256_packs_epi32(v_trunc_lo[5], v_trunc_lo[7]); + dst[7] = _mm256_packs_epi32(v_trunc_hi[5], v_trunc_hi[7]); + } + else { + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + } + + // TODO: mts cutoff +} +static void fast_inverse_tr_32x4_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_madd_lo_0[2][4]; + __m256i v_madd_hi_0[2][4]; + const __m256i* v_c_ptr = v_coeff; + for (int src = 0; src < 2; ++src) { + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_raw[0], v_src_raw[2]); + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_raw[0], v_src_raw[2]); + + for (int i = 0; i < 4; i++) { + v_madd_lo_0[src][i] = _mm256_madd_epi16(v_src_lo_0, v_c_ptr[i]); + v_madd_hi_0[src][i] = _mm256_madd_epi16(v_src_hi_0, v_c_ptr[i]); + } + v_c_ptr += 4; + v_src_raw += 4; + } + + __m256i v_trunc_lo[4]; + __m256i v_trunc_hi[4]; + for (int src = 0; src < 4; ++src) { + v_trunc_lo[src] = truncate_avx2(_mm256_add_epi32(v_madd_lo_0[0][src], v_madd_lo_0[1][src]), debias, shift); + v_trunc_hi[src] = truncate_avx2(_mm256_add_epi32(v_madd_hi_0[0][src], v_madd_hi_0[1][src]), debias, shift); + } + + dst[0] = _mm256_packs_epi32(v_trunc_lo[0], v_trunc_lo[1]); + dst[2] = _mm256_packs_epi32(v_trunc_hi[0], v_trunc_hi[1]); + dst[4] = _mm256_packs_epi32(v_trunc_lo[2], v_trunc_lo[3]); + dst[6] = _mm256_packs_epi32(v_trunc_hi[2], v_trunc_hi[3]); + + dst[1] = _mm256_setzero_si256(); + dst[3] = _mm256_setzero_si256(); + dst[5] = _mm256_setzero_si256(); + dst[7] = _mm256_setzero_si256(); + + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x4_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], 
src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + v_src[4] = _mm256_permute2x128_si256(src[1], src[5], 0x20); + v_src[5] = _mm256_permute2x128_si256(src[3], src[7], 0x20); + v_src[6] = _mm256_permute2x128_si256(src[1], src[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(src[3], src[7], 0x31); + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + __m256i v_coeff_4 = _mm256_set1_epi64x(c_ptr[4]); + __m256i v_coeff_5 = _mm256_set1_epi64x(c_ptr[5]); + __m256i v_coeff_6 = _mm256_set1_epi64x(c_ptr[6]); + __m256i v_coeff_7 = _mm256_set1_epi64x(c_ptr[7]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + __m256i v_madd_4 = _mm256_madd_epi16(v_src[4], v_coeff_4); + __m256i v_madd_5 = _mm256_madd_epi16(v_src[5], v_coeff_5); + __m256i v_madd_6 = _mm256_madd_epi16(v_src[6], v_coeff_6); + __m256i v_madd_7 = _mm256_madd_epi16(v_src[7], v_coeff_7); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + __m256i v_add_02 = _mm256_add_epi32(v_madd_4, v_madd_5); + __m256i v_add_03 = _mm256_add_epi32(v_madd_6, v_madd_7); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + __m256i v_add_11 = _mm256_add_epi32(v_add_02, v_add_03); + + v_add[i] = _mm256_add_epi32(v_add_10, v_add_11); + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = _mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = 
_mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} +static void fast_inverse_tr_32x4_avx2_mts_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) { + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int64_t* c_ptr = (const int64_t*)coeff; // Cast to 64 bit integer to read four coeffs at a time + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)fi_tr_4x8_result_shuffle_ver); // Can use existing shuffle vector + + __m256i v_src[8]; + v_src[0] = _mm256_permute2x128_si256(src[0], src[4], 0x20); + v_src[1] = _mm256_permute2x128_si256(src[2], src[6], 0x20); + v_src[2] = _mm256_permute2x128_si256(src[0], src[4], 0x31); + v_src[3] = _mm256_permute2x128_si256(src[2], src[6], 0x31); + + + __m256i v_add[32]; + for (int i = 0; i < 32; ++i) { + __m256i v_coeff_0 = _mm256_set1_epi64x(c_ptr[0]); + __m256i v_coeff_1 = _mm256_set1_epi64x(c_ptr[1]); + __m256i v_coeff_2 = _mm256_set1_epi64x(c_ptr[2]); + __m256i v_coeff_3 = _mm256_set1_epi64x(c_ptr[3]); + + __m256i v_madd_0 = _mm256_madd_epi16(v_src[0], v_coeff_0); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[1], v_coeff_1); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[2], v_coeff_2); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[3], v_coeff_3); + + __m256i v_add_00 = _mm256_add_epi32(v_madd_0, v_madd_1); + __m256i v_add_01 = _mm256_add_epi32(v_madd_2, v_madd_3); + + __m256i v_add_10 = _mm256_add_epi32(v_add_00, v_add_01); + + v_add[i] = v_add_10; + c_ptr += 8; + } + + __m256i v_hadd[16]; + for (int dst = 0, src = 0; dst < 16; ++dst, src += 2) { + v_hadd[dst] = _mm256_hadd_epi32(v_add[src + 0], v_add[src + 1]); + } + + __m256i v_trunc[16]; + for (int i = 0; i < 16; ++i) { + v_trunc[i] = truncate_avx2(v_hadd[i], debias, shift); + } + + __m256i v_result[8]; + __m256i v_tmp0 = _mm256_packs_epi32(v_trunc[0], v_trunc[1]); + __m256i v_tmp1 = _mm256_packs_epi32(v_trunc[2], v_trunc[3]); + __m256i v_tmp2 = _mm256_packs_epi32(v_trunc[4], v_trunc[5]); + __m256i v_tmp3 = _mm256_packs_epi32(v_trunc[6], v_trunc[7]); + __m256i v_tmp4 = _mm256_packs_epi32(v_trunc[8], v_trunc[9]); + __m256i v_tmp5 = _mm256_packs_epi32(v_trunc[10], v_trunc[11]); + __m256i v_tmp6 = _mm256_packs_epi32(v_trunc[12], v_trunc[13]); + __m256i v_tmp7 = _mm256_packs_epi32(v_trunc[14], v_trunc[15]); + + v_tmp0 = _mm256_shuffle_epi8(v_tmp0, v_res_shuffle); + v_tmp1 = _mm256_shuffle_epi8(v_tmp1, v_res_shuffle); + v_tmp2 = _mm256_shuffle_epi8(v_tmp2, v_res_shuffle); + v_tmp3 = _mm256_shuffle_epi8(v_tmp3, v_res_shuffle); + v_tmp4 = _mm256_shuffle_epi8(v_tmp4, v_res_shuffle); + v_tmp5 = _mm256_shuffle_epi8(v_tmp5, v_res_shuffle); + v_tmp6 = _mm256_shuffle_epi8(v_tmp6, v_res_shuffle); + v_tmp7 = _mm256_shuffle_epi8(v_tmp7, v_res_shuffle); + + __m256i v_tmp_lo_0 = _mm256_unpacklo_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_lo_1 = 
_mm256_unpacklo_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_lo_2 = _mm256_unpacklo_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_lo_3 = _mm256_unpacklo_epi64(v_tmp6, v_tmp7); + __m256i v_tmp_hi_0 = _mm256_unpackhi_epi64(v_tmp0, v_tmp1); + __m256i v_tmp_hi_1 = _mm256_unpackhi_epi64(v_tmp2, v_tmp3); + __m256i v_tmp_hi_2 = _mm256_unpackhi_epi64(v_tmp4, v_tmp5); + __m256i v_tmp_hi_3 = _mm256_unpackhi_epi64(v_tmp6, v_tmp7); + + v_result[0] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x20); + v_result[1] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x20); + v_result[2] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x20); + v_result[3] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_tmp_lo_0, v_tmp_lo_1, 0x31); + v_result[5] = _mm256_permute2x128_si256(v_tmp_lo_2, v_tmp_lo_3, 0x31); + v_result[6] = _mm256_permute2x128_si256(v_tmp_hi_0, v_tmp_hi_1, 0x31); + v_result[7] = _mm256_permute2x128_si256(v_tmp_hi_2, v_tmp_hi_3, 0x31); + + for (int i = 0; i < 8; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + // TODO: cutoff for dct8 and dst7 +} + +void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 4; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename + const int16_t* hor_coeff = uvg_g_dct_32_t; + if (hor == DST7) { + hor_coeff = uvg_g_dst7_32_t; + } else if (hor == DCT8) { + hor_coeff = uvg_g_dct8_32; + } + if (ver == DST7) { + ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_4x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[8]; + if(ver == DCT2) { + fast_inverse_tr_32x4_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + else { + fast_inverse_tr_32x4_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + } + + if(hor == DCT2) { + fast_inverse_tr_32x4_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } + else { + fast_inverse_tr_32x4_avx2_mts_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); + } +} + + +void fast_forward_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? 
width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x8_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x8_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x8_coeff_ver; + } + + __m256i v_hor_pass_out[16]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? (1 << (shift_2nd - 1)) : 0; // ISP_TODO: the (shift_2nd > 0) check can be optimized out if shift_2nd is always > 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same partitioning as for the other 32-wide cases with height 8 or 16. + // NUM_PARTS values 1, 2 and 4 all produce similar results; increasing the value + // just shifts the register pressure from one point to another. +#define NUM_PARTS 4 +#define PART_DIMENSION (8 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got data for 16 vectors: 8 lines, 32 samples each. + // Handle two lines at a time. + __m256i v_madd_lo_even[4][PART_DIMENSION]; + __m256i v_madd_lo_odd[4][PART_DIMENSION]; + __m256i v_madd_hi_even[4][PART_DIMENSION]; + __m256i v_madd_hi_odd[4][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const __m256i* v_coeff = (const __m256i*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 4; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff[ii]); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff[ii]); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff[ii]); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff[ii]); + } + + v_coeff += 8; + v_src_ptr += 4; + } + + // First round of additions + __m256i v_add_lo_even[2][PART_DIMENSION]; + __m256i v_add_hi_even[2][PART_DIMENSION]; + __m256i v_add_lo_odd[2][PART_DIMENSION]; + __m256i v_add_hi_odd[2][PART_DIMENSION]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_add_lo_even[i][ii] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even[i][ii] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd[i][ii] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd[i][ii] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + } + + // Final add and truncate + __m256i v_trunc_lo_even[PART_DIMENSION]; + __m256i v_trunc_hi_even[PART_DIMENSION]; + __m256i v_trunc_lo_odd[PART_DIMENSION]; + 
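// (truncate_avx2 below is a rounding right shift: it adds the 1 << (shift_2nd - 1) bias held in debias, then arithmetic-shifts each 32-bit lane right by shift_2nd.) + 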
__m256i v_trunc_hi_odd[PART_DIMENSION]; + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + v_trunc_lo_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_even[0][ii], v_add_lo_even[1][ii]), debias, shift_2nd); + v_trunc_hi_even[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_even[0][ii], v_add_hi_even[1][ii]), debias, shift_2nd); + v_trunc_lo_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_lo_odd[0][ii], v_add_lo_odd[1][ii]), debias, shift_2nd); + v_trunc_hi_odd[ii] = truncate_avx2(_mm256_add_epi32(v_add_hi_odd[0][ii], v_add_hi_odd[1][ii]), debias, shift_2nd); + } + + // Permute and store + for (int i = 0; i < PART_DIMENSION; ++i) { + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even[i], v_trunc_hi_even[i]); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd[i], v_trunc_hi_odd[i]); + // Flip the middle 64 bit chunks + v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x8_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[8]; + __m256i v_src_hi[8]; + for (int d = 0, s = 0; d < 8; d += 2, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_lo[d + 1] = _mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 1] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_lo_1[4]; + __m256i v_madd_hi_0[4]; + __m256i v_madd_hi_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_lo_1[d] = _mm256_madd_epi16(v_src_lo[s + 1], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + v_madd_hi_1[d] = _mm256_madd_epi16(v_src_hi[s + 1], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + __m256i v_add_lo_10 = _mm256_add_epi32(v_madd_lo_1[0], v_madd_lo_1[1]); + __m256i v_add_lo_11 = _mm256_add_epi32(v_madd_lo_1[2], v_madd_lo_1[3]); + + __m256i v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + __m256i v_add_hi_10 = _mm256_add_epi32(v_madd_hi_1[0], v_madd_hi_1[1]); + __m256i v_add_hi_11 = _mm256_add_epi32(v_madd_hi_1[2], v_madd_hi_1[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + __m256i v_trunc_lo_1 = truncate_avx2(_mm256_add_epi32(v_add_lo_10, v_add_lo_11), debias, shift); + + __m256i v_trunc_hi_0 = 
truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + __m256i v_trunc_hi_1 = truncate_avx2(_mm256_add_epi32(v_add_hi_10, v_add_hi_11), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_packs_epi32(v_trunc_lo_1, v_trunc_hi_1); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_mts_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_coeff = (const __m256i*)coeff; + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src_lo[4]; + __m256i v_src_hi[4]; + for (int d = 0, s = 0; d < 4; d += 1, s += 4) { + v_src_lo[d + 0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src_hi[d + 0] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + } + + for (int c = 0; c < 8; ++c) { + __m256i v_madd_lo_0[4]; + __m256i v_madd_hi_0[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 1) { + v_madd_lo_0[d] = _mm256_madd_epi16(v_src_lo[s + 0], v_coeff[d]); + v_madd_hi_0[d] = _mm256_madd_epi16(v_src_hi[s + 0], v_coeff[d]); + } + v_coeff += 4; + + __m256i v_add_lo_00 = _mm256_add_epi32(v_madd_lo_0[0], v_madd_lo_0[1]); + __m256i v_add_lo_01 = _mm256_add_epi32(v_madd_lo_0[2], v_madd_lo_0[3]); + + __m256i v_add_hi_00 = _mm256_add_epi32(v_madd_hi_0[0], v_madd_hi_0[1]); + __m256i v_add_hi_01 = _mm256_add_epi32(v_madd_hi_0[2], v_madd_hi_0[3]); + + __m256i v_trunc_lo_0 = truncate_avx2(_mm256_add_epi32(v_add_lo_00, v_add_lo_01), debias, shift); + + __m256i v_trunc_hi_0 = truncate_avx2(_mm256_add_epi32(v_add_hi_00, v_add_hi_01), debias, shift); + + dst[0] = _mm256_packs_epi32(v_trunc_lo_0, v_trunc_hi_0); + dst[1] = _mm256_setzero_si256(); + dst += 2; + } + + // TODO: mts cutoff +} + +static void fast_inverse_tr_32x8_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = skip_line2 == 16 ? 8 : 16; + + int32_t *src_32 = (int32_t*)src; + for (int j = 0; j < line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + + __m256i *coeff_start = (__m256i*)coeff; + for (int i = 0; i < limit; ++i) { + __m256i v_src = _mm256_set1_epi32(*src_32); + src_32++; + + __m256i v_coeff0 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff2 = _mm256_loadu_si256(coeff_start); + coeff_start++; + __m256i v_coeff3 = _mm256_loadu_si256(coeff_start); + coeff_start++; + + __m256i madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i madd1 = _mm256_madd_epi16(v_src, v_coeff1); + __m256i madd2 = _mm256_madd_epi16(v_src, v_coeff2); + __m256i madd3 = _mm256_madd_epi16(v_src, v_coeff3); + + res_0 = _mm256_add_epi32(res_0, madd0); + res_1 = _mm256_add_epi32(res_1, madd1); + res_2 = _mm256_add_epi32(res_2, madd2); + res_3 = _mm256_add_epi32(res_3, madd3); + } + src_32 += limit == 8 ? 
8 : 0; + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + __m256i v_trunc2 = truncate_avx2(res_2, debias, shift); + __m256i v_trunc3 = truncate_avx2(res_3, debias, shift); + + __m256i packed0 = _mm256_packs_epi32(v_trunc0, v_trunc1); + __m256i packed1 = _mm256_packs_epi32(v_trunc2, v_trunc3); + + packed0 = _mm256_permute4x64_epi64(packed0, _MM_SHUFFLE(3, 1, 2, 0)); + packed1 = _mm256_permute4x64_epi64(packed1, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, packed0); + _mm256_store_si256((__m256i*)dst + 1, packed1); + dst += 32; + } + + // TODO: cutoff for dct8 and dst7 +} + +void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 8; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_8x32_coeff_hor; // TODO: rename this table + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_8x32_coeff_hor; // TODO: rename + } else if (ver == DCT8) { + ver_coeff = fi_dct8_8x32_coeff_hor; // TODO: rename + } + + __m256i v_ver_pass_out[16]; + if(ver == DCT2) { + fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_inverse_tr_32x8_avx2_mts_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width); + } + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x16_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x16_coeff_ver; + } + + __m256i v_hor_pass_out[32]; + if (hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + // Same as for 8x32 and 16x32, 4 parts is optimal +#define NUM_PARTS 4 +#define PART_DIMENSION (16 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + // Got samples for 32 vectors, 16 lines with 32 samples each + // Handle two lines at a time + __m256i v_madd_lo_even[8][PART_DIMENSION]; + __m256i v_madd_lo_odd[8][PART_DIMENSION]; + __m256i v_madd_hi_even[8][PART_DIMENSION]; + __m256i v_madd_hi_odd[8][PART_DIMENSION]; + __m256i* v_src_ptr = v_hor_pass_out; + const int32_t* line_coeff = (const int32_t*)ver_coeff + part * PART_DIMENSION; + for (int i = 0; i < 8; ++i) { + __m256i v_src_hi_0 = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_0 = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_1 = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_lo_1 = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + + // Apply coeffs + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + const int32_t coeff = line_coeff[ii]; + const __m256i v_coeff = _mm256_set1_epi32(coeff); + v_madd_lo_even[i][ii] = _mm256_madd_epi16(v_src_lo_0, v_coeff); + v_madd_hi_even[i][ii] = _mm256_madd_epi16(v_src_hi_0, v_coeff); + v_madd_lo_odd[i][ii] = _mm256_madd_epi16(v_src_lo_1, v_coeff); + v_madd_hi_odd[i][ii] = _mm256_madd_epi16(v_src_hi_1, v_coeff); + } + + line_coeff += 16; + v_src_ptr += 4; + } + + for (int ii = 0; ii < PART_DIMENSION; ++ii) { + // First round of additions + __m256i v_add_lo_even_0[4]; + __m256i v_add_hi_even_0[4]; + __m256i v_add_lo_odd_0[4]; + __m256i v_add_hi_odd_0[4]; + for (int i = 0; i < 4; ++i) { + const int offset = i * 2; + v_add_lo_even_0[i] = _mm256_add_epi32(v_madd_lo_even[offset][ii], v_madd_lo_even[offset + 1][ii]); + v_add_hi_even_0[i] = _mm256_add_epi32(v_madd_hi_even[offset][ii], v_madd_hi_even[offset + 1][ii]); + v_add_lo_odd_0[i] = _mm256_add_epi32(v_madd_lo_odd[offset][ii], v_madd_lo_odd[offset + 1][ii]); + v_add_hi_odd_0[i] = _mm256_add_epi32(v_madd_hi_odd[offset][ii], v_madd_hi_odd[offset + 1][ii]); + } + + // Second round of additions + __m256i v_add_lo_even_1[2]; + __m256i v_add_hi_even_1[2]; + __m256i v_add_lo_odd_1[2]; + __m256i v_add_hi_odd_1[2]; + for (int i = 0; i < 2; ++i) { + const int offset = 2 * i; + v_add_lo_even_1[i] = _mm256_add_epi32(v_add_lo_even_0[offset], v_add_lo_even_0[offset + 1]); + v_add_hi_even_1[i] = _mm256_add_epi32(v_add_hi_even_0[offset], v_add_hi_even_0[offset + 1]); + v_add_lo_odd_1[i] = _mm256_add_epi32(v_add_lo_odd_0[offset], v_add_lo_odd_0[offset + 1]); + v_add_hi_odd_1[i] = _mm256_add_epi32(v_add_hi_odd_0[offset], v_add_hi_odd_0[offset + 1]); + } + + // Final add and truncate + __m256i v_trunc_lo_even; + __m256i v_trunc_hi_even; + __m256i v_trunc_lo_odd; + __m256i v_trunc_hi_odd; + v_trunc_lo_even = truncate_avx2(_mm256_add_epi32(v_add_lo_even_1[0], v_add_lo_even_1[1]), debias, shift_2nd); + v_trunc_hi_even = truncate_avx2(_mm256_add_epi32(v_add_hi_even_1[0], v_add_hi_even_1[1]), debias, shift_2nd); + v_trunc_lo_odd = truncate_avx2(_mm256_add_epi32(v_add_lo_odd_1[0], v_add_lo_odd_1[1]), debias, shift_2nd); + v_trunc_hi_odd = truncate_avx2(_mm256_add_epi32(v_add_hi_odd_1[0], v_add_hi_odd_1[1]), debias, shift_2nd); + + + // Permute and store + __m256i v_result_even = _mm256_packs_epi32(v_trunc_lo_even, v_trunc_hi_even); + __m256i v_result_odd = _mm256_packs_epi32(v_trunc_lo_odd, v_trunc_hi_odd); + // Flip the middle 64 bit chunks + 
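// _mm256_packs_epi32 packs within 128-bit lanes, so each packed vector is ordered [lo0 hi0 | lo1 hi1]; + // the _MM_SHUFFLE(3, 1, 2, 0) permute swaps the two middle 64-bit chunks, restoring the sequential + // sample order [lo0 lo1 | hi0 hi1] before the rows are stored. + 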
v_result_even = _mm256_permute4x64_epi64(v_result_even, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_odd = _mm256_permute4x64_epi64(v_result_odd, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256((__m256i*)dst, v_result_even); + _mm256_store_si256((__m256i*)(dst + 16), v_result_odd); + dst += 32; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x16_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int limit = 32 - skip_line; + __m256i temp[32]; + for (int j = 0; j < limit; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + + __m256i* coeff_start = (__m256i*)coeff; + for (int i = 0; i < 8; ++i) { + int16_t source[2]; + source[0] = src[j + i * 64]; + source[1] = src[j + i * 64 + 32]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + + __m256i v_coeff0 = _mm256_load_si256(coeff_start); + coeff_start++; + __m256i v_coeff1 = _mm256_load_si256(coeff_start); + coeff_start++; + + __m256i v_madd0 = _mm256_madd_epi16(v_src, v_coeff0); + __m256i v_madd1 = _mm256_madd_epi16(v_src, v_coeff1); + + res_0 = _mm256_add_epi32(res_0, v_madd0); + res_1 = _mm256_add_epi32(res_1, v_madd1); + } + + __m256i v_trunc0 = truncate_avx2(res_0, debias, shift); + __m256i v_trunc1 = truncate_avx2(res_1, debias, shift); + + __m256i packed = _mm256_packs_epi32(v_trunc0, v_trunc1); + packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + temp[j] = packed; + } + for (int j = limit; j < 32; ++j) { + temp[j] = _mm256_setzero_si256(); + } + transpose_avx2(temp, dst, 16, 32); +} + +static void fast_inverse_tr_32x16_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i* v_src_raw = src; + const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)shuffle_16b_0415); + + // Do a 32-bit transpose to arrange result from previous pass + __m256i v_tmp32_lo_e[8]; + __m256i v_tmp32_hi_e[8]; + __m256i v_tmp32_lo_o[8]; + __m256i v_tmp32_hi_o[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 4) { + v_tmp32_lo_e[d] = _mm256_unpacklo_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_hi_e[d] = _mm256_unpackhi_epi32(v_src_raw[s + 0], v_src_raw[s + 2]); + v_tmp32_lo_o[d] = _mm256_unpacklo_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + v_tmp32_hi_o[d] = _mm256_unpackhi_epi32(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + __m256i v_tmp64_lo_e[8]; + __m256i v_tmp64_hi_e[8]; + __m256i v_tmp64_lo_o[8]; + __m256i v_tmp64_hi_o[8]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_tmp64_lo_e[0 + d] = _mm256_unpacklo_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_lo_e[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_hi_e[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_e[s + 0], v_tmp32_lo_e[s + 1]); + v_tmp64_hi_e[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi_e[s + 0], v_tmp32_hi_e[s + 1]); + + v_tmp64_lo_o[0 + d] = 
_mm256_unpacklo_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_lo_o[4 + d] = _mm256_unpacklo_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + + v_tmp64_hi_o[0 + d] = _mm256_unpackhi_epi64(v_tmp32_lo_o[s + 0], v_tmp32_lo_o[s + 1]); + v_tmp64_hi_o[4 + d] = _mm256_unpackhi_epi64(v_tmp32_hi_o[s + 0], v_tmp32_hi_o[s + 1]); + } + + __m256i v_src[32]; + v_src[0] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x20); + v_src[1] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x20); + v_src[2] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x20); + v_src[3] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x20); + + v_src[4] = _mm256_permute2x128_si256(v_tmp64_lo_e[0], v_tmp64_lo_e[1], 0x31); + v_src[5] = _mm256_permute2x128_si256(v_tmp64_hi_e[0], v_tmp64_hi_e[1], 0x31); + v_src[6] = _mm256_permute2x128_si256(v_tmp64_lo_e[4], v_tmp64_lo_e[5], 0x31); + v_src[7] = _mm256_permute2x128_si256(v_tmp64_hi_e[4], v_tmp64_hi_e[5], 0x31); + + v_src[8] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x20); + v_src[9] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x20); + v_src[10] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x20); + v_src[11] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x20); + + v_src[12] = _mm256_permute2x128_si256(v_tmp64_lo_o[0], v_tmp64_lo_o[1], 0x31); + v_src[13] = _mm256_permute2x128_si256(v_tmp64_hi_o[0], v_tmp64_hi_o[1], 0x31); + v_src[14] = _mm256_permute2x128_si256(v_tmp64_lo_o[4], v_tmp64_lo_o[5], 0x31); + v_src[15] = _mm256_permute2x128_si256(v_tmp64_hi_o[4], v_tmp64_hi_o[5], 0x31); + + v_src[16] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x20); + v_src[17] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x20); + v_src[18] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x20); + v_src[19] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x20); + + v_src[20] = _mm256_permute2x128_si256(v_tmp64_lo_e[2], v_tmp64_lo_e[3], 0x31); + v_src[21] = _mm256_permute2x128_si256(v_tmp64_hi_e[2], v_tmp64_hi_e[3], 0x31); + v_src[22] = _mm256_permute2x128_si256(v_tmp64_lo_e[6], v_tmp64_lo_e[7], 0x31); + v_src[23] = _mm256_permute2x128_si256(v_tmp64_hi_e[6], v_tmp64_hi_e[7], 0x31); + + v_src[24] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x20); + v_src[25] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x20); + v_src[26] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x20); + v_src[27] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x20); + + v_src[28] = _mm256_permute2x128_si256(v_tmp64_lo_o[2], v_tmp64_lo_o[3], 0x31); + v_src[29] = _mm256_permute2x128_si256(v_tmp64_hi_o[2], v_tmp64_hi_o[3], 0x31); + v_src[30] = _mm256_permute2x128_si256(v_tmp64_lo_o[6], v_tmp64_lo_o[7], 0x31); + v_src[31] = _mm256_permute2x128_si256(v_tmp64_hi_o[6], v_tmp64_hi_o[7], 0x31); + + __m256i v_trunc[64]; + __m256i* v_src_ptr = v_src; + __m256i* v_tr_ptr = v_trunc; + + + for (int chunk = 0; chunk < 2; ++chunk) { + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + for (int c = 0; c < 32; ++c) { + __m256i v_madd[16]; + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + v_madd[i] = _mm256_madd_epi16(v_src_ptr[i], v_coeff); + c_ptr++; + } + + __m256i v_add_0[8]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_add_0[d] = _mm256_add_epi32(v_madd[s + 0], v_madd[s 
+ 1]); + } + + __m256i v_add_1[4]; + for (int d = 0, s = 0; d < 4; ++d, s += 2) { + v_add_1[d] = _mm256_add_epi32(v_add_0[s + 0], v_add_0[s + 1]); + } + + __m256i v_add_2[2]; + for (int d = 0, s = 0; d < 2; ++d, s += 2) { + v_add_2[d] = _mm256_add_epi32(v_add_1[s + 0], v_add_1[s + 1]); + } + + v_tr_ptr[c] = truncate_avx2(_mm256_add_epi32(v_add_2[0], v_add_2[1]), debias, shift); + } + v_tr_ptr += 32; + v_src_ptr += 16; + } + + __m256i v_tmp[32]; + __m256i v_result[32]; + for (int i = 0, s = 0; i < 32; ++i, s += 2) { + v_tmp[i] = _mm256_packs_epi32(v_trunc[s + 0], v_trunc[s + 1]); + v_tmp[i] = _mm256_shuffle_epi8(v_tmp[i], v_res_shuffle); + } + + __m256i v_rtmp32_lo[16]; + __m256i v_rtmp32_hi[16]; + for (int d = 0, s = 0; d < 16; ++d, s += 2) { + v_rtmp32_lo[d] = _mm256_unpacklo_epi32(v_tmp[s + 0], v_tmp[s + 1]); + v_rtmp32_hi[d] = _mm256_unpackhi_epi32(v_tmp[s + 0], v_tmp[s + 1]); + } + + __m256i v_rtmp64_lo[16]; + __m256i v_rtmp64_hi[16]; + for (int d = 0, s = 0; d < 8; ++d, s += 2) { + v_rtmp64_lo[0 + d] = _mm256_unpacklo_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_lo[8 + d] = _mm256_unpacklo_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + + v_rtmp64_hi[0 + d] = _mm256_unpackhi_epi64(v_rtmp32_lo[s + 0], v_rtmp32_lo[s + 1]); + v_rtmp64_hi[8 + d] = _mm256_unpackhi_epi64(v_rtmp32_hi[s + 0], v_rtmp32_hi[s + 1]); + } + + v_result[0] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x20); + v_result[1] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x20); + v_result[2] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x20); + v_result[3] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x20); + + v_result[4] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x20); + v_result[5] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x20); + v_result[6] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x20); + v_result[7] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x20); + + v_result[8] = _mm256_permute2x128_si256(v_rtmp64_lo[0], v_rtmp64_lo[1], 0x31); + v_result[9] = _mm256_permute2x128_si256(v_rtmp64_lo[2], v_rtmp64_lo[3], 0x31); + v_result[10] = _mm256_permute2x128_si256(v_rtmp64_hi[0], v_rtmp64_hi[1], 0x31); + v_result[11] = _mm256_permute2x128_si256(v_rtmp64_hi[2], v_rtmp64_hi[3], 0x31); + + v_result[12] = _mm256_permute2x128_si256(v_rtmp64_lo[8], v_rtmp64_lo[9], 0x31); + v_result[13] = _mm256_permute2x128_si256(v_rtmp64_lo[10], v_rtmp64_lo[11], 0x31); + v_result[14] = _mm256_permute2x128_si256(v_rtmp64_hi[8], v_rtmp64_hi[9], 0x31); + v_result[15] = _mm256_permute2x128_si256(v_rtmp64_hi[10], v_rtmp64_hi[11], 0x31); + + v_result[16] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x20); + v_result[17] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x20); + v_result[18] = _mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x20); + v_result[19] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x20); + + v_result[20] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x20); + v_result[21] = _mm256_permute2x128_si256(v_rtmp64_lo[14], v_rtmp64_lo[15], 0x20); + v_result[22] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x20); + v_result[23] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x20); + + v_result[24] = _mm256_permute2x128_si256(v_rtmp64_lo[4], v_rtmp64_lo[5], 0x31); + v_result[25] = _mm256_permute2x128_si256(v_rtmp64_lo[6], v_rtmp64_lo[7], 0x31); + v_result[26] = 
_mm256_permute2x128_si256(v_rtmp64_hi[4], v_rtmp64_hi[5], 0x31); + v_result[27] = _mm256_permute2x128_si256(v_rtmp64_hi[6], v_rtmp64_hi[7], 0x31); + + v_result[28] = _mm256_permute2x128_si256(v_rtmp64_lo[12], v_rtmp64_lo[13], 0x31); + v_result[29] = _mm256_permute2x128_si256(v_rtmp64_lo[14], v_rtmp64_lo[15], 0x31); + v_result[30] = _mm256_permute2x128_si256(v_rtmp64_hi[12], v_rtmp64_hi[13], 0x31); + v_result[31] = _mm256_permute2x128_si256(v_rtmp64_hi[14], v_rtmp64_hi[15], 0x31); + + for (int i = 0; i < 32; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } + + // TODO: MTS cutoff +} + +void fast_inverse_tr_32x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 16; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = 0; + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = fi_dct2_32x16_coeff_ver; + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = fi_dst7_32x16_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = fi_dct8_32x16_coeff_ver; + } + + __m256i v_ver_pass_out[32]; + fast_inverse_tr_32x16_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +void fast_forward_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? height - 32 : 0); + + const int reduced_line = width - skip_width; + const int cutoff = height - skip_height; + int16_t* p_dst = dst; + + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + const int32_t shift_1st = log2_width_minus1 + UVG_BIT_DEPTH - 8; + const int32_t shift_2nd = log2_height_minus1 + 7; + + const int16_t* hor_coeff = ff_dct2_32xN_coeff_hor; + const int16_t* ver_coeff = ff_dct2_32x32_coeff_ver; + if (hor == DST7) { + hor_coeff = ff_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = ff_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = ff_dst7_32x32_coeff_ver; + } else if (ver == DCT8) { + ver_coeff = ff_dct8_32x32_coeff_ver; + } + + ALIGNED(32) int16_t v_hor_pass_out[32 * 32]; + if(hor == DCT2) { + fast_forward_DCT2_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + else { + fast_forward_DCT8_B32_avx2_hor(src, (__m256i*)v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width); + } + + __m256i temp_out[32 * 2]; + // Vertical pass + const int32_t add = (shift_2nd > 0) ? 
(1 << (shift_2nd - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0 + const __m256i debias = _mm256_set1_epi32(add); + for (int j = 0; j < reduced_line; ++j) { + __m256i res_0 = _mm256_setzero_si256(); + __m256i res_1 = _mm256_setzero_si256(); + __m256i res_2 = _mm256_setzero_si256(); + __m256i res_3 = _mm256_setzero_si256(); + const int16_t* coeff_start = ver_coeff; + for (int i = 0; i < 16; ++i) { + int16_t source[2]; + source[0] = v_hor_pass_out[j + i * 64]; + source[1] = v_hor_pass_out[j + i * 64 + 32]; + int32_t paired_source; + memcpy(&paired_source, source, sizeof(int32_t)); + + __m256i v_src = _mm256_set1_epi32(paired_source); + __m256i v_coeff_0 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + __m256i v_coeff_1 = _mm256_load_si256((__m256i*) coeff_start); + __m256i v_coeff_2; + __m256i v_coeff_3; + if(skip_height == 0) { + coeff_start += 16; + v_coeff_2 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + v_coeff_3 = _mm256_load_si256((__m256i*) coeff_start); + coeff_start += 16; + } + else { + coeff_start += 48; + } + + __m256i madd_0 = _mm256_madd_epi16(v_src, v_coeff_0); + __m256i madd_1 = _mm256_madd_epi16(v_src, v_coeff_1); + __m256i madd_2; + __m256i madd_3; + if(skip_height == 0) { + madd_2 = _mm256_madd_epi16(v_src, v_coeff_2); + madd_3 = _mm256_madd_epi16(v_src, v_coeff_3); + } + + res_0 = _mm256_add_epi32(res_0, madd_0); + res_1 = _mm256_add_epi32(res_1, madd_1); + if(skip_height == 0) { + res_2 = _mm256_add_epi32(res_2, madd_2); + res_3 = _mm256_add_epi32(res_3, madd_3); + } + } + __m256i v_trunc_0 = truncate_avx2(res_0, debias, shift_2nd); + __m256i v_trunc_1 = truncate_avx2(res_1, debias, shift_2nd); + __m256i v_trunc_2; + __m256i v_trunc_3; + if(skip_height == 0) { + v_trunc_2 = truncate_avx2(res_2, debias, shift_2nd); + v_trunc_3 = truncate_avx2(res_3, debias, shift_2nd); + } + + v_trunc_0 = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_trunc_0 = _mm256_permute4x64_epi64(v_trunc_0, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j, v_trunc_0); + if(skip_height == 0) { + v_trunc_2 = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + v_trunc_2 = _mm256_permute4x64_epi64(v_trunc_2, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_store_si256(temp_out + 2 * j + 1, v_trunc_2); + } + } + transpose_avx2(temp_out, (__m256i*) dst, 32, 32); +#if 0 + // 8 is probably best, though difference to 16 is not that large +#define NUM_PARTS 8 +#define PART_DIMENSION (32 / NUM_PARTS) + for (int part = 0; part < NUM_PARTS; ++part) { + const int32_t* coeff_ptr = (const int32_t*)ver_coeff + part * PART_DIMENSION; // Cast to 32 bit integer to read 2 coeffs at a time + const __m256i* v_src_ptr = v_hor_pass_out; + + __m256i v_madd_lo_e[16][PART_DIMENSION]; + __m256i v_madd_lo_o[16][PART_DIMENSION]; + __m256i v_madd_hi_e[16][PART_DIMENSION]; + __m256i v_madd_hi_o[16][PART_DIMENSION]; + for (int i = 0; i < 16; ++i) { + __m256i v_src_lo_e = _mm256_unpacklo_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_lo_o = _mm256_unpacklo_epi16(v_src_ptr[1], v_src_ptr[3]); + __m256i v_src_hi_e = _mm256_unpackhi_epi16(v_src_ptr[0], v_src_ptr[2]); + __m256i v_src_hi_o = _mm256_unpackhi_epi16(v_src_ptr[1], v_src_ptr[3]); + + + for (int c = 0; c < PART_DIMENSION; ++c) { + const __m256i v_coeff = _mm256_set1_epi32(coeff_ptr[c]); + v_madd_lo_e[i][c] = _mm256_madd_epi16(v_src_lo_e, v_coeff); + v_madd_lo_o[i][c] = _mm256_madd_epi16(v_src_lo_o, v_coeff); + v_madd_hi_e[i][c] = _mm256_madd_epi16(v_src_hi_e, v_coeff); + v_madd_hi_o[i][c] = 
_mm256_madd_epi16(v_src_hi_o, v_coeff); + } + coeff_ptr += 32; + v_src_ptr += 4; + } + + for (int c = 0; c < PART_DIMENSION; ++c) { + __m256i v_add_lo_e0[8]; + __m256i v_add_lo_o0[8]; + __m256i v_add_hi_e0[8]; + __m256i v_add_hi_o0[8]; + for (int dst = 0, src = 0; dst < 8; ++dst, src += 2) { + v_add_lo_e0[dst] = _mm256_add_epi32(v_madd_lo_e[src + 0][c], v_madd_lo_e[src + 1][c]); + v_add_lo_o0[dst] = _mm256_add_epi32(v_madd_lo_o[src + 0][c], v_madd_lo_o[src + 1][c]); + v_add_hi_e0[dst] = _mm256_add_epi32(v_madd_hi_e[src + 0][c], v_madd_hi_e[src + 1][c]); + v_add_hi_o0[dst] = _mm256_add_epi32(v_madd_hi_o[src + 0][c], v_madd_hi_o[src + 1][c]); + } + + __m256i v_add_lo_e1[4]; + __m256i v_add_lo_o1[4]; + __m256i v_add_hi_e1[4]; + __m256i v_add_hi_o1[4]; + for (int dst = 0, src = 0; dst < 4; ++dst, src += 2) { + v_add_lo_e1[dst] = _mm256_add_epi32(v_add_lo_e0[src + 0], v_add_lo_e0[src + 1]); + v_add_lo_o1[dst] = _mm256_add_epi32(v_add_lo_o0[src + 0], v_add_lo_o0[src + 1]); + v_add_hi_e1[dst] = _mm256_add_epi32(v_add_hi_e0[src + 0], v_add_hi_e0[src + 1]); + v_add_hi_o1[dst] = _mm256_add_epi32(v_add_hi_o0[src + 0], v_add_hi_o0[src + 1]); + } + + __m256i v_add_lo_e2[2]; + __m256i v_add_lo_o2[2]; + __m256i v_add_hi_e2[2]; + __m256i v_add_hi_o2[2]; + for (int dst = 0, src = 0; dst < 2; ++dst, src += 2) { + v_add_lo_e2[dst] = _mm256_add_epi32(v_add_lo_e1[src + 0], v_add_lo_e1[src + 1]); + v_add_lo_o2[dst] = _mm256_add_epi32(v_add_lo_o1[src + 0], v_add_lo_o1[src + 1]); + v_add_hi_e2[dst] = _mm256_add_epi32(v_add_hi_e1[src + 0], v_add_hi_e1[src + 1]); + v_add_hi_o2[dst] = _mm256_add_epi32(v_add_hi_o1[src + 0], v_add_hi_o1[src + 1]); + } + + __m256i v_trunc_lo_e = truncate_avx2(_mm256_add_epi32(v_add_lo_e2[0], v_add_lo_e2[1]), debias, shift_2nd); + __m256i v_trunc_lo_o = truncate_avx2(_mm256_add_epi32(v_add_lo_o2[0], v_add_lo_o2[1]), debias, shift_2nd); + __m256i v_trunc_hi_e = truncate_avx2(_mm256_add_epi32(v_add_hi_e2[0], v_add_hi_e2[1]), debias, shift_2nd); + __m256i v_trunc_hi_o = truncate_avx2(_mm256_add_epi32(v_add_hi_o2[0], v_add_hi_o2[1]), debias, shift_2nd); + + __m256i v_result_e = _mm256_packs_epi32(v_trunc_lo_e, v_trunc_hi_e); + __m256i v_result_o = _mm256_packs_epi32(v_trunc_lo_o, v_trunc_hi_o); + + v_result_e = _mm256_permute4x64_epi64(v_result_e, _MM_SHUFFLE(3, 1, 2, 0)); + v_result_o = _mm256_permute4x64_epi64(v_result_o, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)dst, v_result_e); + dst += 16; + _mm256_store_si256((__m256i*)dst, v_result_o); + dst += 16; + } + } +#undef NUM_PARTS +#undef PART_DIMENSION +#endif + + if (skip_width) { + dst = p_dst + reduced_line; + for (int j = 0; j < cutoff; j++) + { + memset(dst, 0, sizeof(int16_t) * skip_width); + dst += width; + } + } + + if (skip_height) { + dst = p_dst + width * cutoff; + memset(dst, 0, sizeof(int16_t) * width * skip_height); + } +} + + +static void fast_inverse_tr_32x32_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + const __m256i* v_src_raw = (const __m256i*)src; + + __m256i v_src[16][4]; + for (int d = 0, s = 0; d < 16; ++d, s += 4) { + v_src[d][0] = _mm256_unpacklo_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][1] = _mm256_unpackhi_epi16(v_src_raw[s + 0], v_src_raw[s + 2]); + v_src[d][2] = 
_mm256_unpacklo_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + v_src[d][3] = _mm256_unpackhi_epi16(v_src_raw[s + 1], v_src_raw[s + 3]); + } + + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = _mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + if(skip_line == 0) { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i][2], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i][3], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + v_res_2 = _mm256_add_epi32(v_res_2, v_madd_2); + v_res_3 = _mm256_add_epi32(v_res_3, v_madd_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + else { + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i][0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i][1], v_coeff); + v_res_0 = _mm256_add_epi32(v_res_0, v_madd_0); + v_res_1 = _mm256_add_epi32(v_res_1, v_madd_1); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + + dst[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + dst[d + 1] = _mm256_setzero_si256(); + } + } +} + +static void fast_inverse_tr_32x32_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const int32_t* c_ptr = (const int32_t*)coeff; // Handle as 32 bit integer to load two coeffs into vector at the same time + + // Do a 32 bit transpose on input + __m256i v_tmp32_lo[32]; + __m256i v_tmp32_hi[32]; + for (int d = 0, s = 0; d < 32; d += 2, s += 4) { + v_tmp32_lo[d + 0] = _mm256_unpacklo_epi32(src[s + 0], src[s + 2]); + v_tmp32_lo[d + 1] = _mm256_unpacklo_epi32(src[s + 1], src[s + 3]); + v_tmp32_hi[d + 0] = _mm256_unpackhi_epi32(src[s + 0], src[s + 2]); + v_tmp32_hi[d + 1] = _mm256_unpackhi_epi32(src[s + 1], src[s + 3]); + } + + __m256i v_tmp64_lo[32]; + __m256i v_tmp64_hi[32]; + for (int i = 0; i < 32; i += 4) { + v_tmp64_lo[i + 0] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_lo[i + 1] = _mm256_unpacklo_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_lo[i + 2] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_lo[i + 3] = _mm256_unpacklo_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + + v_tmp64_hi[i + 0] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 0], v_tmp32_lo[i + 2]); + v_tmp64_hi[i + 1] = _mm256_unpackhi_epi64(v_tmp32_lo[i + 1], v_tmp32_lo[i + 3]); + v_tmp64_hi[i + 2] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 0], v_tmp32_hi[i + 2]); + v_tmp64_hi[i + 3] = _mm256_unpackhi_epi64(v_tmp32_hi[i + 1], v_tmp32_hi[i + 3]); + } + + __m256i v_src[64]; + for (int d = 0, s = 0; d < 64; d += 16, s += 8) { + v_src[d + 0] = 
_mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x20); + v_src[d + 1] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x20); + v_src[d + 2] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x20); + v_src[d + 3] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x20); + + v_src[d + 4] = _mm256_permute2x128_si256(v_tmp64_lo[s + 0], v_tmp64_lo[s + 4], 0x31); + v_src[d + 5] = _mm256_permute2x128_si256(v_tmp64_hi[s + 0], v_tmp64_hi[s + 4], 0x31); + v_src[d + 6] = _mm256_permute2x128_si256(v_tmp64_lo[s + 2], v_tmp64_lo[s + 6], 0x31); + v_src[d + 7] = _mm256_permute2x128_si256(v_tmp64_hi[s + 2], v_tmp64_hi[s + 6], 0x31); + + v_src[d + 8] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x20); + v_src[d + 9] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x20); + v_src[d + 10] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x20); + v_src[d + 11] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x20); + + v_src[d + 12] = _mm256_permute2x128_si256(v_tmp64_lo[s + 1], v_tmp64_lo[s + 5], 0x31); + v_src[d + 13] = _mm256_permute2x128_si256(v_tmp64_hi[s + 1], v_tmp64_hi[s + 5], 0x31); + v_src[d + 14] = _mm256_permute2x128_si256(v_tmp64_lo[s + 3], v_tmp64_lo[s + 7], 0x31); + v_src[d + 15] = _mm256_permute2x128_si256(v_tmp64_hi[s + 3], v_tmp64_hi[s + 7], 0x31); + } + + __m256i v_tmp[64]; + for (int row = 0, d = 0; row < 32; ++row, d += 2) { + __m256i v_res_0 = _mm256_setzero_si256(); + __m256i v_res_1 = _mm256_setzero_si256(); + __m256i v_res_2 = _mm256_setzero_si256(); + __m256i v_res_3 = _mm256_setzero_si256(); + for (int i = 0; i < 16; ++i) { + const __m256i v_coeff = _mm256_set1_epi32(*c_ptr); + __m256i v_madd_0 = _mm256_madd_epi16(v_src[i + 0], v_coeff); + __m256i v_madd_1 = _mm256_madd_epi16(v_src[i + 16], v_coeff); + __m256i v_madd_2 = _mm256_madd_epi16(v_src[i + 32], v_coeff); + __m256i v_madd_3 = _mm256_madd_epi16(v_src[i + 48], v_coeff); + + v_res_0 = _mm256_add_epi32(v_madd_0, v_res_0); + v_res_1 = _mm256_add_epi32(v_madd_1, v_res_1); + v_res_2 = _mm256_add_epi32(v_madd_2, v_res_2); + v_res_3 = _mm256_add_epi32(v_madd_3, v_res_3); + c_ptr++; + } + + __m256i v_trunc_0 = truncate_avx2(v_res_0, debias, shift); + __m256i v_trunc_1 = truncate_avx2(v_res_1, debias, shift); + __m256i v_trunc_2 = truncate_avx2(v_res_2, debias, shift); + __m256i v_trunc_3 = truncate_avx2(v_res_3, debias, shift); + + v_tmp[d + 0] = _mm256_packs_epi32(v_trunc_0, v_trunc_1); + v_tmp[d + 1] = _mm256_packs_epi32(v_trunc_2, v_trunc_3); + } + + for (int i = 0; i < 64; ++i) { + v_tmp[i] = _mm256_permute4x64_epi64(v_tmp[i], _MM_SHUFFLE(3, 1, 2, 0)); + } + + __m256i v_result[64]; + transpose_avx2(v_tmp, v_result, 32, 32); + + for (int i = 0; i < 64; ++i) { + _mm256_store_si256((__m256i*)dst, v_result[i]); + dst += 16; + } +} + +void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver) +{ + const int width = 32; + const int height = 32; + + int skip_width = (hor != DCT2 && width == 32) ? 16 : width > 32 ? width - 32 : 0; + int skip_height = (ver != DCT2 && height == 32) ? 16 : (height > 32 ? 
height - 32 : 0); + + const int32_t shift_1st = INVERSE_SHIFT_1ST; + const int32_t shift_2nd = INVERSE_SHIFT_2ND; + + const int16_t* ver_coeff = uvg_g_dct_32_t; + const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor; + if (hor == DST7) { + hor_coeff = fi_dst7_32xN_coeff_hor; + } else if (hor == DCT8) { + hor_coeff = fi_dct8_32xN_coeff_hor; + } + if (ver == DST7) { + ver_coeff = uvg_g_dst7_32_t; + } else if (ver == DCT8) { + ver_coeff = uvg_g_dct8_32; + } + + __m256i v_ver_pass_out[64]; + fast_inverse_tr_32x32_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height); + + fast_inverse_tr_32x8_avx2_hor(v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width); +} + + +static dct_full_pass* dct_function_table[6][6] = { + { NULL, NULL, fast_forward_tr_2x8_avx2, fast_forward_tr_2x16_avx2, fast_forward_tr_2x32_avx2, NULL }, + { NULL, fast_forward_tr_4x4_avx2, fast_forward_tr_4x8_avx2, fast_forward_tr_4x16_avx2, fast_forward_tr_4x32_avx2, NULL }, + { fast_forward_tr_8x2_avx2, fast_forward_tr_8x4_avx2, fast_forward_tr_8x8_avx2, fast_forward_tr_8x16_avx2, fast_forward_tr_8x32_avx2, NULL }, + { fast_forward_tr_16x2_avx2, fast_forward_tr_16x4_avx2, fast_forward_tr_16x8_avx2, fast_forward_tr_16x16_avx2, fast_forward_tr_16x32_avx2, NULL }, + { fast_forward_tr_32x2_avx2, fast_forward_tr_32x4_avx2, fast_forward_tr_32x8_avx2, fast_forward_tr_32x16_avx2, fast_forward_tr_32x32_avx2, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL } +}; + + +static dct_full_pass* idct_function_table[6][6] = { + { NULL, NULL, fast_inverse_tr_2x8_avx2, fast_inverse_tr_2x16_avx2, fast_inverse_tr_2x32_avx2, NULL }, + { NULL, fast_inverse_tr_4x4_avx2, fast_inverse_tr_4x8_avx2, fast_inverse_tr_4x16_avx2, fast_inverse_tr_4x32_avx2, NULL }, + { fast_inverse_tr_8x2_avx2, fast_inverse_tr_8x4_avx2, fast_inverse_tr_8x8_avx2, fast_inverse_tr_8x16_avx2, fast_inverse_tr_8x32_avx2, NULL }, + { fast_inverse_tr_16x2_avx2, fast_inverse_tr_16x4_avx2, fast_inverse_tr_16x8_avx2, fast_inverse_tr_16x16_avx2, fast_inverse_tr_16x32_avx2, NULL }, + { fast_inverse_tr_32x2_avx2, fast_inverse_tr_32x4_avx2, fast_inverse_tr_32x8_avx2, fast_inverse_tr_32x16_avx2, fast_inverse_tr_32x32_avx2, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL }, +}; + extern void uvg_get_tr_type( int8_t width, @@ -1606,13 +8074,27 @@ static void mts_dct_avx2( dct_func* dct_func = uvg_get_dct_func(width, height, color, tu->type); dct_func(bitdepth, input, output); } - else - { - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; - - tr_func* dct = dct_table[log2_width_minus2]; - - dct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx); + else{ + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; + if (height == 1) { + if (width == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0); + } else if (width == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0); + } + } + else if (width == 1){ + if (height == 16) { + fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? 
ff_dct2_16xN_coeff_hor : ff_dst7_16xN_coeff_hor, 3, 1, 0, 0); + } else if (height == 32) { + fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, ff_dct2_32xN_coeff_hor, 4, 1, 0, 0); + } + } + else { + dct_full_pass* dct_func = dct_function_table[log2_width_minus1][log2_height_minus1]; + dct_func(input, output, type_hor, type_ver); + } } } @@ -1637,13 +8119,12 @@ static void mts_idct_avx2( dct_func* idct_func = uvg_get_idct_func(width, height, color, tu->type); idct_func(bitdepth, input, output); } - else - { - const int log2_width_minus2 = uvg_g_convert_to_bit[width]; + else { + const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1; + const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1; - tr_func* idct = idct_table[log2_width_minus2]; - - idct(input, output, type_hor, type_ver, bitdepth, tu->lfnst_idx); + dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1]; + idct_func(input, output, type_hor, type_ver); } } @@ -1658,19 +8139,19 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) if (bitdepth == 8){ //success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); - //success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); + success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); + success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); + success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); + success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); - //success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); + // success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); - //success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); + success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); + success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); + success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); + success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); - //success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); + success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); } diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h new file mode 100644 index 00000000..2233916b --- /dev/null +++ b/src/strategies/avx2/dct_avx2_tables.h @@ -0,0 +1,4785 @@ +#ifndef 
DCT_AVX2_TABLES_H +#define DCT_AVX2_TABLES_H + +#include "global.h" + +// Shuffle tables for simple avx2 functions + +ALIGNED(32) const int32_t ff_dct2_b4_permute_0[8] = { 0, 2, 4, 6, 0, 2, 4, 6 }; +ALIGNED(32) const int32_t ff_dct2_b4_permute_1[8] = { 1, 3, 5, 7, 1, 3, 5, 7 }; + +ALIGNED(32) const int32_t fi_dct2_b4_permute_0[8] = { 0, 0, 0, 0, 2, 2, 2, 2 }; +ALIGNED(32) const int32_t fi_dct2_b4_permute_1[8] = { 4, 4, 4, 4, 6, 6, 6, 6 }; +ALIGNED(32) const int32_t fi_dct2_b4_permute_2[8] = { 1, 1, 1, 1, 3, 3, 3, 3 }; +ALIGNED(32) const int32_t fi_dct2_b4_permute_3[8] = { 5, 5, 5, 5, 7, 7, 7, 7 }; + +ALIGNED(32) const int32_t ff_dct2_b32_permute[8][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 1}, + {2, 2, 2, 2, 2, 2, 2, 2}, + {3, 3, 3, 3, 3, 3, 3, 3}, + {4, 4, 4, 4, 4, 4, 4, 4}, + {5, 5, 5, 5, 5, 5, 5, 5}, + {6, 6, 6, 6, 6, 6, 6, 6}, + {7, 7, 7, 7, 7, 7, 7, 7}, +}; + + +// Coeff tables for simple avx2 functions + +ALIGNED(32) const int16_t fast_forward_dct2_b2_coeff[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +}; + + const int16_t* fast_inverse_dct2_b2_coeff = fast_forward_dct2_b2_coeff; // Inverse coeffs for this transform are same as forward + +// Coeff arrays for B4 +ALIGNED(32) const int16_t fast_forward_dct2_b4_coeff[64] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83, +-36, -83, -36, -83, -36, -83, -36, -83, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b4_coeff[64] = { + 29, 55, 29, 55, 29, 55, 29, 55, 84, -29, 84, -29, 84, -29, 84, -29, + 74, 84, 74, 84, 74, 84, 74, 84, -74, 55, -74, 55, -74, 55, -74, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 55, -84, 55, -84, 55, -84, 55, -84, + 0, -74, 0, -74, 0, -74, 0, -74, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b4_coeff[64] = { + 84, 74, 84, 74, 84, 74, 84, 74, 55, -74, 55, -74, 55, -74, 55, -74, + 55, 29, 55, 29, 55, 29, 55, 29, -29, 84, -29, 84, -29, 84, -29, 84, + 74, 0, 74, 0, 74, 0, 74, 0, 29, -74, 29, -74, 29, -74, 29, -74, +-74, -74, -74, -74, -74, -74, -74, -74, 84, -55, 84, -55, 84, -55, 84, -55, +}; + +// Coeff arrays for inverse B4 +ALIGNED(32) const int16_t fast_inverse_dct2_b4_coeff[64] = { + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, + 64, 83, 64, 36, 64, -36, 64, -83, 64, 83, 64, 36, 64, -36, 64, -83, + 64, 36, -64, -83, -64, 83, 64, -36, 64, 36, -64, -83, -64, 83, 64, -36, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b4_coeff[64] = { + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, + 29, 74, 55, 74, 74, 0, 84, -74, 29, 74, 55, 74, 74, 0, 84, -74, + 84, 55, -29, -84, -74, 74, 55, -29, 84, 55, -29, -84, -74, 74, 55, -29, +}; + +ALIGNED(32) const int16_t fast_inverse_dct8_b4_coeff[64] = { + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, + 84, 74, 74, 0, 55, -74, 29, -74, 84, 74, 74, 0, 55, -74, 29, -74, + 55, 29, -74, -74, -29, 84, 84, -55, 55, 29, -74, -74, -29, 84, 84, -55, +}; + +// Coeff arrays for forward B8 +ALIGNED(32) const int16_t fast_forward_dct2_b8_coeff[128] = { + 64, 64, 
89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b8_coeff[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71, +-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b8_coeff[128] = { + 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, + 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, + 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, + 32, 17, -78, -46, 85, 71, -46, -85, 32, 17, -78, -46, 85, 71, -46, -85, + 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46, +-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85, + 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78, +-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +// Coeff arrays for inverse B8 +ALIGNED(32) const int16_t fast_inverse_dct2_b8_coeff[128] = { + 64, 89, 64, 75, 64, 50, 64, 18, 64, 89, 64, 75, 64, 50, 64, 18, + 83, 75, 36, -18, -36, -89, -83, -50, 83, 75, 36, -18, -36, -89, -83, -50, + 64, 50, -64, -89, -64, 18, 64, 75, 64, 50, -64, -89, -64, 18, 64, 75, + 36, 18, -83, -50, 83, 75, -36, -89, 36, 18, -83, -50, 83, 75, -36, -89, + 64, -18, 64, -50, 64, -75, 64, -89, 64, -18, 64, -50, 64, -75, 64, -89, +-83, 50, -36, 89, 36, 18, 83, -75, -83, 50, -36, 89, 36, 18, 83, -75, + 64, -75, -64, -18, -64, 89, 64, -50, 64, -75, -64, -18, -64, 89, 64, -50, +-36, 89, 83, -75, -83, 50, 36, -18, -36, 89, 83, -75, -83, 50, 36, -18, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b8_coeff[128] = { + 17, 46, 32, 78, 46, 86, 60, 71, 17, 46, 32, 78, 46, 86, 60, 71, + 71, 85, 85, 46, 32, -60, -46, -78, 71, 85, 85, 46, 32, -60, -46, -78, + 86, 78, -17, -71, -85, -17, 32, 85, 86, 78, -17, -71, -85, -17, 32, 85, + 60, 32, -86, -60, 71, 78, -17, -86, 60, 32, -86, -60, 71, 78, -17, -86, + 71, 32, 78, -17, 85, -60, 86, -85, 71, 32, 78, -17, 85, -60, 86, -85, +-86, 17, -60, 86, 17, 32, 78, -71, -86, 17, -60, 86, 17, 32, 78, -71, + 78, -60, -46, -32, -71, 86, 60, -46, 78, -60, -46, -32, -71, 86, 60, -46, +-46, 85, 85, -71, -78, 46, 32, -17, -46, 85, 85, -71, -78, 46, 32, -17, +}; + + const int16_t* fast_inverse_dct8_b8_coeff = fast_forward_dct8_b8_coeff; // The table used in forward transform works with inverse also. 
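+
+// Layout note (derived from the tables above; the snippet below is an
+// illustrative sketch, not code from this patch): each 32-bit slot of a
+// coeff vector appears to hold two adjacent coefficients from one row of
+// the transform matrix, with the 16 values repeated in both 128-bit lanes,
+// presumably so each lane can serve a different input column. One
+// _mm256_madd_epi16 against a vector carrying the matching sample pair in
+// every slot then yields 32-bit partial dot products for four matrix rows
+// per lane, e.g. for the forward 8-point DCT-2 (src is a hypothetical
+// pointer to 8 int16_t samples; this loop covers output rows 0..3):
+//   __m256i v_sum = _mm256_setzero_si256();
+//   for (int k = 0; k < 4; ++k) { // column pairs (0,1)..(6,7)
+//     __m256i v_coeff = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b8_coeff[k * 16]);
+//     __m256i v_src = _mm256_set1_epi32(((int32_t)src[2 * k + 1] << 16) | (uint16_t)src[2 * k]);
+//     v_sum = _mm256_add_epi32(v_sum, _mm256_madd_epi16(v_src, v_coeff));
+//   }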
+ +// Coeff arrays for forward B16 +ALIGNED(32) const int16_t fast_forward_dct2_b16_coeff[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b16_coeff[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, -77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b16_coeff[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, -68, 88, -55, -25, -25, -68, 
85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + +// Coeff arrays for inverse B16 +ALIGNED(32) const int16_t fast_inverse_dct2_b16_coeff[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, -9, 64, -25, 64, -43, 64, -57, + 89, 87, 75, 57, 50, 9, 18, -43, -89, 25, -75, 70, -50, 90, -18, 80, + 83, 80, 36, 9, -36, -70, -83, -87, 83, -43, 36, -90, -36, -57, -83, 25, + 75, 70, -18, -43, -89, -87, -50, 9, -75, 57, 18, 80, 89, -25, 50, -90, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -70, -64, -43, -64, 87, 64, 9, + 50, 43, -89, -90, 18, 57, 75, 25, -50, 80, 89, -9, -18, -70, -75, 87, + 36, 25, -83, -70, 83, 90, -36, -80, 36, -87, -83, 57, 83, -9, -36, -43, + 18, 9, -50, -25, 75, 43, -89, -57, -18, 90, 50, -87, -75, 80, 89, -70, + 64, 57, 64, 43, 64, 25, 64, 9, 64, -70, 64, -80, 64, -87, 64, -90, + -18, -80, -50, -90, -75, -70, -89, -25, 18, 43, 50, -9, 75, -57, 89, -87, + -83, -25, -36, 57, 36, 90, 83, 43, -83, 87, -36, 70, 36, -9, 83, -80, + 50, 90, 89, 25, 18, -80, -75, -57, -50, -9, -89, 87, -18, 43, 75, -70, + 64, -9, -64, -87, -64, 43, 64, 70, 64, -90, -64, 25, -64, 80, 64, -57, + -75, -87, -18, 70, 89, 9, -50, -80, 75, -25, 18, -57, -89, 90, 50, -43, + -36, 43, 83, 9, -83, -57, 36, 87, -36, 80, 83, -90, -83, 70, 36, -25, + 89, 70, -75, -80, 50, 87, -18, -90, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b16_coeff[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 68, 48, 73, 25, 77, 0, 81, -25, // 0 + 40, 55, 73, 87, 88, 81, 85, 40, -81, -25, -88, 33, -77, 77, -48, 88, + 68, 77, 88, 77, 48, 0, -25, -77, 88, 0, 68, -77, 0, -77, -68, 0, + 85, 88, 55, 25, -48, -81, -87, -48, -88, 25, -17, 88, 77, 0, 68, -88, + 88, 87, -8, -40, -88, -68, 17, 73, 81, -48, -40, -62, -77, 77, 48, 25, + 81, 73, -68, -85, -25, 25, 88, 55, -68, 68, 81, 8, 0, -77, -81, 81, + 62, 48, -88, -81, 68, 88, -8, -68, 48, -81, -87, 48, 77, 0, -25, -48, + 33, 17, -62, -33, 81, 48, -88, -62, -25, 88, 55, -85, -77, 77, 88, -68, + 40, 88, 48, 88, 55, 81, 62, 68, 85, -48, 87, -68, 88, -81, 88, -88, // 8 + 62, -17, 25, -68, -17, -88, -55, -73, -8, 62, 33, 8, 68, -48, 87, -85, +-81, -77, -81, 0, -25, 77, 48, 77, -88, 77, -48, 77, 25, 0, 81, -77, + -8, 68, 81, 68, 62, -48, -40, -81, -33, -25, -88, 81, -25, 48, 73, -68, + 87, 33, -25, -88, -85, 8, 33, 85, 73, -88, -55, 17, -68, 81, 62, -55, +-48, -88, -48, 48, 88, 33, -25, -87, 68, -17, 25, -62, -88, 88, 48, -40, +-55, 25, 88, 25, -73, -68, 17, 88, -40, 81, 85, -88, -81, 68, 33, -25, + 85, 73, -68, -81, 40, 87, -8, -88, -87, 55, 73, -40, -48, 25, 17, -8, +}; + + const int16_t* fast_inverse_dct8_b16_coeff = fast_forward_dct8_b16_coeff; + +// Coeff arrays for forward B32 +ALIGNED(32) const int16_t fast_forward_dct2_b32_coeff[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, 
-64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 
75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t fast_forward_dst7_b32_coeff[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, 
-30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t fast_forward_dct8_b32_coeff[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, 
-4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +// Coeff arrays for inverse B32 +ALIGNED(32) const int16_t fast_inverse_dct2_b32_coeff[1024] = { + 64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, // 0 + 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, + 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, + -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, + 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, // 8 +-89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 
22, + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, +-89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, + 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, +-25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, // 16 + 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, + 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, +-43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, + 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, // 24 +-75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, +-75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, + 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, +-57, -4, -80, 90, 25, 22, 90, -85, 9, -38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, // 32 + 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, + 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, +-70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, + 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 54, -75, -90, -18, 31, 89, 61, -50, -88, // 40 +-50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, +-50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, + 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, +-80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, // 48 + 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, + 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, +-87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, + 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, // 56 +-18, 54, 50, -31, -75, 4, 89, 22, -89, -46, 75, 67, -50, -82, 18, 90, + 18, -90, -50, 82, 75, -67, 
-89, 46, 89, -22, -75, -4, 50, 31, -18, -54, +-18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, + 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, +-90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + +ALIGNED(32) const int16_t fast_inverse_dst7_b32_coeff[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, // 0 + 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, + 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, + 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, +-21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, // 8 +-63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, +-89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, +-53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, + 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, // 16 + 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, + 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, + 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, + 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, -21, 21, 74, 90, 74, 26, -21, -77, // 24 +-89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, +-84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, + 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, +-42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, // 32 + 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, + 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, +-34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, + 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, // 40 +-77, -13, 74, 84, 17, -68, -87, -17, 53, 85, 46, -66, -89, -21, 26, 86, + 68, -63, -80, -26, -4, 87, 84, -60, -63, 
-30, -34, 88, 90, -56, -38, -34, +-60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, + 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, +-74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, // 48 + 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, + 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, +-78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, + 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, // 56 +-30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, +-21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, + 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, +-89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + + const int16_t* fast_inverse_dct8_b32_coeff = fast_forward_dct8_b32_coeff; + + +// Shuffle tables for advanced and optimized avx2 functions + +// Shuffle 16 bit samples inside lanes. Put each sample four spaces from each other adjacent to each other. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | XX +// Output [0 4 1 5 2 6 3 7 | XX +ALIGNED(32) const int8_t shuffle_16b_0415[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +// Shuffle 16 bit samples inside lanes. Put each even indexed sample next to each other, then each odd sample. +// _mm256_shuffle_epi8 +// Input [0 1 2 3 4 5 6 7 | +// Output [0 2 4 6 1 3 5 7 | +ALIGNED(32) const int8_t shuffle_16b_0246[32] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, +}; + +// Permute 32 bit samples across lanes. Put each sample four spaces from each other adjacent to each other. 
+// _mm256_permutevar8x32_epi32
+// Input [0 1 2 3 | 4 5 6 7]
+// Output [0 4 1 5 | 2 6 3 7]
+ALIGNED(32) const int32_t permute_32b_0415[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+
+ const int8_t* fi_tr_2x8_shuffle_hor = shuffle_16b_0415;
+
+// Same pattern as shuffle_16b_0246: [0 2 4 6 1 3 5 7] within each lane.
+ALIGNED(32) const int8_t fi_tr_2x8_result_shuffle1_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+// Swap the two middle 16-bit samples in each 64-bit chunk.
+ALIGNED(32) const int8_t ff_dct2_2x8_shuffle_ver[32] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+ 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31
+};
+
+// Identical to ff_dct2_2x8_shuffle_ver.
+ALIGNED(32) const int8_t ff_dct2_2x8_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
+ 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31
+};
+
+// Same pattern as shuffle_16b_0415: [0 4 1 5 2 6 3 7] within each lane.
+ALIGNED(32) const int8_t fi_tr_2x8_result_shuffle2_ver[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Same pattern as shuffle_16b_0415.
+ALIGNED(32) const int8_t ff_dct2_2x16_ver_result_shuffle[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Same pattern as shuffle_16b_0415.
+ALIGNED(32) const int8_t fi_tr_4x4_shuffle_hor[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Same pattern as shuffle_16b_0246.
+ALIGNED(32) const int8_t fi_tr_4x4_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+// Same pattern as shuffle_16b_0246.
+ALIGNED(32) const int8_t fi_tr_4x8_result_shuffle_ver[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+// Same pattern as shuffle_16b_0415.
+ALIGNED(32) const int8_t ff_dct2_8x2_ver_pass_shuffle[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
+// Same pattern as shuffle_16b_0246.
+ALIGNED(32) const int8_t fi_tr_8x2_shuffle_hor[32] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+};
+
+// Same pattern as shuffle_16b_0415.
+ALIGNED(32) const int8_t fi_tr_8x2_shuffle_ver[32] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+ const int8_t* fi_tr_8x2_res_shuffle_ver = shuffle_16b_0415;
+
+// Words [0 4 2 6 1 5 3 7] within each lane.
+ALIGNED(32) const int8_t ff_dct2_8x4_ver_pass_shuffle[32] = {
+ 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15,
+ 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15,
+};
+
+// TODO: remove duplicate tables. Rename with a more descriptive name.
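+
+// Usage sketch for the shuffle/permute tables in this section (illustrative
+// only, not code from this patch; v_src stands for an arbitrary vector of
+// 16-bit samples):
+//   __m256i v_tbl = _mm256_load_si256((const __m256i*)shuffle_16b_0415);
+//   __m256i v_a = _mm256_shuffle_epi8(v_src, v_tbl); // reorder inside each 128-bit lane
+//   __m256i v_idx = _mm256_load_si256((const __m256i*)permute_32b_0415);
+//   __m256i v_b = _mm256_permutevar8x32_epi32(v_src, v_idx); // reorder across lanes
+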
+ALIGNED(32) const int8_t ff_dct2_8x4_ver_pass_result_shuffle[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, +}; + +ALIGNED(32) const int8_t ff_dct2_8x16_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) const int8_t ff_dct2_8x16_butterfly_shuffle_order[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +// Arrange samples into butterfly formation +ALIGNED(32) const int8_t ff_dct2_16x8_butterfly_shuffle[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +// Swap two middle 16-bit values in each 64-bit chunk +ALIGNED(32) const int8_t ff_dct2_16x8_butterfly_res_shuffle_ver[32] = { + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, + 16, 17, 20, 21, 18, 19, 22, 23, 24, 25, 28, 29, 26, 27, 30, 31 +}; + +ALIGNED(32) const int8_t ff_dct2_16x32_reverse_64b_order[32] = { + 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, + 22, 23, 20, 21, 18, 19, 16, 17, 30, 31, 28, 29, 26, 27, 24, 25, +}; + +ALIGNED(32) const int8_t ff_dct2_32x2_butterfly_order_shuffle[32] = { + 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, + 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 +}; + +ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_order[32] = { + 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9, + 16, 17, 30, 31, 18, 19, 28, 29, 20, 21, 26, 27, 22, 23, 24, 25 +}; + +ALIGNED(32) const int8_t ff_dct2_32x8_shuffle_result[32] = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +}; + + +// Coeff tables for advanced and optimized avx2 functions + +// 2xN +ALIGNED(32) const int16_t ff_dct2_2xN_coeff_hor[32] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64 +}; + +ALIGNED(32) const int16_t ff_dct2_2x8_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, -18, -50, -75, -89, 89, 75, 50, 18, -18, -50, -75, -89, + 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, + 75, -18, -89, -50, 50, 89, 18, -75, 75, -18, -89, -50, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, -75, -18, 89, -50, 50, -89, 18, 75, -75, -18, 89, -50, + 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, + 18, -50, 75, -89, 89, -75, 50, -18, 18, -50, 75, -89, 89, -75, 50, -18 +}; + +ALIGNED(32) +const int16_t ff_dst7_2x8_coeff_ver[128] = { + 17, 32, 46, 60, 71, 78, 85, 86, 17, 32, 46, 60, 71, 78, 85, 86, + 46, 78, 86, 71, 32, -17, -60, -85, 46, 78, 86, 71, 32, -17, -60, -85, + 71, 85, 32, -46, -86, -60, 17, 78, 71, 85, 32, -46, -86, -60, 17, 78, + 85, 46, -60, -78, 17, 86, 32, -71, 85, 46, -60, -78, 17, 86, 32, -71, + 86, -17, -85, 32, 78, -46, -71, 60, 86, -17, -85, 32, 78, -46, -71, 60, + 78, -71, -17, 85, -60, -32, 86, -46, 78, -71, -17, 85, -60, -32, 86, -46, + 60, -86, 71, -17, -46, 85, -78, 32, 60, -86, 71, -17, -46, 85, -78, 32, + 32, -60, 78, -86, 85, -71, 46, -17, 32, -60, 78, -86, 85, -71, 46, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 75, 36, -18, 64, 75, 36, -18, + 64, 50, 36, 18, 64, 
50, 36, 18, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 18, -83, -50, 64, 18, -83, -50, +-64, 18, 83, 75, -64, 18, 83, 75, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -50, -36, 89, 64, -50, -36, 89, + 64, -75, -36, 89, 64, -75, -36, 89, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -89, 83, -75, 64, -89, 83, -75, +-64, 89, -83, 50, -64, 89, -83, 50, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, -64, -80, -89, -90, -64, -80, -89, -90, + 36, 9, -18, -43, 36, 9, -18, -43, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, -64, -25, 18, 57, -64, -25, 18, 57, +-36, -70, -89, -87, -36, -70, -89, -87, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 90, 75, 25, 64, 90, 75, 25, +-83, -87, -50, 9, -83, -87, -50, 9, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, -9, -75, -87, 64, -9, -75, -87, // 8 +-83, -25, 50, 90, -83, -25, 50, 90, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, -64, -87, -18, 70, -64, -87, -18, 70, +-36, 57, 89, 25, -36, 57, 89, 25, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, -64, 43, 89, 9, -64, 43, 89, 9, + 36, 90, 18, -80, 36, 90, 18, -80, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 70, -50, -80, 64, 70, -50, -80, + 83, 43, -75, -57, 83, 43, -75, -57, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -70, -50, 80, 64, -70, -50, 80, // 16 + 83, -43, -75, 57, 83, -43, -75, 57, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, -64, -43, 89, -9, -64, -43, 89, -9, + 36, -90, 18, 80, 36, -90, 18, 80, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, -64, 87, -18, -70, -64, 87, -18, -70, +-36, -57, 89, -25, -36, -57, 89, -25, 83, -9, -75, 80, 83, -9, -75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, 9, -75, 87, 64, 9, -75, 87, +-83, 25, 50, -90, -83, 25, 50, -90, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -90, 75, -25, 64, -90, 75, -25, // 24 +-83, 87, -50, -9, -83, 87, -50, -9, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, -64, 25, 18, -57, -64, 25, 18, -57, +-36, 70, -89, 87, -36, 70, -89, 87, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, -64, 80, -89, 90, -64, 80, -89, 90, + 36, -9, -18, 43, 36, -9, -18, 43, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -57, 50, -43, 64, -57, 50, -43, + 83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = { + 64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0 + 83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67, + 64, 61, 57, 54, 50, 46, 43, 38, 64, 61, 57, 54, 50, 46, 43, 38, + 36, 31, 25, 22, 18, 13, 9, 4, 36, 31, 25, 22, 18, 13, 9, 4, + 64, 90, 87, 82, 75, 67, 57, 46, 64, 90, 87, 82, 75, 67, 57, 46, + 36, 22, 9, -4, -18, -31, -43, -54, 36, 22, 9, -4, -18, -31, -43, -54, +-64, -73, -80, -85, -89, -90, -90, -88, -64, -73, -80, -85, -89, -90, -90, -88, +-83, -78, -70, -61, -50, -38, -25, -13, -83, -78, -70, -61, -50, -38, -25, -13, + 64, 88, 80, 67, 50, 31, 9, 
-13, 64, 88, 80, 67, 50, 31, 9, -13, // 8 +-36, -54, -70, -82, -89, -90, -87, -78, -36, -54, -70, -82, -89, -90, -87, -78, +-64, -46, -25, -4, 18, 38, 57, 73, -64, -46, -25, -4, 18, 38, 57, 73, + 83, 90, 90, 85, 75, 61, 43, 22, 83, 90, 90, 85, 75, 61, 43, 22, + 64, 85, 70, 46, 18, -13, -43, -67, 64, 85, 70, 46, 18, -13, -43, -67, +-83, -90, -87, -73, -50, -22, 9, 38, -83, -90, -87, -73, -50, -22, 9, 38, + 64, 82, 90, 88, 75, 54, 25, -4, 64, 82, 90, 88, 75, 54, 25, -4, +-36, -61, -80, -90, -89, -78, -57, -31, -36, -61, -80, -90, -89, -78, -57, -31, + 64, 82, 57, 22, -18, -54, -80, -90, 64, 82, 57, 22, -18, -54, -80, -90, // 16 +-83, -61, -25, 13, 50, 78, 90, 85, -83, -61, -25, 13, 50, 78, 90, 85, + 64, 31, -9, -46, -75, -90, -87, -67, 64, 31, -9, -46, -75, -90, -87, -67, +-36, 4, 43, 73, 89, 88, 70, 38, -36, 4, 43, 73, 89, 88, 70, 38, + 64, 78, 43, -4, -50, -82, -90, -73, 64, 78, 43, -4, -50, -82, -90, -73, +-36, 13, 57, 85, 89, 67, 25, -22, -36, 13, 57, 85, 89, 67, 25, -22, +-64, -88, -87, -61, -18, 31, 70, 90, -64, -88, -87, -61, -18, 31, 70, 90, + 83, 54, 9, -38, -75, -90, -80, -46, 83, 54, 9, -38, -75, -90, -80, -46, + 64, 73, 25, -31, -75, -90, -70, -22, 64, 73, 25, -31, -75, -90, -70, -22, // 24 + 36, 78, 90, 67, 18, -38, -80, -90, 36, 78, 90, 67, 18, -38, -80, -90, +-64, -13, 43, 82, 89, 61, 9, -46, -64, -13, 43, 82, 89, 61, 9, -46, +-83, -88, -57, -4, 50, 85, 87, 54, -83, -88, -57, -4, 50, 85, 87, 54, + 64, 67, 9, -54, -89, -78, -25, 38, 64, 67, 9, -54, -89, -78, -25, 38, + 83, 85, 43, -22, -75, -90, -57, 4, 83, 85, 43, -22, -75, -90, -57, 4, + 64, 90, 70, 13, -50, -88, -80, -31, 64, 90, 70, 13, -50, -88, -80, -31, + 36, 82, 87, 46, -18, -73, -90, -61, 36, 82, 87, 46, -18, -73, -90, -61, + 64, 61, -9, -73, -89, -46, 25, 82, 64, 61, -9, -73, -89, -46, 25, 82, // 32 + 83, 31, -43, -88, -75, -13, 57, 90, 83, 31, -43, -88, -75, -13, 57, 90, + 64, -4, -70, -90, -50, 22, 80, 85, 64, -4, -70, -90, -50, 22, 80, 85, + 36, -38, -87, -78, -18, 54, 90, 67, 36, -38, -87, -78, -18, 54, 90, 67, + 64, 54, -25, -85, -75, -4, 70, 88, 64, 54, -25, -85, -75, -4, 70, 88, + 36, -46, -90, -61, 18, 82, 80, 13, 36, -46, -90, -61, 18, 82, 80, 13, +-64, -90, -43, 38, 89, 67, -9, -78, -64, -90, -43, 38, 89, 67, -9, -78, +-83, -22, 57, 90, 50, -31, -87, -73, -83, -22, 57, 90, 50, -31, -87, -73, + 64, 46, -43, -90, -50, 38, 90, 54, 64, 46, -43, -90, -50, 38, 90, 54, // 40 +-36, -90, -57, 31, 89, 61, -25, -88, -36, -90, -57, 31, 89, 61, -25, -88, +-64, 22, 87, 67, -18, -85, -70, 13, -64, 22, 87, 67, -18, -85, -70, 13, + 83, 73, -9, -82, -75, 4, 80, 78, 83, 73, -9, -82, -75, 4, 80, 78, + 64, 38, -57, -88, -18, 73, 80, -4, 64, 38, -57, -88, -18, 73, 80, -4, +-83, -67, 25, 90, 50, -46, -90, -31, -83, -67, 25, 90, 50, -46, -90, -31, + 64, 85, 9, -78, -75, 13, 87, 61, 64, 85, 9, -78, -75, 13, 87, 61, +-36, -90, -43, 54, 89, 22, -70, -82, -36, -90, -43, 54, 89, 22, -70, -82, + 64, 31, -70, -78, 18, 90, 43, -61, 64, 31, -70, -78, 18, 90, 43, -61, // 48 +-83, 4, 87, 54, -50, -88, -9, 82, -83, 4, 87, 54, -50, -88, -9, 82, + 64, -38, -90, -22, 75, 73, -25, -90, 64, -38, -90, -22, 75, 73, -25, -90, +-36, 67, 80, -13, -89, -46, 57, 85, -36, 67, 80, -13, -89, -46, 57, 85, + 64, 22, -80, -61, 50, 85, -9, -90, 64, 22, -80, -61, 50, 85, -9, -90, +-36, 73, 70, -38, -89, -4, 87, 46, -36, 73, 70, -38, -89, -4, 87, 46, +-64, -78, 25, 90, 18, -82, -57, 54, -64, -78, 25, 90, 18, -82, -57, 54, + 83, -13, -90, -31, 75, 67, -43, -88, 83, -13, -90, -31, 75, 67, -43, -88, + 64, 13, -87, -38, 75, 61, -57, -78, 64, 13, -87, -38, 75, 61, -57, 
-78, // 56 + 36, 88, -9, -90, -18, 85, 43, -73, 36, 88, -9, -90, -18, 85, 43, -73, +-64, 54, 80, -31, -89, 4, 90, 22, -64, 54, 80, -31, -89, 4, 90, 22, +-83, -46, 70, 67, -50, -82, 25, 90, -83, -46, 70, 67, -50, -82, 25, 90, + 64, 4, -90, -13, 89, 22, -87, -31, 64, 4, -90, -13, 89, 22, -87, -31, + 83, 38, -80, -46, 75, 54, -70, -61, 83, 38, -80, -46, 75, 54, -70, -61, + 64, 67, -57, -73, 50, 78, -43, -82, 64, 67, -57, -73, 50, 78, -43, -82, + 36, 85, -25, -88, 18, 90, -9, -90, 36, 85, -25, -88, 18, 90, -9, -90, + 64, -4, -90, 13, 89, -22, -87, 31, 64, -4, -90, 13, 89, -22, -87, 31, // 64 + 83, -38, -80, 46, 75, -54, -70, 61, 83, -38, -80, 46, 75, -54, -70, 61, + 64, -67, -57, 73, 50, -78, -43, 82, 64, -67, -57, 73, 50, -78, -43, 82, + 36, -85, -25, 88, 18, -90, -9, 90, 36, -85, -25, 88, 18, -90, -9, 90, + 64, -13, -87, 38, 75, -61, -57, 78, 64, -13, -87, 38, 75, -61, -57, 78, + 36, -88, -9, 90, -18, -85, 43, 73, 36, -88, -9, 90, -18, -85, 43, 73, +-64, -54, 80, 31, -89, -4, 90, -22, -64, -54, 80, 31, -89, -4, 90, -22, +-83, 46, 70, -67, -50, 82, 25, -90, -83, 46, 70, -67, -50, 82, 25, -90, + 64, -22, -80, 61, 50, -85, -9, 90, 64, -22, -80, 61, 50, -85, -9, 90, // 72 +-36, -73, 70, 38, -89, 4, 87, -46, -36, -73, 70, 38, -89, 4, 87, -46, +-64, 78, 25, -90, 18, 82, -57, -54, -64, 78, 25, -90, 18, 82, -57, -54, + 83, 13, -90, 31, 75, -67, -43, 88, 83, 13, -90, 31, 75, -67, -43, 88, + 64, -31, -70, 78, 18, -90, 43, 61, 64, -31, -70, 78, 18, -90, 43, 61, +-83, -4, 87, -54, -50, 88, -9, -82, -83, -4, 87, -54, -50, 88, -9, -82, + 64, 38, -90, 22, 75, -73, -25, 90, 64, 38, -90, 22, 75, -73, -25, 90, +-36, -67, 80, 13, -89, 46, 57, -85, -36, -67, 80, 13, -89, 46, 57, -85, + 64, -38, -57, 88, -18, -73, 80, 4, 64, -38, -57, 88, -18, -73, 80, 4, // 80 +-83, 67, 25, -90, 50, 46, -90, 31, -83, 67, 25, -90, 50, 46, -90, 31, + 64, -85, 9, 78, -75, -13, 87, -61, 64, -85, 9, 78, -75, -13, 87, -61, +-36, 90, -43, -54, 89, -22, -70, 82, -36, 90, -43, -54, 89, -22, -70, 82, + 64, -46, -43, 90, -50, -38, 90, -54, 64, -46, -43, 90, -50, -38, 90, -54, +-36, 90, -57, -31, 89, -61, -25, 88, -36, 90, -57, -31, 89, -61, -25, 88, +-64, -22, 87, -67, -18, 85, -70, -13, -64, -22, 87, -67, -18, 85, -70, -13, + 83, -73, -9, 82, -75, -4, 80, -78, 83, -73, -9, 82, -75, -4, 80, -78, + 64, -54, -25, 85, -75, 4, 70, -88, 64, -54, -25, 85, -75, 4, 70, -88, // 88 + 36, 46, -90, 61, 18, -82, 80, -13, 36, 46, -90, 61, 18, -82, 80, -13, +-64, 90, -43, -38, 89, -67, -9, 78, -64, 90, -43, -38, 89, -67, -9, 78, +-83, 22, 57, -90, 50, 31, -87, 73, -83, 22, 57, -90, 50, 31, -87, 73, + 64, -61, -9, 73, -89, 46, 25, -82, 64, -61, -9, 73, -89, 46, 25, -82, + 83, -31, -43, 88, -75, 13, 57, -90, 83, -31, -43, 88, -75, 13, 57, -90, + 64, 4, -70, 90, -50, -22, 80, -85, 64, 4, -70, 90, -50, -22, 80, -85, + 36, 38, -87, 78, -18, -54, 90, -67, 36, 38, -87, 78, -18, -54, 90, -67, + 64, -67, 9, 54, -89, 78, -25, -38, 64, -67, 9, 54, -89, 78, -25, -38, // 96 + 83, -85, 43, 22, -75, 90, -57, -4, 83, -85, 43, 22, -75, 90, -57, -4, + 64, -90, 70, -13, -50, 88, -80, 31, 64, -90, 70, -13, -50, 88, -80, 31, + 36, -82, 87, -46, -18, 73, -90, 61, 36, -82, 87, -46, -18, 73, -90, 61, + 64, -73, 25, 31, -75, 90, -70, 22, 64, -73, 25, 31, -75, 90, -70, 22, + 36, -78, 90, -67, 18, 38, -80, 90, 36, -78, 90, -67, 18, 38, -80, 90, +-64, 13, 43, -82, 89, -61, 9, 46, -64, 13, 43, -82, 89, -61, 9, 46, +-83, 88, -57, 4, 50, -85, 87, -54, -83, 88, -57, 4, 50, -85, 87, -54, + 64, -78, 43, 4, -50, 82, -90, 73, 64, -78, 43, 4, -50, 82, -90, 73, // 104 +-36, -13, 57, -85, 
89, -67, 25, 22, -36, -13, 57, -85, 89, -67, 25, 22, +-64, 88, -87, 61, -18, -31, 70, -90, -64, 88, -87, 61, -18, -31, 70, -90, + 83, -54, 9, 38, -75, 90, -80, 46, 83, -54, 9, 38, -75, 90, -80, 46, + 64, -82, 57, -22, -18, 54, -80, 90, 64, -82, 57, -22, -18, 54, -80, 90, +-83, 61, -25, -13, 50, -78, 90, -85, -83, 61, -25, -13, 50, -78, 90, -85, + 64, -31, -9, 46, -75, 90, -87, 67, 64, -31, -9, 46, -75, 90, -87, 67, +-36, -4, 43, -73, 89, -88, 70, -38, -36, -4, 43, -73, 89, -88, 70, -38, + 64, -85, 70, -46, 18, 13, -43, 67, 64, -85, 70, -46, 18, 13, -43, 67, // 112 +-83, 90, -87, 73, -50, 22, 9, -38, -83, 90, -87, 73, -50, 22, 9, -38, + 64, -82, 90, -88, 75, -54, 25, 4, 64, -82, 90, -88, 75, -54, 25, 4, +-36, 61, -80, 90, -89, 78, -57, 31, -36, 61, -80, 90, -89, 78, -57, 31, + 64, -88, 80, -67, 50, -31, 9, 13, 64, -88, 80, -67, 50, -31, 9, 13, +-36, 54, -70, 82, -89, 90, -87, 78, -36, 54, -70, 82, -89, 90, -87, 78, +-64, 46, -25, 4, 18, -38, 57, -73, -64, 46, -25, 4, 18, -38, 57, -73, + 83, -90, 90, -85, 75, -61, 43, -22, 83, -90, 90, -85, 75, -61, 43, -22, + 64, -90, 87, -82, 75, -67, 57, -46, 64, -90, 87, -82, 75, -67, 57, -46, // 120 + 36, -22, 9, 4, -18, 31, -43, 54, 36, -22, 9, 4, -18, 31, -43, 54, +-64, 73, -80, 85, -89, 90, -90, 88, -64, 73, -80, 85, -89, 90, -90, 88, +-83, 78, -70, 61, -50, 38, -25, 13, -83, 78, -70, 61, -50, 38, -25, 13, + 64, -90, 90, -90, 89, -88, 87, -85, 64, -90, 90, -90, 89, -88, 87, -85, + 83, -82, 80, -78, 75, -73, 70, -67, 83, -82, 80, -78, 75, -73, 70, -67, + 64, -61, 57, -54, 50, -46, 43, -38, 64, -61, 57, -54, 50, -46, 43, -38, + 36, -31, 25, -22, 18, -13, 9, -4, 36, -31, 25, -22, 18, -13, 9, -4, +}; + + +// 4xN +ALIGNED(32) const int16_t ff_dct2_4x8_coeff_ver[256] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, 89, 75, 50, 18, +-18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, -18, -50, -75, -89, + 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, 83, 36, -36, -83, +-83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, -83, -36, 36, 83, + 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, 75, -18, -89, -50, + 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, 50, 89, 18, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, // 8 + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, 50, -89, 18, 75, +-75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, -75, -18, 89, -50, + 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, 36, -83, 83, -36, +-36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, -36, 83, -83, 36, + 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, 18, -50, 75, -89, + 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, 89, -75, 50, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_4x8_coeff_ver[256] = { + 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, 17, 32, 46, 60, // 0 + 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, 71, 78, 85, 86, + 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, 46, 78, 86, 71, + 32, -17, -60, -85, 32, -17, -60, -85, 32, -17, -60, -85, 32, -17, -60, -85, + 71, 85, 32, -46, 71, 85, 32, -46, 71, 85, 32, -46, 71, 85, 32, -46, +-86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, -86, -60, 17, 78, + 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, 85, 46, -60, -78, + 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, 17, 86, 32, -71, + 86, -17, 
-85, 32, 86, -17, -85, 32, 86, -17, -85, 32, 86, -17, -85, 32, // 8 + 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, 78, -46, -71, 60, + 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, 78, -71, -17, 85, +-60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, -60, -32, 86, -46, + 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, 60, -86, 71, -17, +-46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, -46, 85, -78, 32, + 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, 32, -60, 78, -86, + 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, 85, -71, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_4x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_4xN_coeff_hor[64] = { + 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, 64, 83, 64, 36, + 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, 64, 36, -64, -83, + 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, 64, -36, -64, 83, + 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, 64, -83, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4xN_coeff_hor[64] = { + 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, 29, 74, 84, 55, + 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, 55, 74, -29, -84, + 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, 74, 0, -74, 74, + 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, 84, -74, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4xN_coeff_hor[64] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, 84, 74, 55, 29, + 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, 74, 0, -74, -74, + 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, 55, -74, -29, 84, + 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, 29, -74, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x8_coeff_hor[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, +-83, -36, -83, -36, -83, -36, -83, -36, -83, 
-36, -83, -36, -83, -36, -83, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4x8_coeff_hor[128] = { + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, + 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, 74, -84, + 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, +-74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, -74, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4x8_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, + 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, 29, 84, + 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, +-74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, -74, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_4x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, 
-60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_4x8_coeff_ver[256] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x16_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4x16_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4x16_coeff_hor[128] = { // TODO: this is probably identical to forward table, remove this if unnecessary + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 
84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x16_coeff_ver[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_4x16_coeff_ver[512] = { + 8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0 + 88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17, + 17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25, + -8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33, + 25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81, +-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48, + 33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48, + 17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62, + 40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8 + 87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73, + 48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68, +-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81, + 55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48, +-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87, + 62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 
77, -40, -81, + 33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88, + 68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16 + 81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88, + 73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88, +-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85, + 77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0, +-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77, + 81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88, + 48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68, + 85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24 + 73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55, + 87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81, +-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40, + 88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48, +-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25, + 88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68, + 62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_4x16_coeff_ver[512] = { + 88, 88, 87, 85, 81, 77, 73, 68, 88, 88, 87, 85, 81, 77, 73, 68, // 0 + 62, 55, 48, 40, 33, 25, 17, 8, 62, 55, 48, 40, 33, 25, 17, 8, + 88, 81, 68, 48, 25, 0, -25, -48, 88, 81, 68, 48, 25, 0, -25, -48, +-68, -81, -88, -88, -81, -68, -48, -25, -68, -81, -88, -88, -81, -68, -48, -25, + 87, 68, 33, -8, -48, -77, -88, -81, 87, 68, 33, -8, -48, -77, -88, -81, +-55, -17, 25, 62, 85, 88, 73, 40, -55, -17, 25, 62, 85, 88, 73, 40, + 85, 48, -8, -62, -88, -77, -33, 25, 85, 48, -8, -62, -88, -77, -33, 25, + 73, 88, 68, 17, -40, -81, -87, -55, 73, 88, 68, 17, -40, -81, -87, -55, + 81, 25, -48, -88, -68, 0, 68, 88, 81, 25, -48, -88, -68, 0, 68, 88, // 8 + 48, -25, -81, -81, -25, 48, 88, 68, 48, -25, -81, -81, -25, 48, 88, 68, + 77, 0, -77, -77, 0, 77, 77, 0, 77, 0, -77, -77, 0, 77, 77, 0, +-77, -77, 0, 77, 77, 0, -77, -77, -77, -77, 0, 77, 77, 0, -77, -77, + 73, -25, -88, -33, 68, 77, -17, -88, 73, -25, -88, -33, 68, 77, -17, -88, +-40, 62, 81, -8, -87, -48, 55, 85, -40, 62, 81, -8, -87, -48, 55, 85, + 68, -48, -81, 25, 88, 0, -88, -25, 68, -48, -81, 25, 88, 0, -88, -25, + 81, 48, -68, -68, 48, 81, -25, -88, 81, 48, -68, -68, 48, 81, -25, -88, + 62, -68, -55, 73, 48, -77, -40, 81, 62, -68, -55, 73, 48, -77, -40, 81, // 16 + 33, -85, -25, 87, 17, -88, -8, 88, 33, -85, -25, 87, 17, -88, -8, 88, + 55, -81, -17, 88, -25, -77, 62, 48, 55, -81, -17, 88, -25, -77, 62, 48, +-85, -8, 88, -33, -73, 68, 40, -87, -85, -8, 88, -33, -73, 68, 40, -87, + 48, -88, 25, 68, -81, 0, 81, -68, 48, -88, 25, 68, -81, 0, 81, -68, +-25, 88, -48, -48, 88, -25, -68, 81, -25, 88, -48, -48, 88, -25, -68, 81, + 40, -88, 62, 17, -81, 77, -8, -68, 40, -88, 62, 17, -81, 77, -8, -68, + 87, -33, -48, 88, -55, -25, 85, -73, 87, -33, -48, 88, -55, -25, 85, -73, + 33, -81, 85, -40, -25, 77, -87, 48, 33, -81, 85, -40, -25, 77, -87, 48, // 24 + 17, -73, 88, -55, -8, 68, -88, 62, 17, -73, 88, -55, -8, 68, -88, 62, + 25, -68, 88, -81, 48, 0, -48, 81, 25, -68, 88, -81, 48, 0, -48, 81, +-88, 68, -25, -25, 68, -88, 81, -48, -88, 68, -25, -25, 68, -88, 81, -48, + 17, -48, 73, -87, 88, -77, 55, -25, 17, -48, 73, -87, 88, -77, 55, -25, + -8, 40, -68, 85, -88, 81, -62, 33, -8, 40, -68, 85, -88, 81, -62, 33, + 8, -25, 40, -55, 68, -77, 85, -88, 8, -25, 40, -55, 68, -77, 
85, -88, + 88, -87, 81, -73, 62, -48, 33, -17, 88, -87, 81, -73, 62, -48, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_4x32_coeff_hor[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_4x32_coeff_hor[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_4x32_coeff_hor[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +// 8xN +ALIGNED(32) const int16_t ff_dct2_8xN_coeff_hor[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, +-64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, +-64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) const int16_t ff_dst7_8xN_coeff_hor[128] = { + 17, 32, 46, 78, 71, 85, 85, 46, 17, 32, 46, 78, 71, 85, 85, 46, + 46, 60, 86, 71, 32, -46, -60, -78, 46, 60, 86, 71, 32, -46, -60, -78, + 71, 78, 32, -17, -86, -60, 17, 86, 71, 78, 32, -17, -86, -60, 17, 86, + 85, 86, -60, -85, 17, 78, 32, -71, 85, 86, -60, -85, 17, 78, 32, -71, + 86, -17, 78, -71, 60, -86, 32, -60, 86, -17, 78, -71, 60, -86, 32, -60, +-85, 32, -17, 85, 71, -17, 78, -86, -85, 32, -17, 85, 71, -17, 78, -86, + 78, -46, -60, -32, -46, 85, 85, -71, 78, -46, -60, -32, -46, 85, 85, -71, +-71, 60, 86, -46, -78, 32, 46, -17, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_8xN_coeff_hor[128] = { + 86, 85, 85, 60, 78, 17, 71, -32, 86, 85, 85, 60, 78, 17, 71, -32, + 78, 71, 17, -32, -60, -86, -86, -17, 78, 71, 17, -32, -60, -86, -86, -17, + 60, 46, -71, -86, -46, 32, 78, 60, 60, 46, -71, -86, -46, 32, 78, 60, 
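+ // Layout note: each 256-bit row of these ff_* horizontal coefficient tables
+ // appears to pack transform-matrix entries as (m[r][c], m[r][c+1]) pairs for
+ // four output rows r, duplicated into both 128-bit lanes, so a single
+ // _mm256_madd_epi16 against matching duplicated sample pairs (s[c], s[c+1])
+ // yields 32-bit partial dot products. A minimal consumption sketch, assuming
+ // this usage (v_src_pairs, v_acc, and coeff_row are illustrative names, not
+ // code from this patch):
+ //   __m256i v_coeff = _mm256_load_si256((const __m256i*)coeff_row);
+ //   __m256i v_prod  = _mm256_madd_epi16(v_src_pairs, v_coeff);
+ //   v_acc           = _mm256_add_epi32(v_acc, v_prod); // accumulate over column pairs c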
+ 32, 17, -78, -46, 85, 71, -46, -85, 32, 17, -78, -46, 85, 71, -46, -85, + 60, -71, 46, -86, 32, -78, 17, -46, 60, -71, 46, -86, 32, -78, 17, -46, +-46, 78, 32, 60, 85, -46, 71, -85, -46, 78, 32, 60, 85, -46, 71, -85, + 32, -85, -85, 17, -17, 71, 86, -78, 32, -85, -85, 17, -17, 71, 86, -78, +-17, 86, 71, -78, -86, 60, 60, -32, -17, 86, 71, -78, -86, 60, 60, -32, +}; + + + const int16_t* ff_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = { + 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, + 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, + 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, + 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, + 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18, +}; + + const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t ff_dct2_8x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t ff_dst7_8x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t ff_dct8_8x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_8x4_coeff_hor[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 
18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_8x4_coeff_hor[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_8x4_coeff_hor[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_8x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, 
-83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_8x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + + const int16_t* fi_dct8_8x4_coeff_ver = ff_dct8_8x4_coeff_ver; // Duplicate table + + +ALIGNED(32) const int16_t ff_dct2_8x8_coeff_ver[64] = { + 64, 64, 64, 64, 64, 64, 64, 64, 89, 50, 75, 18, -18, -75, -50, -89, + 83, -36, 36, -83, -83, 36, -36, 83, 75, -89, -18, -50, 50, 18, 89, -75, + 64, -64, -64, 64, 64, -64, -64, 64, 50, 18, -89, 75, -75, 89, -18, -50, + 36, 83, -83, -36, -36, -83, 83, 36, 18, 75, -50, -89, 89, 50, -75, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_8x8_coeff_ver[64] = { + 17, 46, 32, 60, 71, 85, 78, 86, 46, 86, 78, 71, 32, -60, -17, -85, + 71, 32, 85, -46, -86, 17, -60, 78, 85, -60, 46, -78, 17, 32, 86, -71, + 86, -85, -17, 32, 78, -71, -46, 60, 78, -17, -71, 85, -60, 86, -32, -46, + 60, 71, -86, -17, -46, -78, 85, 32, 32, 78, -60, -86, 85, 46, -71, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_8x8_coeff_ver[64] = { + 86, 78, 85, 71, 60, 32, 46, 17, 85, 17, 60, -32, -71, -78, -86, -46, + 78, -60, 17, -86, -46, 85, 32, 71, 71, -86, -32, -17, 78, -46, 60, -85, + 60, -46, -71, 78, 32, -17, -85, 86, 46, 32, -86, 60, -85, 71, 17, -78, + 32, 85, -78, -46, -17, -86, 71, 60, 17, 71, -46, -85, 86, 60, -78, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_8x8_coeff_hor[512] = { + 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, 64, 89, // 0 + 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, + 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, + 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, + 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, + 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, +-64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, -64, -89, +-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, + 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, 64, 50, // 8 +-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, +-64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, -64, 18, + 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, 83, 75, + 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, 64, 18, +-83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, -83, -50, + 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, 64, 75, +-36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, -36, -89, + 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, 64, -18, // 16 +-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, + 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 
64, -75, 64, -75, 64, -75, +-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, + 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, +-36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, -36, 89, +-64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, -64, -18, + 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, + 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, 64, -75, // 24 + 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, 36, 18, +-64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, -64, 89, +-83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, -83, 50, + 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, 64, -89, + 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, 83, -75, + 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, 64, -50, + 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_8x8_coeff_hor[512] = { + 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, 17, 46, // 0 + 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, + 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, 86, 78, + 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, + 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, 32, 78, + 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, +-17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, -17, -71, +-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, + 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, 46, 86, // 8 + 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, +-85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, -85, -17, + 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, + 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, 60, 71, +-46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, + 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, 32, 85, +-17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, + 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, 71, 32, // 16 +-86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, -86, 17, + 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, 78, -60, +-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, + 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, 78, -17, +-60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, -60, 86, +-46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, -46, -32, + 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, + 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, 85, -60, // 24 + 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, +-71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, -71, 86, +-78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, -78, 46, + 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, 86, -85, + 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, + 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, 60, -46, + 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_8x8_coeff_hor[512] = { + 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 + 78, 71, 78, 71, 78, 71, 78, 71, 
78, 71, 78, 71, 78, 71, 78, 71, + 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, + 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, + 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, + 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, +-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, +-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, + 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, // 8 +-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, +-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, + 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, + 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, +-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, + 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, +-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, + 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, // 16 +-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, + 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, +-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, + 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, + 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, +-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, + 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, + 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, // 24 + 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, +-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, +-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, + 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, + 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, + 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, + 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, +}; + + +ALIGNED(32) const int16_t ff_dct2_8x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_8x16_coeff_ver[256] = { + 8, 17, 25, 
48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_8x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + +ALIGNED(32) const int16_t ff_dct2_8x16_butterfly_o_row_coeff_hor[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 
75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; + + + const int16_t* fi_dct2_8x16_coeff_hor = fi_dct2_8x8_coeff_hor; + + const int16_t* fi_dst7_8x16_coeff_hor = fi_dst7_8x8_coeff_hor; + + const int16_t* fi_dct8_8x16_coeff_hor = fi_dct8_8x8_coeff_hor; + + +ALIGNED(32) const int16_t fi_dct2_8x16_coeff_ver[2048] = { + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, // 0 + 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, 89, 87, + 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, 83, 80, + 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, 75, 70, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, + 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, 50, 43, + 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, 36, 25, + 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, 18, 9, + 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, 64, 87, // 8 + 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, 75, 57, + 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, 36, 9, +-18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, -18, -43, +-64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, -64, -80, +-89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, -89, -90, +-83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, -83, -70, +-50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, -50, -25, + 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, 64, 80, // 16 + 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, 50, 9, +-36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, -36, -70, +-89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, -89, -87, +-64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, -64, -25, + 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, 18, 57, + 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, 83, 90, + 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, 75, 43, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, // 24 + 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, 18, -43, +-83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, -83, -87, +-50, 9, -50, 9, -50, 9, -50, 9, -50, 9, -50, 9, -50, 9, -50, 9, + 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, 64, 90, + 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, 75, 25, +-36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, -36, -80, +-89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, -89, -57, + 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, 64, 57, // 32 +-18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, -18, -80, +-83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, -83, -25, + 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, 50, 90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, +-75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, -75, -87, +-36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, -36, 43, + 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, 89, 70, + 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, 64, 43, // 40 +-50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, -50, -90, +-36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, -36, 57, + 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, 89, 25, +-64, -87, -64, -87, -64, -87, -64, -87, 
-64, -87, -64, -87, -64, -87, -64, -87, +-18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, -18, 70, + 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, 83, 9, +-75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, -75, -80, + 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, 64, 25, // 48 +-75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, -75, -70, + 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, 36, 90, + 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, 18, -80, +-64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, -64, 43, + 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, 89, 9, +-83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, -83, -57, + 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, 50, 87, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, // 56 +-89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, -89, -25, + 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, 83, 43, +-75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, -75, -57, + 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, 64, 70, +-50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, -50, -80, + 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, 36, 87, +-18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, -18, -90, + 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, 64, -9, // 64 +-89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, -89, 25, + 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, 83, -43, +-75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, -75, 57, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, +-50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, -50, 80, + 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, 36, -87, +-18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, -18, 90, + 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, 64, -25, // 72 +-75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, -75, 70, + 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, 36, -90, + 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, 18, 80, +-64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, -64, -43, + 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, 89, -9, +-83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, -83, 57, + 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, 50, -87, + 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, 64, -43, // 80 +-50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, -50, 90, +-36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, -36, -57, + 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, 89, -25, +-64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, -64, 87, +-18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, -18, -70, + 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, 83, -9, +-75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, -75, 80, + 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, // 88 +-18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, -18, 80, +-83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, -83, 25, + 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, 50, -90, + 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, 64, 9, +-75, 87, 
-75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, -75, 87, +-36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, -36, -43, + 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, 89, -70, + 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, 64, -70, // 96 + 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, 18, 43, +-83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, -83, 87, +-50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, -50, -9, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, + 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, 75, -25, +-36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, -36, 80, +-89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, -89, 57, + 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, 64, -80, // 104 + 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, 50, -9, +-36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, -36, 70, +-89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, -89, 87, +-64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, -64, 25, + 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, 18, -57, + 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, 83, -90, + 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, 75, -43, + 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, 64, -87, // 112 + 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, 75, -57, + 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, 36, -9, +-18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, -18, 43, +-64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, -64, 80, +-89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, -89, 90, +-83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, -83, 70, +-50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, -50, 25, + 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, 64, -90, // 120 + 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, 89, -87, + 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, 83, -80, + 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, 75, -70, + 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, 64, -57, + 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, 50, -43, + 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, 36, -25, + 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_8x16_coeff_ver[2048] = { + 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, 8, 25, // 0 + 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, 40, 55, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, 88, 87, + 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, 81, 73, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, + 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, 33, 17, + 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, 17, 48, // 8 + 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, 73, 87, + 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, 88, 77, + 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, 55, 25, + -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, -8, -40, +-68, -85, -68, -85, -68, -85, -68, -85, -68, -85, 
-68, -85, -68, -85, -68, -85, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, -62, -33, + 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, // 16 + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, -48, -81, +-88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, -88, -68, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, + 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, 33, 81, // 24 + 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, 85, 40, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, 17, 73, + 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, 88, 55, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, +-88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, -88, -62, + 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, 40, 88, // 32 + 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, 62, -17, +-81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, -81, -77, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, + 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, 87, 33, +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, -55, 25, + 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, 85, 73, + 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, 48, 88, // 40 + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, 81, 68, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, +-48, 48, -48, 48, -48, 48, -48, 48, -48, 48, -48, 48, -48, 48, -48, 48, + 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, 88, 25, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, + 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, 55, 81, // 48 +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, +-85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, -85, 8, + 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, 88, 33, +-73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, -73, -68, + 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, 40, 87, + 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, 62, 68, // 56 +-55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, -55, -73, + 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, 48, 77, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, + 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, 33, 85, +-25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, -25, -87, + 17, 88, 17, 88, 17, 88, 17, 
88, 17, 88, 17, 88, 17, 88, 17, 88, + -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, -8, -88, + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, // 64 +-81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, -81, -25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, -88, 25, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, +-68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, -68, 68, + 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, 48, -81, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, + 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, 73, 25, // 72 +-88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, -88, 33, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, -40, -62, + 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, 81, 8, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, 55, -85, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 80 +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, 0, -77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, -77, 77, + 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, // 88 +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, 48, 25, +-81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, -81, 81, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, + 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, 88, -68, + 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, 85, -48, // 96 + -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, -8, 62, +-88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, -88, 77, +-33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, -33, -25, + 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, 73, -88, + 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, 68, -17, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, +-87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, -87, 55, + 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, 87, -68, // 104 + 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, 33, 8, +-48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, -48, 77, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, -55, 17, + 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, 25, -62, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, -40, 73, 
-40, + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, // 112 + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, -81, 68, +-48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, -48, 25, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 120 + 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, 87, -85, + 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, 81, -77, + 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, 73, -68, + 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, 62, -55, + 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, 48, -40, + 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, 33, -25, + 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_8x16_coeff_ver[2048] = { + 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, // 0 + 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, 87, 85, + 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, 81, 77, + 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, 73, 68, + 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, 62, 55, + 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, 48, 40, + 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, 33, 25, + 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, 17, 8, + 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, 88, 81, // 8 + 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, 68, 48, + 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, 25, 0, +-25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, +-68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, +-88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, -88, +-81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, -81, -68, +-48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, -48, -25, + 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, 87, 68, // 16 + 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, 33, -8, +-48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, -48, -77, +-88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, -88, -81, +-55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, -55, -17, + 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, 25, 62, + 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, 85, 88, + 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, 73, 40, + 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, 85, 48, // 24 + -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, -8, -62, +-88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, -88, -77, +-33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, -33, 25, + 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, 73, 88, + 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, 68, 17, +-40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, -40, -81, +-87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, -87, -55, + 81, 25, 81, 
25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, 81, 25, // 32 +-48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, -48, -88, +-68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, -68, 0, + 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, + 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, +-81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, -81, +-25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, -25, 48, + 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, 88, 68, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, // 40 +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, + 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, 77, 0, +-77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, -77, + 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, 73, -25, // 48 +-88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, -88, -33, + 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, 68, 77, +-17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, -17, -88, +-40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, -40, 62, + 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, 81, -8, +-87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, -87, -48, + 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, 55, 85, + 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, 68, -48, // 56 +-81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, -81, 25, + 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, 88, 0, +-88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, + 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, +-68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, -68, + 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, 48, 81, +-25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, -25, -88, + 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, 62, -68, // 64 +-55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, -55, 73, + 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, 48, -77, +-40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, -40, 81, + 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, 33, -85, +-25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, -25, 87, + 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, 17, -88, + -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, -8, 88, + 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, 55, -81, // 72 +-17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, -17, 88, +-25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, -25, -77, + 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, 62, 48, +-85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, -85, -8, + 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, 88, -33, +-73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, -73, 68, + 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, 40, -87, + 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, 48, -88, // 80 + 25, 68, 
25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, 25, 68, +-81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, -81, 0, + 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, +-25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, +-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, + 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, 88, -25, +-68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, -68, 81, + 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, 40, -88, // 88 + 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, 62, 17, +-81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, -81, 77, + -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, -8, -68, + 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, 87, -33, +-48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, -48, 88, +-55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, -55, -25, + 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, 85, -73, + 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, 33, -81, // 96 + 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, 85, -40, +-25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, -25, 77, +-87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, -87, 48, + 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, 17, -73, + 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, 88, -55, + -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, -8, 68, +-88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, -88, 62, + 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, 25, -68, // 104 + 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, 88, -81, + 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, 48, 0, +-48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, +-88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, +-25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, + 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, 68, -88, + 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, 81, -48, + 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, 17, -48, // 112 + 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, 73, -87, + 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, 88, -77, + 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, 55, -25, + -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, -8, 40, +-68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, -68, 85, +-88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, -88, 81, +-62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, -62, 33, + 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, 8, -25, // 120 + 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, 40, -55, + 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, 68, -77, + 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, 85, -88, + 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, 88, -87, + 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, 81, -73, + 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, 62, -48, + 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, 33, -17, +}; + + +ALIGNED(32) const int16_t ff_dct2_8x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 
88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 
36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_8x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, 
-90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_8x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, 
-63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + const int16_t* fi_dct2_8x32_coeff_hor = fi_dct2_8x8_coeff_hor; + + const int16_t* fi_dst7_8x32_coeff_hor = fi_dst7_8x8_coeff_hor; + 
+ const int16_t* fi_dct8_8x32_coeff_hor = fi_dct8_8x8_coeff_hor; + + +// 16xN +ALIGNED(32) const int16_t ff_dct2_16xN_coeff_hor[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 64, -64, 57, -80, 50, -89, 43, -90, + 64, 64, 80, 70, 50, 18, 9, -43, -64, 64, -25, 90, 18, 75, 57, 25, + 64, 64, 57, 43, -18, -50, -80, -90, 64, -64, -9, -87, -75, -18, -87, 70, + 64, 64, 25, 9, -75, -89, -70, -25, -64, 64, 43, 70, 89, -50, 9, -80, + 64, 64, -9, -25, -89, -75, 25, 70, 64, -64, -70, -43, -50, 89, 80, -9, + 64, 64, -43, -57, -50, -18, 90, 80, -64, 64, 87, 9, -18, -75, -70, 87, + 64, 64, -70, -80, 18, 50, 43, -9, 64, -64, -90, 25, 75, 18, -25, -57, + 64, 64, -87, -90, 75, 89, -57, -87, -64, 64, 80, -57, -89, 50, 90, -43, + 83, 36, 80, 9, 75, -18, 70, -43, 36, -83, 25, -70, 18, -50, 9, -25, +-36, -83, -70, -87, -89, -50, -87, 9, 83, -36, 90, -80, 75, -89, 43, -57, +-83, -36, -25, 57, 50, 89, 90, 25, -36, 83, 43, 9, 89, -75, 70, -80, + 36, 83, 90, 43, 18, -75, -80, -57, -83, 36, -57, 87, 50, -18, 87, -90, + 83, 36, -43, -90, -75, 18, 57, 80, 36, -83, -87, 57, -18, 50, 90, -87, +-36, -83, -57, 25, 89, 50, -25, -90, 83, -36, -9, -43, -75, 89, 80, -70, +-83, -36, 87, 70, -50, -89, -9, 87, -36, 83, 80, -90, -89, 75, 57, -43, + 36, 83, -9, -80, -18, 75, 43, -70, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_16xN_coeff_hor[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 88, -8, 87, -40, 81, -68, 73, -85, // 0 + 25, 33, 68, 81, 88, 85, 81, 40, -88, 17, -68, 73, -25, 88, 25, 55, + 40, 48, 88, 88, 62, 25, -17, -68, 87, -25, 33, -88, -48, -48, -88, 48, + 55, 62, 81, 68, -17, -55, -88, -73, -85, 33, 8, 85, 88, -25, 33, -87, + 68, 73, 48, 25, -81, -88, -25, 33, 81, -40, -48, -62, -68, 81, 68, 8, + 77, 81, 0, -25, -77, -48, 77, 88, -77, 48, 77, 25, 0, -81, -77, 81, + 85, 87, -48, -68, -8, 33, 62, 8, 73, -55, -88, 17, 68, 25, -17, -62, + 88, 88, -81, -88, 68, 87, -48, -85, -68, 62, 81, -55, -88, 48, 88, -40, + 68, 88, 77, 77, 85, 55, 88, 25, 62, -88, 48, -81, 33, -62, 17, -33, // 8 + 48, -25, 0, -77, -48, -87, -81, -48, 68, -8, 88, -68, 81, -88, 48, -62, +-81, -81, -77, 0, -8, 81, 68, 68, -55, 88, 25, 25, 85, -68, 73, -81, +-25, 48, 77, 77, 62, -40, -48, -81, -73, 17, -68, 88, 40, -8, 87, -88, + 88, 68, 0, -77, -88, -17, 25, 88, 48, -87, -81, 48, -25, 55, 88, -85, + 0, -68, -77, 0, 77, 68, 0, -88, 77, -25, 0, -48, -77, 88, 77, -68, +-88, -48, 77, 77, -33, -88, -25, 81, -40, 85, 81, -88, -87, 73, 55, -40, + 25, 81, 0, -77, -25, 73, 48, -68, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_16xN_coeff_hor[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 62, -68, 55, -81, 48, -88, 40, -88, // 0 + 87, 85, 68, 48, 33, -8, -8, -62, -55, 73, -17, 88, 25, 68, 62, 17, + 81, 77, 25, 0, -48, -77, -88, -77, 48, -77, -25, -77, -81, 0, -81, 77, + 73, 68, -25, -48, -88, -81, -33, 25, -40, 81, 62, 48, 81, -68, -8, -68, + 62, 55, -68, -81, -55, -17, 73, 88, 33, -85, -85, -8, -25, 88, 87, -33, + 48, 40, -88, -88, 25, 62, 68, 17, -25, 87, 88, -33, -48, -48, -48, 88, + 33, 25, -81, -68, 85, 88, -40, -81, 17, -88, -73, 68, 88, -25, -55, -25, + 17, 8, -48, -25, 73, 40, -87, -55, -8, 88, 40, -87, -68, 81, 85, -73, + 81, 25, 77, 0, 73, -25, 68, -48, 33, -81, 25, -68, 17, -48, 8, -25, // 8 +-48, -88, -77, -77, -88, -33, -81, 25, 85, -40, 88, -81, 73, -87, 40, -55, +-68, 0, 0, 77, 68, 77, 88, 0, -25, 77, 48, 0, 88, -77, 68, -77, + 68, 88, 77, 0, -17, -88, -88, -25, -87, 48, -48, 81, 55, -25, 85, -88, + 48, -25, -77, -77, -40, 62, 81, 48, 17, -73, -88, 68, -8, 40, 88, -87, +-81, -81, 0, 77, 81, -8, -68, 
-68, 88, -55, -25, -25, -68, 85, 81, -73, +-25, 48, 77, 0, -87, -48, 48, 81, -8, 68, 68, -88, -88, 81, 62, -48, + 88, 68, -77, -77, 55, 85, -25, -88, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + + const int16_t* ff_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = { + 64, 90, 89, 87, 83, 80, 75, 70, 64, 90, 89, 87, 83, 80, 75, 70, // 0 + 64, 57, 50, 43, 36, 25, 18, 9, 64, 57, 50, 43, 36, 25, 18, 9, + 64, 87, 75, 57, 36, 9, -18, -43, 64, 87, 75, 57, 36, 9, -18, -43, +-64, -80, -89, -90, -83, -70, -50, -25, -64, -80, -89, -90, -83, -70, -50, -25, + 64, 80, 50, 9, -36, -70, -89, -87, 64, 80, 50, 9, -36, -70, -89, -87, +-64, -25, 18, 57, 83, 90, 75, 43, -64, -25, 18, 57, 83, 90, 75, 43, + 64, 70, 18, -43, -83, -87, -50, 9, 64, 70, 18, -43, -83, -87, -50, 9, + 64, 90, 75, 25, -36, -80, -89, -57, 64, 90, 75, 25, -36, -80, -89, -57, + 64, 57, -18, -80, -83, -25, 50, 90, 64, 57, -18, -80, -83, -25, 50, 90, // 8 + 64, -9, -75, -87, -36, 43, 89, 70, 64, -9, -75, -87, -36, 43, 89, 70, + 64, 43, -50, -90, -36, 57, 89, 25, 64, 43, -50, -90, -36, 57, 89, 25, +-64, -87, -18, 70, 83, 9, -75, -80, -64, -87, -18, 70, 83, 9, -75, -80, + 64, 25, -75, -70, 36, 90, 18, -80, 64, 25, -75, -70, 36, 90, 18, -80, +-64, 43, 89, 9, -83, -57, 50, 87, -64, 43, 89, 9, -83, -57, 50, 87, + 64, 9, -89, -25, 83, 43, -75, -57, 64, 9, -89, -25, 83, 43, -75, -57, + 64, 70, -50, -80, 36, 87, -18, -90, 64, 70, -50, -80, 36, 87, -18, -90, + 64, -9, -89, 25, 83, -43, -75, 57, 64, -9, -89, 25, 83, -43, -75, 57, // 16 + 64, -70, -50, 80, 36, -87, -18, 90, 64, -70, -50, 80, 36, -87, -18, 90, + 64, -25, -75, 70, 36, -90, 18, 80, 64, -25, -75, 70, 36, -90, 18, 80, +-64, -43, 89, -9, -83, 57, 50, -87, -64, -43, 89, -9, -83, 57, 50, -87, + 64, -43, -50, 90, -36, -57, 89, -25, 64, -43, -50, 90, -36, -57, 89, -25, +-64, 87, -18, -70, 83, -9, -75, 80, -64, 87, -18, -70, 83, -9, -75, 80, + 64, -57, -18, 80, -83, 25, 50, -90, 64, -57, -18, 80, -83, 25, 50, -90, + 64, 9, -75, 87, -36, -43, 89, -70, 64, 9, -75, 87, -36, -43, 89, -70, + 64, -70, 18, 43, -83, 87, -50, -9, 64, -70, 18, 43, -83, 87, -50, -9, // 24 + 64, -90, 75, -25, -36, 80, -89, 57, 64, -90, 75, -25, -36, 80, -89, 57, + 64, -80, 50, -9, -36, 70, -89, 87, 64, -80, 50, -9, -36, 70, -89, 87, +-64, 25, 18, -57, 83, -90, 75, -43, -64, 25, 18, -57, 83, -90, 75, -43, + 64, -87, 75, -57, 36, -9, -18, 43, 64, -87, 75, -57, 36, -9, -18, 43, +-64, 80, -89, 90, -83, 70, -50, 25, -64, 80, -89, 90, -83, 70, -50, 25, + 64, -90, 89, -87, 83, -80, 75, -70, 64, -90, 89, -87, 83, -80, 75, -70, + 64, -57, 50, -43, 36, -25, 18, -9, 64, -57, 50, -43, 36, -25, 18, -9, +}; + + const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + +ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = { + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, // 8 + 57, 
-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, // 16 + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, // 24 +-43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, // 32 +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, // 40 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, +-87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, // 48 +-70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, 43, -43, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, + 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, 9, -9, // 56 +-25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, -25, 25, + 43, -43, 43, -43, 43, -43, 43, 
-43, 43, -43, 43, -43, 43, -43, 43, -43, +-57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, -57, 57, + 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, 70, -70, +-80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, -80, 80, + 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + + +ALIGNED(32) const int16_t ff_dct2_16x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t ff_dst7_16x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t ff_dct8_16x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_16x4_coeff_hor[1024] = { + 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, 64, 90, 89, 87, // 0 + 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, 83, 80, 75, 70, + 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, 64, 57, 50, 43, + 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, 36, 25, 18, 9, + 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, 64, 87, 75, 57, + 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, 36, 9, -18, -43, +-64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, -64, -80, -89, -90, +-83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, -83, -70, -50, -25, + 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, 64, 80, 50, 9, // 8 +-36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, -36, -70, -89, -87, +-64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, -64, -25, 18, 57, + 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, 83, 90, 75, 43, + 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, 64, 70, 18, -43, +-83, -87, -50, 9, -83, -87, -50, 9, -83, -87, -50, 9, -83, -87, -50, 9, + 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, 64, 90, 75, 25, +-36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, -36, -80, -89, -57, + 64, 57, -18, -80, 64, 57, -18, -80, 64, 
57, -18, -80, 64, 57, -18, -80, // 16 +-83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, -83, -25, 50, 90, + 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, 64, -9, -75, -87, +-36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, -36, 43, 89, 70, + 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, 64, 43, -50, -90, +-36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, -36, 57, 89, 25, +-64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, -64, -87, -18, 70, + 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, 83, 9, -75, -80, + 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, 64, 25, -75, -70, // 24 + 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, 36, 90, 18, -80, +-64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, -64, 43, 89, 9, +-83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, -83, -57, 50, 87, + 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, 64, 9, -89, -25, + 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, 83, 43, -75, -57, + 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, 64, 70, -50, -80, + 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, 36, 87, -18, -90, + 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, 64, -9, -89, 25, // 32 + 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, 83, -43, -75, 57, + 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, 64, -70, -50, 80, + 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, 36, -87, -18, 90, + 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, 64, -25, -75, 70, + 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, 36, -90, 18, 80, +-64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, -64, -43, 89, -9, +-83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, -83, 57, 50, -87, + 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, 64, -43, -50, 90, // 40 +-36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, -36, -57, 89, -25, +-64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, -64, 87, -18, -70, + 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, 83, -9, -75, 80, + 64, -57, -18, 80, 64, -57, -18, 80, 64, -57, -18, 80, 64, -57, -18, 80, +-83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, -83, 25, 50, -90, + 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, 64, 9, -75, 87, +-36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, -36, -43, 89, -70, + 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, 64, -70, 18, 43, // 48 +-83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, -83, 87, -50, -9, + 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, 64, -90, 75, -25, +-36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, -36, 80, -89, 57, + 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, 64, -80, 50, -9, +-36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, -36, 70, -89, 87, +-64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, -64, 25, 18, -57, + 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, 83, -90, 75, -43, + 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, 64, -87, 75, -57, // 56 + 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, 36, -9, -18, 43, +-64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, -64, 80, -89, 90, +-83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, -83, 70, -50, 25, + 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, 64, -90, 89, -87, + 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, 83, -80, 75, -70, + 64, -57, 50, -43, 64, -57, 50, -43, 64, -57, 50, -43, 64, -57, 50, -43, + 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, 36, -25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_16x4_coeff_hor[1024] = { + 8, 25, 
40, 55, 8, 25, 40, 55, 8, 25, 40, 55, 8, 25, 40, 55, // 0 + 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, 68, 77, 85, 88, + 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, 88, 87, 81, 73, + 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, 62, 48, 33, 17, + 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, 17, 48, 73, 87, + 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, 88, 77, 55, 25, + -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, -8, -40, -68, -85, +-88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, -88, -81, -62, -33, + 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, 25, 68, 88, 81, // 8 + 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, 48, 0, -48, -81, +-88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, -88, -68, -25, 25, + 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, 68, 88, 81, 48, + 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, 33, 81, 85, 40, +-25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, -25, -77, -87, -48, + 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, 17, 73, 88, 55, + -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, -8, -68, -88, -62, + 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, 40, 88, 62, -17, // 16 +-81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, -81, -77, -8, 68, + 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, 87, 33, -48, -88, +-55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, -55, 25, 85, 73, + 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, 48, 88, 25, -68, +-81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, -81, 0, 81, 68, +-25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, -25, -88, -48, 48, + 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, 88, 25, -68, -81, + 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, 55, 81, -17, -88, // 24 +-25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, -25, 77, 62, -48, +-85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, -85, 8, 88, 33, +-73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, -73, -68, 40, 87, + 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, 62, 68, -55, -73, + 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, 48, 77, -40, -81, + 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, 33, 85, -25, -87, + 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, 17, 88, -8, -88, + 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, 68, 48, -81, -25, // 32 + 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, 88, 0, -88, 25, + 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, 81, -48, -68, 68, + 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, 48, -81, -25, 88, + 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, 73, 25, -88, 33, + 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, 68, -77, -17, 88, +-40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, -40, -62, 81, 8, +-87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, -87, 48, 55, -85, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, // 40 + 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, 0, -77, 77, 0, +-77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, + 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, 77, 0, -77, 77, + 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, 81, -25, -48, 88, +-68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, -68, 0, 68, -88, + 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, 48, 25, -81, 81, +-25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, -25, -48, 88, -68, + 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, 85, -48, -8, 62, // 48 +-88, 77, -33, -25, -88, 77, -33, -25, -88, 77, -33, 
-25, -88, 77, -33, -25, + 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, 73, -88, 68, -17, +-40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, -40, 81, -87, 55, + 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, 87, -68, 33, 8, +-48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, -48, 77, -88, 81, +-55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, -55, 17, 25, -62, + 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, 85, -88, 73, -40, + 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, 88, -81, 68, -48, // 56 + 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, 25, 0, -25, 48, +-68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, -68, 81, -88, 88, +-81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, -81, 68, -48, 25, + 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, 88, -88, 87, -85, + 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, 81, -77, 73, -68, + 62, -55, 48, -40, 62, -55, 48, -40, 62, -55, 48, -40, 62, -55, 48, -40, + 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, 33, -25, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_16x4_coeff_hor[1024] = { + 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, 88, 88, 87, 85, // 0 + 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, 81, 77, 73, 68, + 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, 62, 55, 48, 40, + 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, 33, 25, 17, 8, + 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, 88, 81, 68, 48, + 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, 25, 0, -25, -48, +-68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, -68, -81, -88, -88, +-81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, -81, -68, -48, -25, + 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, 87, 68, 33, -8, // 8 +-48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, -48, -77, -88, -81, +-55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, -55, -17, 25, 62, + 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, 85, 88, 73, 40, + 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, 85, 48, -8, -62, +-88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, -88, -77, -33, 25, + 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, 73, 88, 68, 17, +-40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, -40, -81, -87, -55, + 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, 81, 25, -48, -88, // 16 +-68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, -68, 0, 68, 88, + 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, 48, -25, -81, -81, +-25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, -25, 48, 88, 68, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, 0, 77, 77, 0, +-77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, -77, -77, 0, 77, + 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, 77, 0, -77, -77, + 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, 73, -25, -88, -33, // 24 + 68, 77, -17, -88, 68, 77, -17, -88, 68, 77, -17, -88, 68, 77, -17, -88, +-40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, -40, 62, 81, -8, +-87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, -87, -48, 55, 85, + 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, 68, -48, -81, 25, + 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, 88, 0, -88, -25, + 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, 81, 48, -68, -68, + 48, 81, -25, -88, 48, 81, -25, -88, 48, 81, -25, -88, 48, 81, -25, -88, + 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, 62, -68, -55, 73, // 32 + 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, -40, 81, 48, -77, 
-40, 81, + 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, 33, -85, -25, 87, + 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, 17, -88, -8, 88, + 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, 55, -81, -17, 88, +-25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, -25, -77, 62, 48, +-85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, -85, -8, 88, -33, +-73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, -73, 68, 40, -87, + 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, 48, -88, 25, 68, // 40 +-81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, -81, 0, 81, -68, +-25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, -25, 88, -48, -48, + 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, 88, -25, -68, 81, + 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, 40, -88, 62, 17, +-81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, -81, 77, -8, -68, + 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, 87, -33, -48, 88, +-55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, -55, -25, 85, -73, + 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, 33, -81, 85, -40, // 48 +-25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, -25, 77, -87, 48, + 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, 17, -73, 88, -55, + -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, -8, 68, -88, 62, + 25, -68, 88, -81, 25, -68, 88, -81, 25, -68, 88, -81, 25, -68, 88, -81, + 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, 48, 0, -48, 81, +-88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, -88, 68, -25, -25, + 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, 68, -88, 81, -48, + 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, 17, -48, 73, -87, // 56 + 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, 88, -77, 55, -25, + -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, -8, 40, -68, 85, +-88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, -88, 81, -62, 33, + 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, 8, -25, 40, -55, + 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, 68, -77, 85, -88, + 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, 88, -87, 81, -73, + 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, 62, -48, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_16x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_16x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_16x4_coeff_ver[128] = { + 84, 74, 
84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t ff_dct2_16x8_coeff_ver[64] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50, + 64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89, + 64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75, + 64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_16x8_coeff_ver[64] = { + 17, 32, 46, 78, 71, 85, 85, 46, 86, -17, 78, -71, 60, -86, 32, -60, + 46, 60, 86, 71, 32, -46, -60, -78, -85, 32, -17, 85, 71, -17, 78, -86, + 71, 78, 32, -17, -86, -60, 17, 86, 78, -46, -60, -32, -46, 85, 85, -71, + 85, 86, -60, -85, 17, 78, 32, -71, -71, 60, 86, -46, -78, 32, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_16x8_coeff_ver[64] = { + 86, 85, 85, 60, 78, 17, 71, -32, 60, -71, 46, -86, 32, -78, 17, -46, + 78, 71, 17, -32, -60, -86, -86, -17, -46, 78, 32, 60, 85, -46, 71, -85, + 60, 46, -71, -86, -46, 32, 78, 60, 32, -85, -85, 17, -17, 71, 86, -78, + 32, 17, -78, -46, 85, 71, -46, -85, -17, 86, 71, -78, -86, 60, 60, -32, +}; + +ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_coeff_ver[128] = { + 64, 64, 89, 75, 83, 36, 75, -18, 64, 64, 89, 75, 83, 36, 75, -18, + 64, 64, 50, 18, -36, -83, -89, -50, 64, 64, 50, 18, -36, -83, -89, -50, + 64, 64, -18, -50, -83, -36, 50, 89, 64, 64, -18, -50, -83, -36, 50, 89, + 64, 64, -75, -89, 36, 83, 18, -75, 64, 64, -75, -89, 36, 83, 18, -75, + 64, -64, 50, -89, 36, -83, 18, -50, 64, -64, 50, -89, 36, -83, 18, -50, + -64, 64, 18, 75, 83, -36, 75, -89, -64, 64, 18, 75, 83, -36, 75, -89, + 64, -64, -75, -18, -36, 83, 89, -75, 64, -64, -75, -18, -36, 83, 89, -75, + -64, 64, 89, -50, -83, 36, 50, -18, -64, 64, 89, -50, -83, 36, 50, -18 +}; + +ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_ver[256] = { + 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, // 0 + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, // 8 +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, + 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, 18, -18, +-50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, -50, 50, + 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, 75, -75, +-89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, -89, 89, +}; 
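+
+// Usage sketch (illustrative only, not part of the transform kernels in
+// this patch): the "butterfly_o_row" tables above store each odd-row
+// coefficient as a (c, -c) pair repeated across the whole register. If a
+// source vector holds interleaved sample pairs (x[j], x[N-1-j]) in its
+// 16-bit lanes, one _mm256_madd_epi16 against such a row produces eight
+// 32-bit values c*(x[j] - x[N-1-j]), i.e. the odd-part butterfly products
+// without ever forming the O[] differences explicitly. The helper name
+// below is hypothetical and only demonstrates that intent; it reuses
+// truncate_avx2 for the usual debias-and-shift rounding.
+static INLINE __m256i butterfly_o_row_sketch(const __m256i v_src_pairs,
+  const int16_t* coeff_row, const __m256i debias, const int32_t shift)
+{
+  const __m256i v_coeff = _mm256_load_si256((const __m256i*)coeff_row);
+  // madd of (x[j], x[N-1-j]) with (c, -c) -> c*(x[j] - x[N-1-j]) per lane pair
+  const __m256i v_madd = _mm256_madd_epi16(v_src_pairs, v_coeff);
+  return truncate_avx2(v_madd, debias, shift);
+}
+// Note: since every row here is just one (c, -c) pair repeated, a possible
+// space saving would be to store each pair once as a packed 32-bit value
+// and broadcast it with _mm256_set1_epi32, shrinking these tables 8-fold.
+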
+ + + const int16_t* fi_dct2_16x8_coeff_hor = fi_dct2_8x16_coeff_ver; // Duplicate table. + + const int16_t* fi_dst7_16x8_coeff_hor = fi_dst7_8x16_coeff_ver; // Duplicate table. + + const int16_t* fi_dct8_16x8_coeff_hor = fi_dct8_8x16_coeff_ver; // Duplicate table. + + + const int16_t* fi_dct2_16x8_coeff_ver = fi_dct2_8x8_coeff_hor; // Duplicate table + + const int16_t* fi_dst7_16x8_coeff_ver = fi_dst7_8x8_coeff_hor; // Duplicate table + + const int16_t* fi_dct8_16x8_coeff_ver = fi_dct8_8x8_coeff_hor; // Duplicate table + + +ALIGNED(32) const int16_t ff_dct2_16x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_16x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_16x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 
48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_16x16_coeff_hor[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver; + + + const int16_t* fi_dct2_16x16_coeff_ver = fi_dct2_16x16_coeff_hor; + + const int16_t* fi_dst7_16x16_coeff_ver = fi_dst7_16x16_coeff_hor; + + const int16_t* fi_dct8_16x16_coeff_ver = ff_dct8_16x16_coeff_ver; + + +ALIGNED(32) const int16_t ff_dct2_16x32_butterfly_o_row_coeff_ver[4096] = { // TODO: change this to 32-bit combined coeff table at some point, these huge 
tables are getting out of hand + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 8 + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 16 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, // 24 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, // 32 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, // 40 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 
85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 48 + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 56 + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, // 64 + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 72 +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, // 80 + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, // 88 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 73, -73, 73, -73, 73, -73, 73, -73, 
73, -73, 73, -73, 73, -73, 73, -73, // 96 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, // 104 + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 112 +-54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 120 + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, // 128 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, // 136 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, 
-54, // 144 +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, // 152 + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, // 160 +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 168 + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, // 176 +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, +-67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, // 184 +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, // 192 +-78, 78, -78, 
78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, // 200 +-22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, // 208 +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, + 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, // 216 + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, // 224 +-38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, + 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, +-78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, + 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, // 232 +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, + 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, 4, -4, // 240 +-13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 13, -13, 
13, -13, 13, + 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, 22, -22, +-31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, -31, 31, + 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, 38, -38, +-46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, -46, 46, + 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, 54, -54, +-61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, -61, 61, + 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, 67, -67, // 248 +-73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, -73, 73, + 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, 78, -78, +-82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, -82, 82, + 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, 85, -85, +-88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, -88, 88, + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, +-90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, +}; + +ALIGNED(32) const int16_t ff_dct2_16x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, 
-9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_16x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, 
-60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_16x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 
78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, +-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, + 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40 +-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, +-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, + 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, +-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48 + 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, + 17, 
-88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, + 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, +-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, + 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56 +-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, + -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, + 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, +-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + + + const int16_t* fi_dct2_16x32_coeff_hor = fi_dct2_16x16_coeff_hor; + + const int16_t* fi_dst7_16x32_coeff_hor = fi_dst7_16x16_coeff_hor; + + const int16_t* fi_dct8_16x32_coeff_hor = ff_dct8_16x16_coeff_ver; + +// 32xN +ALIGNED(32) const int16_t ff_dct2_32xN_coeff_hor[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, + 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 
22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_32xN_coeff_hor[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 
74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12 + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14 +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16 + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18 +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24 + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26 +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20 + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22 +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28 + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30 +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 
38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_32xN_coeff_hor[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, 
-89, -82, 74, 50, -42, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +ALIGNED(32) const int16_t fi_dct2_32xN_coeff_hor[1024] = { +64, 90, 64, 90, 64, 88, 64, 85, 64, 82, 64, 78, 64, 73, 64, 67, 64, 61, 64, 54, 64, 46, 64, 38, 64, 31, 64, 22, 64, 13, 64, 4, // 0 + 64, -4, 64, -13, 64, -22, 64, -31, 64, -38, 64, -46, 64, -54, 64, -61, 64, -67, 64, -73, 64, -78, 64, -82, 64, -85, 64, -88, 64, -90, 64, -90, + 90, 90, 87, 82, 80, 67, 70, 46, 57, 22, 43, -4, 25, -31, 9, -54, -9, -73, -25, -85, -43, -90, -57, -88, -70, -78, -80, -61, -87, -38, -90, -13, // 2 +-90, 13, -87, 38, -80, 61, -70, 78, -57, 88, -43, 90, -25, 85, -9, 73, 9, 54, 25, 31, 43, 4, 57, -22, 70, -46, 80, -67, 87, -82, 90, -90, + 89, 88, 75, 67, 50, 31, 18, -13, -18, -54, -50, -82, -75, -90, -89, -78, -89, -46, -75, -4, -50, 38, -18, 73, 18, 90, 50, 85, 75, 61, 89, 22, // 4 + 89, -22, 75, -61, 50, -85, 18, -90, -18, -73, -50, -38, -75, 4, -89, 46, -89, 78, -75, 90, -50, 82, -18, 54, 18, 13, 50, -31, 75, -67, 89, -88, + 87, 85, 57, 46, 9, -13, -43, -67, -80, -90, -90, -73, -70, -22, -25, 38, 25, 82, 70, 88, 90, 54, 80, -4, 43, -61, -9, -90, -57, -78, -87, -31, // 6 +-87, 31, -57, 78, -9, 90, 43, 61, 80, 4, 90, -54, 70, -88, 25, -82, -25, -38, -70, 22, -90, 73, -80, 90, -43, 67, 9, 13, 57, -46, 87, -85, + 83, 82, 36, 22, -36, -54, -83, -90, -83, -61, -36, 13, 36, 78, 83, 85, 83, 31, 36, -46, -36, -90, -83, -67, -83, 4, -36, 73, 36, 88, 83, 38, // 8 + 83, -38, 36, -88, -36, -73, -83, -4, -83, 67, -36, 90, 36, 46, 83, -31, 83, -85, 36, -78, -36, -13, -83, 61, -83, 90, -36, 54, 36, -22, 83, -82, + 80, 78, 9, -4, -70, -82, -87, -73, -25, 13, 57, 85, 90, 67, 43, -22, -43, -88, -90, -61, -57, 31, 25, 90, 87, 54, 70, -38, -9, -90, -80, -46, // 10 +-80, 46, -9, 90, 70, 38, 87, -54, 25, -90, -57, -31, -90, 61, -43, 88, 43, 22, 90, -67, 57, -85, -25, -13, -87, 73, -70, 82, 9, 4, 80, -78, + 75, 73, -18, -31, -89, -90, -50, -22, 50, 78, 89, 67, 18, -38, -75, -90, -75, -13, 18, 82, 89, 61, 50, -46, -50, -88, -89, -4, -18, 85, 75, 54, // 12 + 75, -54, -18, -85, -89, 4, -50, 88, 50, 46, 89, -61, 18, -82, -75, 13, -75, 90, 18, 38, 89, -67, 50, -78, -50, 22, -89, 90, -18, 31, 75, -73, + 70, 67, -43, -54, -87, -78, 9, 38, 90, 85, 25, -22, -80, -90, -57, 4, 57, 90, 80, 13, -25, -88, -90, -31, -9, 82, 87, 46, 43, -73, -70, -61, // 14 +-70, 61, 43, 73, 87, -46, -9, -82, -90, 31, -25, 88, 80, -13, 57, -90, -57, -4, -80, 90, 25, 22, 90, -85, 9, 
-38, -87, 78, -43, 54, 70, -67, + 64, 61, -64, -73, -64, -46, 64, 82, 64, 31, -64, -88, -64, -13, 64, 90, 64, -4, -64, -90, -64, 22, 64, 85, 64, -38, -64, -78, -64, 54, 64, 67, // 16 + 64, -67, -64, -54, -64, 78, 64, 38, 64, -85, -64, -22, -64, 90, 64, 4, 64, -90, -64, 13, -64, 88, 64, -31, 64, -82, -64, 46, -64, 73, 64, -61, + 57, 54, -80, -85, -25, -4, 90, 88, -9, -46, -87, -61, 43, 82, 70, 13, -70, -90, -43, 38, 87, 67, 9, -78, -90, -22, 25, 90, 80, -31, -57, -73, // 18 +-57, 73, 80, 31, 25, -90, -90, 22, 9, 78, 87, -67, -43, -38, -70, 90, 70, -13, 43, -82, -87, 61, -9, 46, 90, -88, -25, 4, -80, 85, 57, -54, + 50, 46, -89, -90, 18, 38, 75, 54, -75, -90, -18, 31, 89, 61, -50, -88, -50, 22, 89, 67, -18, -85, -75, 13, 75, 73, 18, -82, -89, 4, 50, 78, // 20 + 50, -78, -89, -4, 18, 82, 75, -73, -75, -13, -18, 85, 89, -67, -50, -22, -50, 88, 89, -61, -18, -31, -75, 90, 75, -54, 18, -38, -89, 90, 50, -46, + 43, 38, -90, -88, 57, 73, 25, -4, -87, -67, 70, 90, 9, -46, -80, -31, 80, 85, -9, -78, -70, 13, 87, 61, -25, -90, -57, 54, 90, 22, -43, -82, // 22 +-43, 82, 90, -22, -57, -54, -25, 90, 87, -61, -70, -13, -9, 78, 80, -85, -80, 31, 9, 46, 70, -90, -87, 67, 25, 4, 57, -73, -90, 88, 43, -38, + 36, 31, -83, -78, 83, 90, -36, -61, -36, 4, 83, 54, -83, -88, 36, 82, 36, -38, -83, -22, 83, 73, -36, -90, -36, 67, 83, -13, -83, -46, 36, 85, // 24 + 36, -85, -83, 46, 83, 13, -36, -67, -36, 90, 83, -73, -83, 22, 36, 38, 36, -82, -83, 88, 83, -54, -36, -4, -36, 61, 83, -90, -83, 78, 36, -31, + 25, 22, -70, -61, 90, 85, -80, -90, 43, 73, 9, -38, -57, -4, 87, 46, -87, -78, 57, 90, -9, -82, -43, 54, 80, -13, -90, -31, 70, 67, -25, -88, // 26 +-25, 88, 70, -67, -90, 31, 80, 13, -43, -54, -9, 82, 57, -90, -87, 78, 87, -46, -57, 4, 9, 38, 43, -73, -80, 90, 90, -85, -70, 61, 25, -22, + 18, 13, -50, -38, 75, 61, -89, -78, 89, 88, -75, -90, 50, 85, -18, -73, -18, 54, 50, -31, -75, 4, 89, 22, -89, -46, 75, 67, -50, -82, 18, 90, // 28 + 18, -90, -50, 82, 75, -67, -89, 46, 89, -22, -75, -4, 50, 31, -18, -54, -18, 73, 50, -85, -75, 90, 89, -88, -89, 78, 75, -61, -50, 38, 18, -13, + 9, 4, -25, -13, 43, 22, -57, -31, 70, 38, -80, -46, 87, 54, -90, -61, 90, 67, -87, -73, 80, 78, -70, -82, 57, 85, -43, -88, 25, 90, -9, -90, // 30 + -9, 90, 25, -90, -43, 88, 57, -85, -70, 82, 80, -78, -87, 73, 90, -67, -90, 61, 87, -54, -80, 46, 70, -38, -57, 31, 43, -22, -25, 13, 9, -4, +}; + + +ALIGNED(32) const int16_t fi_dst7_32xN_coeff_hor[1024] = { + 4, 13, 9, 26, 13, 38, 17, 50, 21, 60, 26, 68, 30, 77, 34, 82, 38, 86, 42, 89, 46, 90, 50, 88, 53, 85, 56, 80, 60, 74, 63, 66, // 0 + 66, 56, 68, 46, 72, 34, 74, 21, 77, 9, 78, -4, 80, -17, 82, -30, 84, -42, 85, -53, 86, -63, 87, -72, 88, -78, 89, -84, 90, -87, 90, -90, + 21, 30, 42, 56, 60, 77, 74, 87, 84, 89, 89, 80, 89, 63, 84, 38, 74, 9, 60, -21, 42, -50, 21, -72, 0, -85, -21, -90, -42, -84, -60, -68, // 2 +-74, -46, -84, -17, -89, 13, -89, 42, -84, 66, -74, 82, -60, 90, -42, 86, -21, 74, 0, 53, 21, 26, 42, -4, 60, -34, 74, -60, 84, -78, 89, -88, + 38, 46, 68, 78, 86, 90, 88, 77, 74, 42, 46, -4, 9, -50, -30, -80, -63, -90, -84, -74, -90, -38, -78, 9, -53, 53, -17, 82, 21, 89, 56, 72, // 4 + 80, 34, 90, -13, 82, -56, 60, -84, 26, -88, -13, -68, -50, -30, -77, 17, -89, 60, -85, 85, -66, 87, -34, 66, 4, 26, 42, -21, 72, -63, 87, -86, + 53, 60, 85, 89, 85, 74, 53, 21, 0, -42, -53, -84, -85, -84, -85, -42, -53, 21, 0, 74, 53, 89, 85, 60, 85, 0, 53, -60, 0, -89, -53, -74, // 6 +-85, -21, -85, 42, -53, 84, 0, 84, 53, 42, 85, -21, 85, -74, 53, -89, 0, -60, -53, 0, -85, 60, -85, 89, -53, 74, 
0, 21, 53, -42, 85, -84, + 66, 72, 90, 86, 56, 34, -13, -46, -74, -89, -87, -63, -46, 13, 26, 78, 80, 82, 84, 21, 34, -56, -38, -90, -85, -53, -78, 26, -21, 84, 50, 77, // 8 + 88, 9, 72, -66, 9, -88, -60, -42, -90, 38, -63, 87, 4, 68, 68, -4, 89, -74, 53, -85, -17, -30, -77, 50, -86, 90, -42, 60, 30, -17, 82, -80, + 77, 80, 80, 72, 9, -17, -72, -86, -84, -60, -17, 34, 66, 90, 86, 46, 26, -50, -60, -89, -88, -30, -34, 63, 53, 85, 90, 13, 42, -74, -46, -78, // 10 +-90, 4, -50, 82, 38, 68, 89, -21, 56, -87, -30, -56, -87, 38, -63, 90, 21, 42, 85, -53, 68, -88, -13, -26, -82, 66, -74, 84, 4, 9, 78, -77, + 84, 86, 60, 46, -42, -63, -89, -78, -21, 21, 74, 90, 74, 26, -21, -77, -89, -66, -42, 42, 60, 87, 84, 4, 0, -85, -84, -50, -60, 60, 42, 80, // 12 + 89, -17, 21, -90, -74, -30, -74, 74, 21, 68, 89, -38, 42, -88, -60, -9, -84, 84, 0, 53, 84, -56, 60, -82, -42, 13, -89, 89, -21, 34, 74, -72, + 88, 90, 30, 13, -78, -87, -56, -26, 60, 84, 77, 38, -34, -78, -87, -50, 4, 72, 89, 60, 26, -63, -80, -68, -53, 53, 63, 77, 74, -42, -38, -82, // 14 +-86, 30, 9, 86, 90, -17, 21, -89, -82, 4, -50, 90, 66, 9, 72, -88, -42, -21, -85, 85, 13, 34, 90, -80, 17, -46, -84, 74, -46, 56, 68, -66, + 90, 89, -4, -21, -90, -84, 9, 42, 89, 74, -13, -60, -88, -60, 17, 74, 87, 42, -21, -84, -86, -21, 26, 89, 85, 0, -30, -89, -84, 21, 34, 84, // 16 + 82, -42, -38, -74, -80, 60, 42, 60, 78, -74, -46, -42, -77, 84, 50, 21, 74, -89, -53, 0, -72, 89, 56, -21, 68, -84, -60, 42, -66, 74, 63, -60, + 87, 85, -38, -53, -72, -53, 68, 85, 42, 0, -86, -85, -4, 53, 88, 53, -34, -85, -74, 0, 66, 85, 46, -53, -85, -53, -9, 85, 89, 0, -30, -85, // 18 +-77, 53, 63, 53, 50, -85, -84, 0, -13, 85, 90, -53, -26, -53, -78, 85, 60, 0, 53, -85, -82, 53, -17, 53, 90, -85, -21, 0, -80, 85, 56, -53, + 82, 78, -66, -77, -30, -4, 90, 80, -42, -74, -56, -9, 86, 82, -13, -72, -77, -13, 74, 84, 17, -68, -87, -17, 53, 85, 46, -66, -89, -21, 26, 86, // 20 + 68, -63, -80, -26, -4, 87, 84, -60, -63, -30, -34, 88, 90, -56, -38, -34, -60, 89, 85, -53, -9, -38, -78, 90, 72, -50, 21, -42, -88, 90, 50, -46, + 74, 68, -84, -88, 21, 46, 60, 30, -89, -84, 42, 78, 42, -17, -89, -56, 60, 90, 21, -60, -84, -13, 74, 77, 0, -85, -74, 34, 84, 42, -21, -87, // 22 +-60, 72, 89, -4, -42, -66, -42, 89, 89, -50, -60, -26, -21, 82, 84, -80, -74, 21, 0, 53, 74, -90, -84, 63, 21, 9, 60, -74, -89, 86, 42, -38, + 63, 56, -90, -87, 66, 80, -4, -38, -60, -21, 90, 72, -68, -90, 9, 68, 56, -17, -89, -42, 72, 82, -13, -86, -53, 53, 88, 4, -74, -60, 17, 88, // 24 + 50, -78, -87, 34, 77, 26, -21, -74, -46, 90, 86, -66, -78, 13, 26, 46, 42, -84, -85, 85, 80, -50, -30, -9, -38, 63, 84, -89, -82, 77, 34, -30, + 50, 42, -82, -74, 88, 89, -66, -84, 21, 60, 30, -21, -72, -21, 90, 60, -78, -84, 42, 89, 9, -74, -56, 42, 85, 0, -86, -42, 60, 74, -13, -89, // 26 +-38, 84, 77, -60, -90, 21, 74, 21, -34, -60, -17, 84, 63, -89, -87, 74, 84, -42, -53, 0, 4, 42, 46, -74, -80, 89, 89, -84, -68, 60, 26, -21, + 34, 26, -63, -50, 82, 68, -90, -82, 84, 89, -66, -88, 38, 80, -4, -66, -30, 46, 60, -21, -80, -4, 90, 30, -85, -53, 68, 72, -42, -84, 9, 90, // 28 + 26, -87, -56, 78, 78, -63, -89, 42, 86, -17, -72, -9, 46, 34, -13, -56, -21, 74, 53, -85, -77, 90, 88, -86, -87, 77, 74, -60, -50, 38, 17, -13, + 17, 9, -34, -17, 50, 26, -63, -34, 74, 42, -82, -50, 87, 56, -90, -63, 88, 68, -84, -74, 77, 78, -66, -82, 53, 85, -38, -87, 21, 89, -4, -90, // 30 +-13, 90, 30, -88, -46, 86, 60, -84, -72, 80, 80, -77, -86, 72, 90, -66, -89, 60, 85, -53, -78, 46, 68, -38, -56, 30, 42, -21, -26, 13, 9, -4, +}; + + +ALIGNED(32) 
const int16_t fi_dct8_32xN_coeff_hor[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 
89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; + +const int16_t ff_dct8_4x32_coeff_ver[1024] = { +90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, // 0 + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, -42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, // 2 +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, -77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, // 4 + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, -30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, // 6 +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, -85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, // 8 + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, -63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, // 10 +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, -60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, // 12 + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, -21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, // 14 +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, -87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 
84, 56, 21, -72, -89, -53, 0, 74, 89, 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, // 16 + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, 17, -74, -88, 60, -13, 60, 89, -74, 9, -42, -90, 84, -4, 21, 90, -89, + 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0, -78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53, // 18 +-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85, 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85, + 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, -38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63, // 20 + 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13, -13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78, + 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21, 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72, // 22 +-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90, -89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68, + 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78, // 24 + 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17, 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56, + 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42, -87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84, // 26 +-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84, 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42, + 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, -13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87, // 28 + 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46, -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26, + 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60, 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90, // 30 + -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68, -90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9, +}; +const int16_t ff_dst7_4x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, // 0 + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 74, -84, 68, -88, 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, // 2 +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, -74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, // 4 + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, -60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, -46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, // 6 +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, -68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 
0, 21, 74, 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, // 8 + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, // 10 +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, -85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, // 12 + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, -53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, -21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, // 14 +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, -74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, // 16 + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, // 18 +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, -90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, // 20 + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, -46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, // 22 +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, -78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, // 24 + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, -17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, // 26 +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, -86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, // 28 + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, -38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, // 30 +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, -82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + + const int16_t* ff_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table + + + 
const int16_t* fi_dct2_32x2_coeff_ver = ff_dct2_2xN_coeff_hor; + + +ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_eo_row_coeff_hor[512] = { + 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, 90, 90, 87, 87, // 0 + 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, 80, 80, 70, 70, + 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, 57, 57, 43, 43, + 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9, 25, 25, 9, 9, + 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57, 87, 87, 57, 57, + 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43, 9, 9, -43, -43, +-80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90, -80, -80, -90, -90, +-70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25, -70, -70, -25, -25, + 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, 80, 80, 9, 9, // 8 +-70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87, -70, -70, -87, -87, +-25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57, -25, -25, 57, 57, + 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43, 90, 90, 43, 43, + 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43, 70, 70, -43, -43, +-87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9, -87, -87, 9, 9, + 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25, 90, 90, 25, 25, +-80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57, -80, -80, -57, -57, + 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, 57, 57, -80, -80, // 16 +-25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90, -25, -25, 90, 90, + -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87, -9, -9, -87, -87, + 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70, 43, 43, 70, 70, + 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90, 43, 43, -90, -90, + 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25, 57, 57, 25, 25, +-87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70, -87, -87, 70, 70, + 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80, 9, 9, -80, -80, + 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, 25, 25, -70, -70, // 24 + 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80, 90, 90, -80, -80, + 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9, 43, 43, 9, 9, +-57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87, -57, -57, 87, 87, + 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25, 9, 9, -25, -25, + 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57, 43, 43, -57, -57, + 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80, 70, 70, -80, -80, + 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, 87, 87, -90, -90, +}; + +ALIGNED(32) const int16_t ff_dct2_32x4_butterfly_o_row_coeff_hor[2048] = { // TODO: change this to 32-bit combined coeff table at some point, these huge tables are getting out of hand + 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0 + 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, 88, -88, 85, -85, + 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, 82, -82, 78, -78, + 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67, 73, -73, 67, -67, + 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54, 61, -61, 54, -54, + 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38, 46, -46, 38, -38, + 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22, 31, -31, 22, -22, + 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4, 13, -13, 4, -4, + 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, 90, -90, 82, -82, // 8 + 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46, 67, -67, 46, -46, + 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4, 22, -22, -4, 4, +-31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54, -31, 31, -54, 54, +-73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 85, -73, 73, -85, 
85, +-90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88, -90, 90, -88, 88, +-78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61, -78, 78, -61, 61, +-38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13, -38, 38, -13, 13, + 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, 88, -88, 67, -67, // 16 + 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, -13, 13, 31, -31, -13, 13, +-54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82, -54, 54, -82, 82, +-90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78, -90, 90, -78, 78, +-46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4, -46, 46, -4, 4, + 38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, 38, -38, 73, -73, + 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, 90, -90, 85, -85, + 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, 61, -61, 22, -22, + 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, 85, -85, 46, -46, // 24 +-13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, -13, 13, -67, 67, +-90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, -90, 90, -73, 73, +-22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, -22, 22, 38, -38, + 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, 82, -82, 88, -88, + 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, 54, -54, -4, 4, +-61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, -61, 61, -90, 90, +-78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, -78, 78, -31, 31, + 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, 82, -82, 22, -22, // 32 +-54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, -54, 54, -90, 90, +-61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, -61, 61, 13, -13, + 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, 78, -78, 85, -85, + 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, 31, -31, -46, 46, +-90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, -90, 90, -67, 67, + 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, 4, -4, 73, -73, + 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, 88, -88, 38, -38, + 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, 78, -78, -4, 4, // 40 +-82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, -82, 82, -73, 73, + 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, 13, -13, 85, -85, + 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, 67, -67, -22, 22, +-88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, -88, 88, -61, 61, + 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, 31, -31, 90, -90, + 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, 54, -54, -38, 38, +-90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, -90, 90, -46, 46, + 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, 73, -73, -31, 31, // 48 +-90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, -90, 90, -22, 22, + 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, 78, -78, 67, -67, +-38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, -38, 38, -90, 90, +-13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, -13, 13, 82, -82, + 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, 61, -61, -46, 46, +-88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, -88, 88, -4, 4, + 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, 85, -85, 54, -54, + 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, 67, -67, -54, 54, // 56 +-78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, -78, 78, 38, -38, + 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, 85, -85, -22, 22, +-90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, -90, 90, 4, -4, + 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, 90, -90, 13, -13, +-88, 88, -31, 31, -88, 88, -31, 
31, -88, 88, -31, 31, -88, 88, -31, 31, + 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, 82, -82, 46, -46, +-73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, -73, 73, -61, 61, + 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, 61, -61, -73, 73, // 64 +-46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, -46, 46, 82, -82, + 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, 31, -31, -88, 88, +-13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, -13, 13, 90, -90, + -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, -4, 4, -90, 90, + 22, -22, 85, -85, 22, -22, 85, -85, 22, -22, 85, -85, 22, -22, 85, -85, +-38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, -38, 38, -78, 78, + 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, 54, -54, 67, -67, + 54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, 54, -54, -85, 85, // 72 + -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, -4, 4, 88, -88, +-46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, -46, 46, -61, 61, + 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, 82, -82, 13, -13, +-90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, -90, 90, 38, -38, + 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, 67, -67, -78, 78, +-22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, -22, 22, 90, -90, +-31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, -31, 31, -73, 73, + 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, 46, -46, -90, 90, // 80 + 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, 38, -38, 54, -54, +-90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, -90, 90, 31, -31, + 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, 61, -61, -88, 88, + 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, 22, -22, 67, -67, +-85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, -85, 85, 13, -13, + 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, 73, -73, -82, 82, + 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, 4, -4, 78, -78, + 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, 38, -38, -88, 88, // 88 + 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, 73, -73, -4, 4, +-67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, -67, 67, 90, -90, +-46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, -46, 46, -31, 31, + 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, 85, -85, -78, 78, + 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, 13, -13, 61, -61, +-90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, -90, 90, 54, -54, + 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, 22, -22, -82, 82, + 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, 31, -31, -78, 78, // 96 + 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, 90, -90, -61, 61, + 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, 4, -4, 54, -54, +-88, 88, 82, -82, -88, 88, 82, -82, -88, 88, 82, -82, -88, 88, 82, -82, +-38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, -38, 38, -22, 22, + 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, 73, -73, -90, 90, + 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, 67, -67, -13, 13, +-46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, -46, 46, 85, -85, + 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, 22, -22, -61, 61, // 104 + 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, 85, -85, -90, 90, + 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, 73, -73, -38, 38, + -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, -4, 4, 46, -46, +-78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, -78, 78, 90, -90, +-82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 54, -54, -82, 82, 
54, -54, +-13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, -13, 13, -31, 31, + 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, 67, -67, -88, 88, + 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, 13, -13, -38, 38, // 112 + 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, 61, -61, -78, 78, + 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, 88, -88, -90, 90, + 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, 85, -85, -73, 73, + 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, 54, -54, -31, 31, + 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 22, -22, 4, -4, 22, -22, +-46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, -46, 46, 67, -67, +-82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, -82, 82, 90, -90, + 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, 4, -4, -13, 13, // 120 + 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, 22, -22, -31, 31, + 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, 38, -38, -46, 46, + 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, 54, -54, -61, 61, + 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, 67, -67, -73, 73, + 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, 78, -78, -82, 82, + 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, -88, 88, 85, -85, -88, 88, + 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, 90, -90, -90, 90, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x4_coeff_ver[128] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, +}; + +ALIGNED(32) const int16_t ff_dst7_32x4_coeff_ver[128] = { + 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, + 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, + 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, 55, -84, + 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, + 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, 0, -74, +-74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, + 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, 74, -29, +}; + +ALIGNED(32) const int16_t ff_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t fi_dct2_32x4_coeff_ver[128] = { + 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, + 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, +-64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, 
-83, -64, -83, -64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +-64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, + 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, + 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, +}; + +ALIGNED(32) const int16_t fi_dst7_32x4_coeff_ver[128] = { + 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, 29, 74, + 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, 84, 55, + 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, 55, 74, +-29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, -29, -84, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, -74, 74, + 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, 84, -74, + 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, 55, -29, +}; + +ALIGNED(32) const int16_t fi_dct8_32x4_coeff_ver[128] = { + 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, 84, 74, + 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, 55, 29, + 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, 74, 0, +-74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, -74, + 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, 55, -74, +-29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, -29, 84, + 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, 29, -74, + 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, 84, -55, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x8_coeff_ver[512] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 0 + 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, + 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, + 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, + 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, + 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, + 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 8 + 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, +-36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, +-89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, + 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, + 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 16 +-18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, -18, -50, +-83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, -83, -36, + 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, 50, 89, + 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, +-75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, -75, -18, +-36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, -36, 83, + 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, 89, -75, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 24 +-75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, -75, -89, 
+ 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, 36, 83, + 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, 18, -75, +-64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, -64, 64, + 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, 89, -50, +-83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, -83, 36, + 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, 50, -18, +}; + +ALIGNED(32) const int16_t ff_dst7_32x8_coeff_ver[512] = { + 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, // 0 + 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, 46, 78, + 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, + 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, 85, 46, + 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, + 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, 78, -71, + 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, + 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, 32, -60, + 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, // 8 + 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, 86, 71, + 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, +-60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, -60, -78, +-85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, +-17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, -17, 85, + 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, + 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, 78, -86, + 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, // 16 + 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, 32, -17, +-86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, + 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, 17, 86, + 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, +-60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, -60, -32, +-46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, + 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, 85, -71, + 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, // 24 +-60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, -60, -85, + 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, + 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, 32, -71, +-71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, + 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, 86, -46, +-78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, + 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, 46, -17, +}; + +ALIGNED(32) const int16_t ff_dct8_32x8_coeff_ver[512] = { + 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, 86, 85, // 0 + 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, 85, 60, + 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, 78, 17, + 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, 71, -32, + 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, 60, -71, + 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, 46, -86, + 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, 32, -78, + 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, 17, -46, + 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, 78, 71, // 8 + 17, -32, 
17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, 17, -32, +-60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, -60, -86, +-86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, -86, -17, +-46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, -46, 78, + 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, 32, 60, + 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, 85, -46, + 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, 71, -85, + 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, 60, 46, // 16 +-71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, -71, -86, +-46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, -46, 32, + 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, 78, 60, + 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, 32, -85, +-85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, -85, 17, +-17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, -17, 71, + 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, 86, -78, + 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, 32, 17, // 24 +-78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, -78, -46, + 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, 85, 71, +-46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, -46, -85, +-17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, -17, 86, + 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, 71, -78, +-86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, -86, 60, + 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, 60, -32, +}; + + +ALIGNED(32) const int16_t fi_dct2_32x8_coeff_ver[256] = { + 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, 64, 89, 83, 75, // 0 + 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, 64, 50, 36, 18, + 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, 64, 75, 36, -18, +-64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, -64, -89, -83, -50, + 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, 64, 50, -36, -89, +-64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, -64, 18, 83, 75, + 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, 64, 18, -83, -50, + 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, 64, 75, -36, -89, + 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, 64, -18, -83, 50, // 8 + 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, 64, -75, -36, 89, + 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, 64, -50, -36, 89, +-64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, -64, -18, 83, -75, + 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, 64, -75, 36, 18, +-64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, -64, 89, -83, 50, + 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, 64, -89, 83, -75, + 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, 64, -50, 36, -18, +}; + +ALIGNED(32) const int16_t fi_dst7_32x8_coeff_ver[256] = { + 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, 17, 46, 71, 85, // 0 + 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, 86, 78, 60, 32, + 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, 32, 78, 85, 46, +-17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, -17, -71, -86, -60, + 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, 46, 86, 32, -60, +-85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, -85, -17, 71, 78, + 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, 60, 71, -46, -78, + 32, 85, -17, -86, 32, 85, -17, -86, 32, 85, -17, -86, 
32, 85, -17, -86, + 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, 71, 32, -86, 17, // 8 + 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, 78, -60, -46, 85, + 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, 78, -17, -60, 86, +-46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, -46, -32, 85, -71, + 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, 85, -60, 17, 32, +-71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, -71, 86, -78, 46, + 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, 86, -85, 78, -71, + 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, 60, -46, 32, -17, +}; + +ALIGNED(32) const int16_t fi_dct8_32x8_coeff_ver[256] = { + 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, 86, 85, 78, 71, // 0 + 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, 60, 46, 32, 17, + 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, 85, 60, 17, -32, +-71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, -71, -86, -78, -46, + 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, 78, 17, -60, -86, +-46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, -46, 32, 85, 71, + 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, 71, -32, -86, -17, + 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, 78, 60, -46, -85, + 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, 60, -71, -46, 78, // 8 + 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, 32, -85, -17, 86, + 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, 46, -86, 32, 60, +-85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, -85, 17, 71, -78, + 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, 32, -78, 85, -46, +-17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, -17, 71, -86, 60, + 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, 17, -46, 71, -85, + 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, 86, -78, 60, -32, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x16_coeff_ver[256] = { + 64, 64, 90, 87, 89, 75, 87, 57, 83, 36, 80, 9, 75, -18, 70, -43, // 0 + 64, -64, 57, -80, 50, -89, 43, -90, 36, -83, 25, -70, 18, -50, 9, -25, + 64, 64, 80, 70, 50, 18, 9, -43, -36, -83, -70, -87, -89, -50, -87, 9, +-64, 64, -25, 90, 18, 75, 57, 25, 83, -36, 90, -80, 75, -89, 43, -57, + 64, 64, 57, 43, -18, -50, -80, -90, -83, -36, -25, 57, 50, 89, 90, 25, + 64, -64, -9, -87, -75, -18, -87, 70, -36, 83, 43, 9, 89, -75, 70, -80, + 64, 64, 25, 9, -75, -89, -70, -25, 36, 83, 90, 43, 18, -75, -80, -57, +-64, 64, 43, 70, 89, -50, 9, -80, -83, 36, -57, 87, 50, -18, 87, -90, + 64, 64, -9, -25, -89, -75, 25, 70, 83, 36, -43, -90, -75, 18, 57, 80, // 8 + 64, -64, -70, -43, -50, 89, 80, -9, 36, -83, -87, 57, -18, 50, 90, -87, + 64, 64, -43, -57, -50, -18, 90, 80, -36, -83, -57, 25, 89, 50, -25, -90, +-64, 64, 87, 9, -18, -75, -70, 87, 83, -36, -9, -43, -75, 89, 80, -70, + 64, 64, -70, -80, 18, 50, 43, -9, -83, -36, 87, 70, -50, -89, -9, 87, + 64, -64, -90, 25, 75, 18, -25, -57, -36, 83, 80, -90, -89, 75, 57, -43, + 64, 64, -87, -90, 75, 89, -57, -87, 36, 83, -9, -80, -18, 75, 43, -70, +-64, 64, 80, -57, -89, 50, 90, -43, -83, 36, 70, -25, -50, 18, 25, -9, +}; + +ALIGNED(32) const int16_t ff_dst7_32x16_coeff_ver[256] = { + 8, 17, 25, 48, 40, 73, 55, 87, 68, 88, 77, 77, 85, 55, 88, 25, // 0 + 88, -8, 87, -40, 81, -68, 73, -85, 62, -88, 48, -81, 33, -62, 17, -33, + 25, 33, 68, 81, 88, 85, 81, 40, 48, -25, 0, -77, -48, -87, -81, -48, +-88, 17, -68, 73, -25, 88, 25, 55, 68, -8, 88, -68, 81, -88, 48, -62, + 40, 48, 88, 88, 62, 25, -17, -68, -81, -81, -77, 0, -8, 81, 68, 68, + 87, -25, 33, -88, -48, -48, -88, 48, -55, 88, 25, 
25, 85, -68, 73, -81, + 55, 62, 81, 68, -17, -55, -88, -73, -25, 48, 77, 77, 62, -40, -48, -81, +-85, 33, 8, 85, 88, -25, 33, -87, -73, 17, -68, 88, 40, -8, 87, -88, + 68, 73, 48, 25, -81, -88, -25, 33, 88, 68, 0, -77, -88, -17, 25, 88, // 8 + 81, -40, -48, -62, -68, 81, 68, 8, 48, -87, -81, 48, -25, 55, 88, -85, + 77, 81, 0, -25, -77, -48, 77, 88, 0, -68, -77, 0, 77, 68, 0, -88, +-77, 48, 77, 25, 0, -81, -77, 81, 77, -25, 0, -48, -77, 88, 77, -68, + 85, 87, -48, -68, -8, 33, 62, 8, -88, -48, 77, 77, -33, -88, -25, 81, + 73, -55, -88, 17, 68, 25, -17, -62, -40, 85, 81, -88, -87, 73, 55, -40, + 88, 88, -81, -88, 68, 87, -48, -85, 25, 81, 0, -77, -25, 73, 48, -68, +-68, 62, 81, -55, -88, 48, 88, -40, -81, 33, 68, -25, -48, 17, 25, -8, +}; + +ALIGNED(32) const int16_t ff_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) const int16_t fi_dct2_32x16_coeff_ver[256] = { + 64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0 + 64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90, + 89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25, +-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87, + 83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43, + 83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80, + 75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57, +-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70, + 64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8 + 64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57, + 50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80, +-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43, + 36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87, + 36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25, + 18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90, +-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9, +}; + +ALIGNED(32) const int16_t fi_dst7_32x16_coeff_ver[256] = { + 8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0 + 68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88, + 40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73, +-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, 
-85, + 68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77, + 88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77, + 85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81, +-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68, + 88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8 + 81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55, + 81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87, +-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40, + 62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88, + 48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25, + 33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88, +-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8, +}; + +ALIGNED(32) const int16_t fi_dct8_32x16_coeff_ver[256] = { + 88, 88, 88, 81, 87, 68, 85, 48, 81, 25, 77, 0, 73, -25, 68, -48, // 0 + 62, -68, 55, -81, 48, -88, 40, -88, 33, -81, 25, -68, 17, -48, 8, -25, + 87, 85, 68, 48, 33, -8, -8, -62, -48, -88, -77, -77, -88, -33, -81, 25, +-55, 73, -17, 88, 25, 68, 62, 17, 85, -40, 88, -81, 73, -87, 40, -55, + 81, 77, 25, 0, -48, -77, -88, -77, -68, 0, 0, 77, 68, 77, 88, 0, + 48, -77, -25, -77, -81, 0, -81, 77, -25, 77, 48, 0, 88, -77, 68, -77, + 73, 68, -25, -48, -88, -81, -33, 25, 68, 88, 77, 0, -17, -88, -88, -25, +-40, 81, 62, 48, 81, -68, -8, -68, -87, 48, -48, 81, 55, -25, 85, -88, + 62, 55, -68, -81, -55, -17, 73, 88, 48, -25, -77, -77, -40, 62, 81, 48, // 8 + 33, -85, -85, -8, -25, 88, 87, -33, 17, -73, -88, 68, -8, 40, 88, -87, + 48, 40, -88, -88, 25, 62, 68, 17, -81, -81, 0, 77, 81, -8, -68, -68, +-25, 87, 88, -33, -48, -48, -48, 88, 88, -55, -25, -25, -68, 85, 81, -73, + 33, 25, -81, -68, 85, 88, -40, -81, -25, 48, 77, 0, -87, -48, 48, 81, + 17, -88, -73, 68, 88, -25, -55, -25, -8, 68, 68, -88, -88, 81, 62, -48, + 17, 8, -48, -25, 73, 40, -87, -55, 88, 68, -77, -77, 55, 85, -25, -88, + -8, 88, 40, -87, -68, 81, 85, -73, -88, 62, 81, -48, -62, 33, 33, -17, +}; + + +ALIGNED(32) const int16_t ff_dct2_32x32_coeff_ver[1024] = { + 64, 64, 90, 90, 90, 87, 90, 82, 89, 75, 88, 67, 87, 57, 85, 46, // 0 + 83, 36, 82, 22, 80, 9, 78, -4, 75, -18, 73, -31, 70, -43, 67, -54, + 64, -64, 61, -73, 57, -80, 54, -85, 50, -89, 46, -90, 43, -90, 38, -88, + 36, -83, 31, -78, 25, -70, 22, -61, 18, -50, 13, -38, 9, -25, 4, -13, + 64, 64, 88, 85, 80, 70, 67, 46, 50, 18, 31, -13, 9, -43, -13, -67, +-36, -83, -54, -90, -70, -87, -82, -73, -89, -50, -90, -22, -87, 9, -78, 38, +-64, 64, -46, 82, -25, 90, -4, 88, 18, 75, 38, 54, 57, 25, 73, -4, + 83, -36, 90, -61, 90, -80, 85, -90, 75, -89, 61, -78, 43, -57, 22, -31, + 64, 64, 82, 78, 57, 43, 22, -4, -18, -50, -54, -82, -80, -90, -90, -73, // 8 +-83, -36, -61, 13, -25, 57, 13, 85, 50, 89, 78, 67, 90, 25, 85, -22, + 64, -64, 31, -88, -9, -87, -46, -61, -75, -18, -90, 31, -87, 70, -67, 90, +-36, 83, 4, 54, 43, 9, 73, -38, 89, -75, 88, -90, 70, -80, 38, -46, + 64, 64, 73, 67, 25, 9, -31, -54, -75, -89, -90, -78, -70, -25, -22, 38, + 36, 83, 78, 85, 90, 43, 67, -22, 18, -75, -38, -90, -80, -57, -90, 4, +-64, 64, -13, 90, 43, 70, 82, 13, 89, -50, 61, -88, 9, -80, -46, -31, +-83, 36, -88, 82, -57, 87, -4, 46, 50, -18, 85, -73, 87, -90, 54, -61, + 64, 64, 61, 54, -9, -25, -73, -85, -89, -75, -46, -4, 25, 70, 82, 88, // 16 + 83, 36, 31, -46, -43, -90, -88, -61, -75, 18, -13, 82, 57, 80, 90, 13, + 64, -64, -4, -90, -70, -43, -90, 38, -50, 89, 22, 67, 80, -9, 85, -78, 
+ 36, -83, -38, -22, -87, 57, -78, 90, -18, 50, 54, -31, 90, -87, 67, -73, + 64, 64, 46, 38, -43, -57, -90, -88, -50, -18, 38, 73, 90, 80, 54, -4, +-36, -83, -90, -67, -57, 25, 31, 90, 89, 50, 61, -46, -25, -90, -88, -31, +-64, 64, 22, 85, 87, 9, 67, -78, -18, -75, -85, 13, -70, 87, 13, 61, + 83, -36, 73, -90, -9, -43, -82, 54, -75, 89, 4, 22, 80, -70, 78, -82, + 64, 64, 31, 22, -70, -80, -78, -61, 18, 50, 90, 85, 43, -9, -61, -90, // 24 +-83, -36, 4, 73, 87, 70, 54, -38, -50, -89, -88, -4, -9, 87, 82, 46, + 64, -64, -38, -78, -90, 25, -22, 90, 75, 18, 73, -82, -25, -57, -90, 54, +-36, 83, 67, -13, 80, -90, -13, -31, -89, 75, -46, 67, 57, -43, 85, -88, + 64, 64, 13, 4, -87, -90, -38, -13, 75, 89, 61, 22, -57, -87, -78, -31, + 36, 83, 88, 38, -9, -80, -90, -46, -18, 75, 85, 54, 43, -70, -73, -61, +-64, 64, 54, 67, 80, -57, -31, -73, -89, 50, 4, 78, 90, -43, 22, -82, +-83, 36, -46, 85, 70, -25, 67, -88, -50, 18, -82, 90, 25, -9, 90, -90, + 64, 64, -4, -13, -90, -87, 13, 38, 89, 75, -22, -61, -87, -57, 31, 78, // 32 + 83, 36, -38, -88, -80, -9, 46, 90, 75, -18, -54, -85, -70, 43, 61, 73, + 64, -64, -67, -54, -57, 80, 73, 31, 50, -89, -78, -4, -43, 90, 82, -22, + 36, -83, -85, 46, -25, 70, 88, -67, 18, -50, -90, 82, -9, 25, 90, -90, + 64, 64, -22, -31, -80, -70, 61, 78, 50, 18, -85, -90, -9, 43, 90, 61, +-36, -83, -73, -4, 70, 87, 38, -54, -89, -50, 4, 88, 87, -9, -46, -82, +-64, 64, 78, 38, 25, -90, -90, 22, 18, 75, 82, -73, -57, -25, -54, 90, + 83, -36, 13, -67, -90, 80, 31, 13, 75, -89, -67, 46, -43, 57, 88, -85, + 64, 64, -38, -46, -57, -43, 88, 90, -18, -50, -73, -38, 80, 90, 4, -54, // 40 +-83, -36, 67, 90, 25, -57, -90, -31, 50, 89, 46, -61, -90, -25, 31, 88, + 64, -64, -85, -22, 9, 87, 78, -67, -75, -18, -13, 85, 87, -70, -61, -13, +-36, 83, 90, -73, -43, -9, -54, 82, 89, -75, -22, -4, -70, 80, 82, -78, + 64, 64, -54, -61, -25, -9, 85, 73, -75, -89, 4, 46, 70, 25, -88, -82, + 36, 83, 46, -31, -90, -43, 61, 88, 18, -75, -82, 13, 80, 57, -13, -90, +-64, 64, 90, 4, -43, -70, -38, 90, 89, -50, -67, -22, -9, 80, 78, -85, +-83, 36, 22, 38, 57, -87, -90, 78, 50, -18, 31, -54, -87, 90, 73, -67, + 64, 64, -67, -73, 9, 25, 54, 31, -89, -75, 78, 90, -25, -70, -38, 22, // 48 + 83, 36, -85, -78, 43, 90, 22, -67, -75, 18, 90, 38, -57, -80, -4, 90, + 64, -64, -90, 13, 70, 43, -13, -82, -50, 89, 88, -61, -80, 9, 31, 46, + 36, -83, -82, 88, 87, -57, -46, 4, -18, 50, 73, -85, -90, 87, 61, -54, + 64, 64, -78, -82, 43, 57, 4, -22, -50, -18, 82, 54, -90, -80, 73, 90, +-36, -83, -13, 61, 57, -25, -85, -13, 89, 50, -67, -78, 25, 90, 22, -85, +-64, 64, 88, -31, -87, -9, 61, 46, -18, -75, -31, 90, 70, -87, -90, 67, + 83, -36, -54, -4, 9, 43, 38, -73, -75, 89, 90, -88, -80, 70, 46, -38, + 64, 64, -85, -88, 70, 80, -46, -67, 18, 50, 13, -31, -43, 9, 67, 13, // 56 +-83, -36, 90, 54, -87, -70, 73, 82, -50, -89, 22, 90, 9, -87, -38, 78, + 64, -64, -82, 46, 90, -25, -88, 4, 75, 18, -54, -38, 25, 57, 4, -73, +-36, 83, 61, -90, -80, 90, 90, -85, -89, 75, 78, -61, -57, 43, 31, -22, + 64, 64, -90, -90, 87, 90, -82, -90, 75, 89, -67, -88, 57, 87, -46, -85, + 36, 83, -22, -82, 9, 80, 4, -78, -18, 75, 31, -73, -43, 70, 54, -67, +-64, 64, 73, -61, -80, 57, 85, -54, -89, 50, 90, -46, -90, 43, 88, -38, +-83, 36, 78, -31, -70, 25, 61, -22, -50, 18, 38, -13, -25, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dst7_32x32_coeff_ver[1024] = { + 4, 9, 13, 26, 21, 42, 30, 56, 38, 68, 46, 78, 53, 85, 60, 89, // 0 + 66, 90, 72, 86, 77, 80, 80, 72, 84, 60, 86, 46, 88, 30, 90, 13, + 90, -4, 89, -21, 87, -38, 85, -53, 82, -66, 78, -77, 
74, -84, 68, -88, + 63, -90, 56, -87, 50, -82, 42, -74, 34, -63, 26, -50, 17, -34, 9, -17, + 13, 17, 38, 50, 60, 74, 77, 87, 86, 88, 90, 77, 85, 53, 74, 21, + 56, -13, 34, -46, 9, -72, -17, -86, -42, -89, -63, -78, -78, -56, -87, -26, +-90, 9, -84, 42, -72, 68, -53, 85, -30, 90, -4, 80, 21, 60, 46, 30, + 66, -4, 80, -38, 88, -66, 89, -84, 82, -90, 68, -82, 50, -63, 26, -34, + 21, 26, 60, 68, 84, 89, 89, 80, 74, 46, 42, -4, 0, -53, -42, -84, // 8 +-74, -87, -89, -63, -84, -17, -60, 34, -21, 74, 21, 90, 60, 77, 84, 38, + 89, -13, 74, -60, 42, -86, 0, -85, -42, -56, -74, -9, -89, 42, -84, 78, +-60, 90, -21, 72, 21, 30, 60, -21, 84, -66, 89, -88, 74, -82, 42, -50, + 30, 34, 77, 82, 89, 84, 63, 38, 9, -30, -50, -80, -85, -85, -84, -42, +-46, 26, 13, 78, 66, 86, 90, 46, 74, -21, 26, -77, -34, -87, -78, -50, +-88, 17, -60, 74, -4, 88, 53, 53, 86, -13, 82, -72, 42, -89, -17, -56, +-68, 9, -90, 68, -72, 90, -21, 60, 38, -4, 80, -66, 87, -90, 56, -63, + 38, 42, 86, 89, 74, 60, 9, -21, -63, -84, -90, -74, -53, 0, 21, 74, // 16 + 80, 84, 82, 21, 26, -60, -50, -89, -89, -42, -66, 42, 4, 89, 72, 60, + 87, -21, 42, -84, -34, -74, -85, 0, -77, 74, -13, 84, 60, 21, 90, -60, + 56, -89, -17, -42, -78, 42, -84, 89, -30, 60, 46, -21, 88, -84, 68, -74, + 46, 50, 90, 88, 42, 21, -50, -72, -90, -78, -38, 9, 53, 85, 89, 60, + 34, -38, -56, -90, -88, -34, -30, 63, 60, 84, 87, 4, 26, -80, -63, -68, +-86, 26, -21, 89, 66, 46, 85, -53, 17, -87, -68, -17, -84, 74, -13, 77, + 72, -13, 82, -86, 9, -56, -74, 42, -80, 90, -4, 30, 77, -66, 78, -82, + 53, 56, 85, 80, 0, -21, -85, -90, -53, -17, 53, 82, 85, 53, 0, -60, // 24 +-85, -78, -53, 26, 53, 90, 85, 13, 0, -84, -85, -50, -53, 63, 53, 77, + 85, -30, 0, -89, -85, -9, -53, 85, 53, 46, 85, -66, 0, -74, -85, 34, +-53, 88, 53, 4, 85, -86, 0, -42, -85, 68, -53, 72, 53, -38, 85, -87, + 60, 63, 74, 66, -42, -60, -84, -68, 21, 56, 89, 72, 0, -53, -89, -74, +-21, 50, 84, 77, 42, -46, -74, -78, -60, 42, 60, 80, 74, -38, -42, -82, +-84, 34, 21, 84, 89, -30, 0, -85, -89, 26, -21, 86, 84, -21, 42, -87, +-74, 17, -60, 88, 60, -13, 74, -89, -42, 9, -84, 90, 21, -4, 89, -90, + 66, 68, 56, 46, -74, -84, -46, -17, 80, 90, 34, -13, -85, -85, -21, 42, // 32 + 88, 72, 9, -66, -90, -50, 4, 82, 89, 21, -17, -90, -86, 9, 30, 86, + 82, -38, -42, -74, -77, 63, 53, 53, 68, -80, -63, -26, -60, 89, 72, -4, + 50, -87, -78, 34, -38, 77, 84, -60, 26, -56, -87, 78, -13, 30, 90, -88, + 72, 74, 34, 21, -89, -89, 13, 42, 82, 60, -56, -84, -53, 0, 84, 84, + 9, -60, -88, -42, 38, 89, 68, -21, -74, -74, -30, 74, 90, 21, -17, -89, +-80, 42, 60, 60, 50, -84, -85, 0, -4, 84, 87, -60, -42, -42, -66, 89, + 77, -21, 26, -74, -90, 74, 21, 21, 78, -89, -63, 42, -46, 60, 86, -84, + 77, 78, 9, -4, -84, -74, 66, 82, 26, -13, -88, -68, 53, 85, 42, -21, // 40 +-90, -63, 38, 87, 56, -30, -87, -56, 21, 89, 68, -38, -82, -50, 4, 90, + 78, -46, -74, -42, -13, 90, 85, -53, -63, -34, -30, 88, 89, -60, -50, -26, +-46, 86, 90, -66, -34, -17, -60, 84, 86, -72, -17, -9, -72, 80, 80, -77, + 80, 82, -17, -30, -60, -42, 90, 86, -50, -77, -30, 17, 85, 53, -74, -89, + 4, 68, 68, -4, -87, -63, 38, 90, 42, -60, -88, -9, 66, 72, 9, -88, +-77, 50, 84, 21, -26, -78, -53, 85, 90, -38, -56, -34, -21, 84, 82, -80, +-78, 26, 13, 46, 63, -87, -89, 74, 46, -13, 34, -56, -86, 90, 72, -66, + 84, 85, -42, -53, -21, 0, 74, 53, -89, -85, 60, 85, 0, -53, -60, 0, // 48 + 89, 53, -74, -85, 21, 85, 42, -53, -84, 0, 84, 53, -42, -85, -21, 85, + 74, -53, -89, 0, 60, 53, 0, -85, -60, 85, 89, -53, -74, 0, 21, 53, + 42, -85, -84, 85, 84, -53, -42, 0, -21, 53, 
74, -85, -89, 85, 60, -53, + 86, 87, -63, -72, 21, 42, 26, -4, -66, -34, 87, 66, -85, -85, 60, 89, +-17, -77, -30, 50, 68, -13, -88, -26, 84, 60, -56, -82, 13, 90, 34, -80, +-72, 56, 89, -21, -82, -17, 53, 53, -9, -78, -38, 90, 74, -84, -90, 63, + 80, -30, -50, -9, 4, 46, 42, -74, -77, 88, 90, -86, -78, 68, 46, -38, + 88, 89, -78, -84, 60, 74, -34, -60, 4, 42, 26, -21, -53, 0, 74, 21, // 56 +-86, -42, 90, 60, -82, -74, 66, 84, -42, -89, 13, 89, 17, -84, -46, 74, + 68, -60, -84, 42, 90, -21, -85, 0, 72, 21, -50, -42, 21, 60, 9, -74, +-38, 84, 63, -89, -80, 89, 89, -84, -87, 74, 77, -60, -56, 42, 30, -21, + 90, 90, -87, -90, 84, 89, -78, -88, 72, 87, -63, -86, 53, 85, -42, -84, + 30, 82, -17, -80, 4, 78, 9, -77, -21, 74, 34, -72, -46, 68, 56, -66, +-66, 63, 74, -60, -80, 56, 85, -53, -88, 50, 90, -46, -89, 42, 86, -38, +-82, 34, 77, -30, -68, 26, 60, -21, -50, 17, 38, -13, -26, 9, 13, -4, +}; + +ALIGNED(32) const int16_t ff_dct8_32x32_coeff_ver[1024] = { + 90, 90, 90, 87, 89, 84, 88, 78, 87, 72, 86, 63, 85, 53, 84, 42, // 0 + 82, 30, 80, 17, 78, 4, 77, -9, 74, -21, 72, -34, 68, -46, 66, -56, + 63, -66, 60, -74, 56, -80, 53, -85, 50, -88, 46, -90, 42, -89, 38, -86, + 34, -82, 30, -77, 26, -68, 21, -60, 17, -50, 13, -38, 9, -26, 4, -13, + 89, 88, 84, 78, 74, 60, 60, 34, 42, 4, 21, -26, 0, -53, -21, -74, +-42, -86, -60, -90, -74, -82, -84, -66, -89, -42, -89, -13, -84, 17, -74, 46, +-60, 68, -42, 84, -21, 90, 0, 85, 21, 72, 42, 50, 60, 21, 74, -9, + 84, -38, 89, -63, 89, -80, 84, -89, 74, -87, 60, -77, 42, -56, 21, -30, + 87, 86, 72, 63, 42, 21, 4, -26, -34, -66, -66, -87, -85, -85, -89, -60, // 8 +-77, -17, -50, 30, -13, 68, 26, 88, 60, 84, 82, 56, 90, 13, 80, -34, + 56, -72, 21, -89, -17, -82, -53, -53, -78, -9, -90, 38, -84, 74, -63, 90, +-30, 80, 9, 50, 46, 4, 74, -42, 88, -77, 86, -90, 68, -78, 38, -46, + 85, 84, 53, 42, 0, -21, -53, -74, -85, -89, -85, -60, -53, 0, 0, 60, + 53, 89, 85, 74, 85, 21, 53, -42, 0, -84, -53, -84, -85, -42, -85, 21, +-53, 74, 0, 89, 53, 60, 85, 0, 85, -60, 53, -89, 0, -74, -53, -21, +-85, 42, -85, 84, -53, 84, 0, 42, 53, -21, 85, -74, 85, -89, 53, -60, + 82, 80, 30, 17, -42, -60, -86, -90, -77, -50, -17, 30, 53, 85, 89, 74, // 16 + 68, 4, 4, -68, -63, -87, -90, -38, -60, 42, 9, 88, 72, 66, 88, -9, + 50, -77, -21, -84, -78, -26, -85, 53, -38, 90, 34, 56, 84, -21, 80, -82, + 26, -78, -46, -13, -87, 63, -74, 89, -13, 46, 56, -34, 90, -86, 66, -72, + 78, 77, 4, -9, -74, -84, -82, -66, -13, 26, 68, 88, 85, 53, 21, -42, +-63, -90, -87, -38, -30, 56, 56, 87, 89, 21, 38, -68, -50, -82, -90, -4, +-46, 78, 42, 74, 90, -13, 53, -85, -34, -63, -88, 30, -60, 89, 26, 50, + 86, -46, 66, -90, -17, -34, -84, 60, -72, 86, 9, 17, 80, -72, 77, -80, + 74, 72, -21, -34, -89, -89, -42, -13, 60, 82, 84, 56, 0, -53, -84, -84, // 24 +-60, 9, 42, 88, 89, 38, 21, -68, -74, -74, -74, 30, 21, 90, 89, 17, + 42, -80, -60, -60, -84, 50, 0, 85, 84, -4, 60, -87, -42, -42, -89, 66, +-21, 77, 74, -26, 74, -90, -21, -21, -89, 78, -42, 63, 60, -46, 84, -86, + 68, 66, -46, -56, -84, -74, 17, 46, 90, 80, 13, -34, -85, -85, -42, 21, + 72, 88, 66, -9, -50, -90, -82, -4, 21, 89, 90, 17, 9, -86, -86, -30, +-38, 82, 74, 42, 63, -77, -53, -53, -80, 68, 26, 63, 89, -60, 4, -72, +-87, 50, -34, 78, 77, -38, 60, -84, -56, 26, -78, 87, 30, -13, 88, -90, + 63, 60, -66, -74, -60, -42, 68, 84, 56, 21, -72, -89, -53, 0, 74, 89, // 32 + 50, -21, -77, -84, -46, 42, 78, 74, 42, -60, -80, -60, -38, 74, 82, 42, + 34, -84, -84, -21, -30, 89, 85, 0, 26, -89, -86, 21, -21, 84, 87, -42, + 17, -74, -88, 60, -13, 60, 89, -74, 
9, -42, -90, 84, -4, 21, 90, -89,
+ 56, 53, -80, -85, -21, 0, 90, 85, -17, -53, -82, -53, 53, 85, 60, 0,
+-78, -85, -26, 53, 90, 53, -13, -85, -84, 0, 50, 85, 63, -53, -77, -53,
+-30, 85, 89, 0, -9, -85, -85, 53, 46, 53, 66, -85, -74, 0, -34, 85,
+ 88, -53, -4, -53, -86, 85, 42, 0, 68, -85, -72, 53, -38, 53, 87, -85,
+ 50, 46, -88, -90, 21, 42, 72, 50, -78, -90, -9, 38, 85, 53, -60, -89, // 40
+-38, 34, 90, 56, -34, -88, -63, 30, 84, 60, -4, -87, -80, 26, 68, 63,
+ 26, -86, -89, 21, 46, 66, 53, -85, -87, 17, 17, 68, 74, -84, -77, 13,
+-13, 72, 86, -82, -56, 9, -42, 74, 90, -80, -30, 4, -66, 77, 82, -78,
+ 42, 38, -89, -86, 60, 74, 21, -9, -84, -63, 74, 90, 0, -53, -74, -21,
+ 84, 80, -21, -82, -60, 26, 89, 50, -42, -89, -42, 66, 89, 4, -60, -72,
+-21, 87, 84, -42, -74, -34, 0, 85, 74, -77, -84, 13, 21, 60, 60, -90,
+-89, 56, 42, 17, 42, -78, -89, 84, 60, -30, 21, -46, -84, 88, 74, -68,
+ 34, 30, -82, -77, 84, 89, -38, -63, -30, 9, 80, 50, -85, -85, 42, 84, // 48
+ 26, -46, -78, -13, 86, 66, -46, -90, -21, 74, 77, -26, -87, -34, 50, 78,
+ 17, -88, -74, 60, 88, -4, -53, -53, -13, 86, 72, -82, -89, 42, 56, 17,
+ 9, -68, -68, 90, 90, -72, -60, 21, -4, 38, 66, -80, -90, 87, 63, -56,
+ 26, 21, -68, -60, 89, 84, -80, -89, 46, 74, 4, -42, -53, 0, 84, 42,
+-87, -74, 63, 89, -17, -84, -34, 60, 74, -21, -90, -21, 77, 60, -38, -84,
+-13, 89, 60, -74, -86, 42, 85, 0, -56, -42, 9, 74, 42, -89, -78, 84,
+ 90, -60, -72, 21, 30, 21, 21, -60, -66, 84, 88, -89, -82, 74, 50, -42,
+ 17, 13, -50, -38, 74, 60, -87, -77, 88, 86, -77, -90, 53, 85, -21, -74, // 56
+-13, 56, 46, -34, -72, 9, 86, 17, -89, -42, 78, 63, -56, -78, 26, 87,
+ 9, -90, -42, 84, 68, -72, -85, 53, 90, -30, -80, 4, 60, 21, -30, -46,
+ -4, 66, 38, -80, -66, 88, 84, -89, -90, 82, 82, -68, -63, 50, 34, -26,
+ 9, 4, -26, -13, 42, 21, -56, -30, 68, 38, -78, -46, 85, 53, -89, -60,
+ 90, 66, -86, -72, 80, 77, -72, -80, 60, 84, -46, -86, 30, 88, -13, -90,
+ -4, 90, 21, -89, -38, 87, 53, -85, -66, 82, 77, -78, -84, 74, 88, -68,
+-90, 63, 87, -56, -82, 50, 74, -42, -63, 34, 50, -26, -34, 17, 17, -9,
+};
+
+
+typedef int32_t TCoeff;
+typedef int16_t TMatrixCoeff;
+
+//! \ingroup CommonLib
+//! \{
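+
+// For reference: with the constants (a, b, c) = (64, 83, 36), which are the
+// values used by the ff_dct2_*x4 and fi_dct2_*x4 tables above, the macro
+// DEFINE_DCT2_P4_MATRIX defined below expands to the 4-point DCT-2 basis:
+//
+//   { 64,  64,  64,  64 }
+//   { 83,  36, -36, -83 }
+//   { 64, -64, -64,  64 }
+//   { 36, -83,  83, -36 }
+//
+// Each row is the integer approximation of one orthonormal DCT-2 basis
+// vector; the symmetry of the even rows and the antisymmetry of the odd
+// rows are what the butterfly (eo/o) tables above exploit.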
\{ + + + // DCT-2 +#define DEFINE_DCT2_P2_MATRIX(a) \ +{ \ + {a, a}, \ + {a, -a} \ +} + +#define DEFINE_DCT2_P4_MATRIX(a,b,c) \ +{ \ + { a, a, a, a}, \ + { b, c, -c, -b}, \ + { a, -a, -a, a}, \ + { c, -b, b, -c} \ +} + +#define DEFINE_DCT2_P8_MATRIX(a,b,c,d,e,f,g) \ +{ \ + { a, a, a, a, a, a, a, a}, \ + { d, e, f, g, -g, -f, -e, -d}, \ + { b, c, -c, -b, -b, -c, c, b}, \ + { e, -g, -d, -f, f, d, g, -e}, \ + { a, -a, -a, a, a, -a, -a, a}, \ + { f, -d, g, e, -e, -g, d, -f}, \ + { c, -b, b, -c, -c, b, -b, c}, \ + { g, -f, e, -d, d, -e, f, -g} \ +} + +#define DEFINE_DCT2_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o} \ +} + +#define DEFINE_DCT2_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E) \ +{ \ + { a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}, \ + { p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, -E, -D, -C, -B, -A, -z, -y, -x, -w, -v, -u, -t, -s, -r, -q, -p}, \ + { h, i, j, k, l, m, n, o, -o, -n, -m, -l, -k, -j, -i, -h, -h, -i, -j, -k, -l, -m, -n, -o, o, n, m, l, k, j, i, h}, \ + { q, t, w, z, C, -E, -B, -y, -v, -s, -p, -r, -u, -x, -A, -D, D, A, x, u, r, p, s, v, y, B, E, -C, -z, -w, -t, -q}, \ + { d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d, d, e, f, g, -g, -f, -e, -d, -d, -e, -f, -g, g, f, e, d}, \ + { r, w, B, -D, -y, -t, -p, -u, -z, -E, A, v, q, s, x, C, -C, -x, -s, -q, -v, -A, E, z, u, p, t, y, D, -B, -w, -r}, \ + { i, l, o, -m, -j, -h, -k, -n, n, k, h, j, m, -o, -l, -i, -i, -l, -o, m, j, h, k, n, -n, -k, -h, -j, -m, o, l, i}, \ + { s, z, -D, -w, -p, -v, -C, A, t, r, y, -E, -x, -q, -u, -B, B, u, q, x, E, -y, -r, -t, -A, C, v, p, w, D, -z, -s}, \ + { b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b, b, c, -c, -b, -b, -c, c, b}, \ + { t, C, -y, -p, -x, D, u, s, B, -z, -q, -w, E, v, r, A, -A, -r, -v, -E, w, q, z, -B, -s, -u, -D, x, p, y, -C, -t}, \ + { j, o, -k, -i, -n, l, h, m, -m, -h, -l, n, i, k, -o, -j, -j, -o, k, i, n, -l, -h, -m, m, h, l, -n, -i, -k, o, j}, \ + { u, -E, -t, -v, D, s, w, -C, -r, -x, B, q, y, -A, -p, -z, z, p, A, -y, -q, -B, x, r, C, -w, -s, -D, v, t, E, -u}, \ + { e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e, e, -g, -d, -f, f, d, g, -e, -e, g, d, f, -f, -d, -g, e}, \ + { v, -B, -p, -C, u, w, -A, -q, -D, t, x, -z, -r, -E, s, y, -y, -s, E, r, z, -x, -t, D, q, A, -w, -u, C, p, B, -v}, \ + { k, -m, -i, o, h, n, -j, -l, l, j, -n, -h, -o, i, m, -k, -k, m, i, -o, -h, -n, j, l, -l, -j, n, h, o, -i, -m, k}, \ + { w, -y, -u, A, s, -C, -q, E, p, D, -r, -B, t, z, -v, 
-x, x, v, -z, -t, B, r, -D, -p, -E, q, C, -s, -A, u, y, -w}, \ + { a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a, a, -a, -a, a}, \ + { x, -v, -z, t, B, -r, -D, p, -E, -q, C, s, -A, -u, y, w, -w, -y, u, A, -s, -C, q, E, -p, D, r, -B, -t, z, v, -x}, \ + { l, -j, -n, h, -o, -i, m, k, -k, -m, i, o, -h, n, j, -l, -l, j, n, -h, o, i, -m, -k, k, m, -i, -o, h, -n, -j, l}, \ + { y, -s, -E, r, -z, -x, t, D, -q, A, w, -u, -C, p, -B, -v, v, B, -p, C, u, -w, -A, q, -D, -t, x, z, -r, E, s, -y}, \ + { f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f, f, -d, g, e, -e, -g, d, -f, -f, d, -g, -e, e, g, -d, f}, \ + { z, -p, A, y, -q, B, x, -r, C, w, -s, D, v, -t, E, u, -u, -E, t, -v, -D, s, -w, -C, r, -x, -B, q, -y, -A, p, -z}, \ + { m, -h, l, n, -i, k, o, -j, j, -o, -k, i, -n, -l, h, -m, -m, h, -l, -n, i, -k, -o, j, -j, o, k, -i, n, l, -h, m}, \ + { A, -r, v, -E, -w, q, -z, -B, s, -u, D, x, -p, y, C, -t, t, -C, -y, p, -x, -D, u, -s, B, z, -q, w, E, -v, r, -A}, \ + { c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c, c, -b, b, -c, -c, b, -b, c}, \ + { B, -u, q, -x, E, y, -r, t, -A, -C, v, -p, w, -D, -z, s, -s, z, D, -w, p, -v, C, A, -t, r, -y, -E, x, -q, u, -B}, \ + { n, -k, h, -j, m, o, -l, i, -i, l, -o, -m, j, -h, k, -n, -n, k, -h, j, -m, -o, l, -i, i, -l, o, m, -j, h, -k, n}, \ + { C, -x, s, -q, v, -A, -E, z, -u, p, -t, y, -D, -B, w, -r, r, -w, B, D, -y, t, -p, u, -z, E, A, -v, q, -s, x, -C}, \ + { g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g, g, -f, e, -d, d, -e, f, -g, -g, f, -e, d, -d, e, -f, g}, \ + { D, -A, x, -u, r, -p, s, -v, y, -B, E, C, -z, w, -t, q, -q, t, -w, z, -C, -E, B, -y, v, -s, p, -r, u, -x, A, -D}, \ + { o, -n, m, -l, k, -j, i, -h, h, -i, j, -k, l, -m, n, -o, -o, n, -m, l, -k, j, -i, h, -h, i, -j, k, -l, m, -n, o}, \ + { E, -D, C, -B, A, -z, y, -x, w, -v, u, -t, s, -r, q, -p, p, -q, r, -s, t, -u, v, -w, x, -y, z, -A, B, -C, D, -E} \ +} + + +#define DEFINE_DCT2_P64_MATRIX(aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck) \ +{ \ + { aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa, aa }, \ + { bf, bg, bh, bi, bj, bk, bl, bm, bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck, -ck, -cj, -ci, -ch, -cg, -cf, -ce, -cd, -cc, -cb, -ca, -bz, -by, -bx, -bw, -bv, -bu, -bt, -bs, -br, -bq, -bp, -bo, -bn, -bm, -bl, -bk, -bj, -bi, -bh, -bg, -bf }, \ + { ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, -be, -bd, -bc, -bb, -ba, -az, -ay, -ax, -aw, -av, -au, -at, -as, -ar, -aq, -ap, -ap, -aq, -ar, -as, -at, -au, -av, -aw, -ax, -ay, -az, -ba, -bb, -bc, -bd, -be, be, bd, bc, bb, ba, az, ay, ax, aw, av, au, at, as, ar, aq, ap }, \ + { bg, bj, bm, bp, bs, bv, by, cb, ce, ch, ck, -ci, -cf, -cc, -bz, -bw, -bt, -bq, -bn, -bk, -bh, -bf, -bi, -bl, -bo, -br, -bu, -bx, -ca, -cd, -cg, -cj, cj, cg, cd, ca, bx, bu, br, bo, bl, bi, bf, bh, bk, bn, bq, bt, bw, bz, cc, cf, ci, -ck, -ch, -ce, -cb, -by, -bv, -bs, -bp, -bm, -bj, -bg }, \ + { ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, 
aj, ai, ah, ah, ai, aj, ak, al, am, an, ao, -ao, -an, -am, -al, -ak, -aj, -ai, -ah, -ah, -ai, -aj, -ak, -al, -am, -an, -ao, ao, an, am, al, ak, aj, ai, ah }, \ + { bh, bm, br, bw, cb, cg, -ck, -cf, -ca, -bv, -bq, -bl, -bg, -bi, -bn, -bs, -bx, -cc, -ch, cj, ce, bz, bu, bp, bk, bf, bj, bo, bt, by, cd, ci, -ci, -cd, -by, -bt, -bo, -bj, -bf, -bk, -bp, -bu, -bz, -ce, -cj, ch, cc, bx, bs, bn, bi, bg, bl, bq, bv, ca, cf, ck, -cg, -cb, -bw, -br, -bm, -bh }, \ + { aq, at, aw, az, bc, -be, -bb, -ay, -av, -as, -ap, -ar, -au, -ax, -ba, -bd, bd, ba, ax, au, ar, ap, as, av, ay, bb, be, -bc, -az, -aw, -at, -aq, -aq, -at, -aw, -az, -bc, be, bb, ay, av, as, ap, ar, au, ax, ba, bd, -bd, -ba, -ax, -au, -ar, -ap, -as, -av, -ay, -bb, -be, bc, az, aw, at, aq }, \ + { bi, bp, bw, cd, ck, -ce, -bx, -bq, -bj, -bh, -bo, -bv, -cc, -cj, cf, by, br, bk, bg, bn, bu, cb, ci, -cg, -bz, -bs, -bl, -bf, -bm, -bt, -ca, -ch, ch, ca, bt, bm, bf, bl, bs, bz, cg, -ci, -cb, -bu, -bn, -bg, -bk, -br, -by, -cf, cj, cc, bv, bo, bh, bj, bq, bx, ce, -ck, -cd, -bw, -bp, -bi }, \ + { ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad, ad, ae, af, ag, -ag, -af, -ae, -ad, -ad, -ae, -af, -ag, ag, af, ae, ad }, \ + { bj, bs, cb, ck, -cc, -bt, -bk, -bi, -br, -ca, -cj, cd, bu, bl, bh, bq, bz, ci, -ce, -bv, -bm, -bg, -bp, -by, -ch, cf, bw, bn, bf, bo, bx, cg, -cg, -bx, -bo, -bf, -bn, -bw, -cf, ch, by, bp, bg, bm, bv, ce, -ci, -bz, -bq, -bh, -bl, -bu, -cd, cj, ca, br, bi, bk, bt, cc, -ck, -cb, -bs, -bj }, \ + { ar, aw, bb, -bd, -ay, -at, -ap, -au, -az, -be, ba, av, aq, as, ax, bc, -bc, -ax, -as, -aq, -av, -ba, be, az, au, ap, at, ay, bd, -bb, -aw, -ar, -ar, -aw, -bb, bd, ay, at, ap, au, az, be, -ba, -av, -aq, -as, -ax, -bc, bc, ax, as, aq, av, ba, -be, -az, -au, -ap, -at, -ay, -bd, bb, aw, ar }, \ + { bk, bv, cg, -ce, -bt, -bi, -bm, -bx, -ci, cc, br, bg, bo, bz, ck, -ca, -bp, -bf, -bq, -cb, cj, by, bn, bh, bs, cd, -ch, -bw, -bl, -bj, -bu, -cf, cf, bu, bj, bl, bw, ch, -cd, -bs, -bh, -bn, -by, -cj, cb, bq, bf, bp, ca, -ck, -bz, -bo, -bg, -br, -cc, ci, bx, bm, bi, bt, ce, -cg, -bv, -bk }, \ + { ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai, ai, al, ao, -am, -aj, -ah, -ak, -an, an, ak, ah, aj, am, -ao, -al, -ai, -ai, -al, -ao, am, aj, ah, ak, an, -an, -ak, -ah, -aj, -am, ao, al, ai }, \ + { bl, by, -ck, -bx, -bk, -bm, -bz, cj, bw, bj, bn, ca, -ci, -bv, -bi, -bo, -cb, ch, bu, bh, bp, cc, -cg, -bt, -bg, -bq, -cd, cf, bs, bf, br, ce, -ce, -br, -bf, -bs, -cf, cd, bq, bg, bt, cg, -cc, -bp, -bh, -bu, -ch, cb, bo, bi, bv, ci, -ca, -bn, -bj, -bw, -cj, bz, bm, bk, bx, ck, -by, -bl }, \ + { as, az, -bd, -aw, -ap, -av, -bc, ba, at, ar, ay, -be, -ax, -aq, -au, -bb, bb, au, aq, ax, be, -ay, -ar, -at, -ba, bc, av, ap, aw, bd, -az, -as, -as, -az, bd, aw, ap, av, bc, -ba, -at, -ar, -ay, be, ax, aq, au, bb, -bb, -au, -aq, -ax, -be, ay, ar, at, ba, -bc, -av, -ap, -aw, -bd, az, as }, \ + { bm, cb, -cf, -bq, -bi, -bx, cj, bu, bf, bt, ci, -by, -bj, -bp, -ce, cc, bn, bl, ca, -cg, -br, -bh, -bw, ck, bv, bg, bs, ch, -bz, -bk, -bo, -cd, cd, bo, bk, bz, -ch, -bs, -bg, -bv, -ck, bw, bh, br, cg, -ca, -bl, -bn, -cc, ce, bp, bj, by, -ci, -bt, -bf, -bu, -cj, bx, bi, bq, cf, -cb, -bm }, \ + { ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, 
-ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab, ab, ac, -ac, -ab, -ab, -ac, ac, ab }, \ + { bn, ce, -ca, -bj, -br, -ci, bw, bf, bv, -cj, -bs, -bi, -bz, cf, bo, bm, cd, -cb, -bk, -bq, -ch, bx, bg, bu, -ck, -bt, -bh, -by, cg, bp, bl, cc, -cc, -bl, -bp, -cg, by, bh, bt, ck, -bu, -bg, -bx, ch, bq, bk, cb, -cd, -bm, -bo, -cf, bz, bi, bs, cj, -bv, -bf, -bw, ci, br, bj, ca, -ce, -bn }, \ + { at, bc, -ay, -ap, -ax, bd, au, as, bb, -az, -aq, -aw, be, av, ar, ba, -ba, -ar, -av, -be, aw, aq, az, -bb, -as, -au, -bd, ax, ap, ay, -bc, -at, -at, -bc, ay, ap, ax, -bd, -au, -as, -bb, az, aq, aw, -be, -av, -ar, -ba, ba, ar, av, be, -aw, -aq, -az, bb, as, au, bd, -ax, -ap, -ay, bc, at }, \ + { bo, ch, -bv, -bh, -ca, cc, bj, bt, -cj, -bq, -bm, -cf, bx, bf, by, -ce, -bl, -br, -ck, bs, bk, cd, -bz, -bg, -bw, cg, bn, bp, ci, -bu, -bi, -cb, cb, bi, bu, -ci, -bp, -bn, -cg, bw, bg, bz, -cd, -bk, -bs, ck, br, bl, ce, -by, -bf, -bx, cf, bm, bq, cj, -bt, -bj, -cc, ca, bh, bv, -ch, -bo }, \ + { aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj, aj, ao, -ak, -ai, -an, al, ah, am, -am, -ah, -al, an, ai, ak, -ao, -aj, -aj, -ao, ak, ai, an, -al, -ah, -am, am, ah, al, -an, -ai, -ak, ao, aj }, \ + { bp, ck, -bq, -bo, -cj, br, bn, ci, -bs, -bm, -ch, bt, bl, cg, -bu, -bk, -cf, bv, bj, ce, -bw, -bi, -cd, bx, bh, cc, -by, -bg, -cb, bz, bf, ca, -ca, -bf, -bz, cb, bg, by, -cc, -bh, -bx, cd, bi, bw, -ce, -bj, -bv, cf, bk, bu, -cg, -bl, -bt, ch, bm, bs, -ci, -bn, -br, cj, bo, bq, -ck, -bp }, \ + { au, -be, -at, -av, bd, as, aw, -bc, -ar, -ax, bb, aq, ay, -ba, -ap, -az, az, ap, ba, -ay, -aq, -bb, ax, ar, bc, -aw, -as, -bd, av, at, be, -au, -au, be, at, av, -bd, -as, -aw, bc, ar, ax, -bb, -aq, -ay, ba, ap, az, -az, -ap, -ba, ay, aq, bb, -ax, -ar, -bc, aw, as, bd, -av, -at, -be, au }, \ + { bq, -ci, -bl, -bv, cd, bg, ca, -by, -bi, -cf, bt, bn, ck, -bo, -bs, cg, bj, bx, -cb, -bf, -cc, bw, bk, ch, -br, -bp, cj, bm, bu, -ce, -bh, -bz, bz, bh, ce, -bu, -bm, -cj, bp, br, -ch, -bk, -bw, cc, bf, cb, -bx, -bj, -cg, bs, bo, -ck, -bn, -bt, cf, bi, by, -ca, -bg, -cd, bv, bl, ci, -bq }, \ + { ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae, ae, -ag, -ad, -af, af, ad, ag, -ae, -ae, ag, ad, af, -af, -ad, -ag, ae }, \ + { br, -cf, -bg, -cc, bu, bo, -ci, -bj, -bz, bx, bl, ck, -bm, -bw, ca, bi, ch, -bp, -bt, cd, bf, ce, -bs, -bq, cg, bh, cb, -bv, -bn, cj, bk, by, -by, -bk, -cj, bn, bv, -cb, -bh, -cg, bq, bs, -ce, -bf, -cd, bt, bp, -ch, -bi, -ca, bw, bm, -ck, -bl, -bx, bz, bj, ci, -bo, -bu, cc, bg, cf, -br }, \ + { av, -bb, -ap, -bc, au, aw, -ba, -aq, -bd, at, ax, -az, -ar, -be, as, ay, -ay, -as, be, ar, az, -ax, -at, bd, aq, ba, -aw, -au, bc, ap, bb, -av, -av, bb, ap, bc, -au, -aw, ba, aq, bd, -at, -ax, az, ar, be, -as, -ay, ay, as, -be, -ar, -az, ax, at, -bd, -aq, -ba, aw, au, -bc, -ap, -bb, av }, \ + { bs, -cc, -bi, -cj, bl, bz, -bv, -bp, cf, bf, cg, -bo, -bw, by, bm, -ci, -bh, -cd, br, bt, -cb, -bj, -ck, bk, ca, -bu, -bq, ce, bg, ch, -bn, -bx, bx, bn, -ch, -bg, -ce, bq, bu, -ca, -bk, ck, bj, cb, -bt, -br, cd, bh, ci, -bm, -by, bw, bo, -cg, -bf, -cf, bp, bv, -bz, -bl, cj, bi, cc, -bs }, \ + { ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, 
-ai, -am, ak, ak, -am, -ai, ao, ah, an, -aj, -al, al, aj, -an, -ah, -ao, ai, am, -ak, -ak, am, ai, -ao, -ah, -an, aj, al, -al, -aj, an, ah, ao, -ai, -am, ak }, \ + { bt, -bz, -bn, cf, bh, ck, -bi, -ce, bo, by, -bu, -bs, ca, bm, -cg, -bg, -cj, bj, cd, -bp, -bx, bv, br, -cb, -bl, ch, bf, ci, -bk, -cc, bq, bw, -bw, -bq, cc, bk, -ci, -bf, -ch, bl, cb, -br, -bv, bx, bp, -cd, -bj, cj, bg, cg, -bm, -ca, bs, bu, -by, -bo, ce, bi, -ck, -bh, -cf, bn, bz, -bt }, \ + { aw, -ay, -au, ba, as, -bc, -aq, be, ap, bd, -ar, -bb, at, az, -av, -ax, ax, av, -az, -at, bb, ar, -bd, -ap, -be, aq, bc, -as, -ba, au, ay, -aw, -aw, ay, au, -ba, -as, bc, aq, -be, -ap, -bd, ar, bb, -at, -az, av, ax, -ax, -av, az, at, -bb, -ar, bd, ap, be, -aq, -bc, as, ba, -au, -ay, aw }, \ + { bu, -bw, -bs, by, bq, -ca, -bo, cc, bm, -ce, -bk, cg, bi, -ci, -bg, ck, bf, cj, -bh, -ch, bj, cf, -bl, -cd, bn, cb, -bp, -bz, br, bx, -bt, -bv, bv, bt, -bx, -br, bz, bp, -cb, -bn, cd, bl, -cf, -bj, ch, bh, -cj, -bf, -ck, bg, ci, -bi, -cg, bk, ce, -bm, -cc, bo, ca, -bq, -by, bs, bw, -bu }, \ + { aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa, aa, -aa, -aa, aa }, \ + { bv, -bt, -bx, br, bz, -bp, -cb, bn, cd, -bl, -cf, bj, ch, -bh, -cj, bf, -ck, -bg, ci, bi, -cg, -bk, ce, bm, -cc, -bo, ca, bq, -by, -bs, bw, bu, -bu, -bw, bs, by, -bq, -ca, bo, cc, -bm, -ce, bk, cg, -bi, -ci, bg, ck, -bf, cj, bh, -ch, -bj, cf, bl, -cd, -bn, cb, bp, -bz, -br, bx, bt, -bv }, \ + { ax, -av, -az, at, bb, -ar, -bd, ap, -be, -aq, bc, as, -ba, -au, ay, aw, -aw, -ay, au, ba, -as, -bc, aq, be, -ap, bd, ar, -bb, -at, az, av, -ax, -ax, av, az, -at, -bb, ar, bd, -ap, be, aq, -bc, -as, ba, au, -ay, -aw, aw, ay, -au, -ba, as, bc, -aq, -be, ap, -bd, -ar, bb, at, -az, -av, ax }, \ + { bw, -bq, -cc, bk, ci, -bf, ch, bl, -cb, -br, bv, bx, -bp, -cd, bj, cj, -bg, cg, bm, -ca, -bs, bu, by, -bo, -ce, bi, ck, -bh, cf, bn, -bz, -bt, bt, bz, -bn, -cf, bh, -ck, -bi, ce, bo, -by, -bu, bs, ca, -bm, -cg, bg, -cj, -bj, cd, bp, -bx, -bv, br, cb, -bl, -ch, bf, -ci, -bk, cc, bq, -bw }, \ + { al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al, al, -aj, -an, ah, -ao, -ai, am, ak, -ak, -am, ai, ao, -ah, an, aj, -al, -al, aj, an, -ah, ao, ai, -am, -ak, ak, am, -ai, -ao, ah, -an, -aj, al }, \ + { bx, -bn, -ch, bg, -ce, -bq, bu, ca, -bk, -ck, bj, -cb, -bt, br, cd, -bh, ci, bm, -by, -bw, bo, cg, -bf, cf, bp, -bv, -bz, bl, cj, -bi, cc, bs, -bs, -cc, bi, -cj, -bl, bz, bv, -bp, -cf, bf, -cg, -bo, bw, by, -bm, -ci, bh, -cd, -br, bt, cb, -bj, ck, bk, -ca, -bu, bq, ce, -bg, ch, bn, -bx }, \ + { ay, -as, -be, ar, -az, -ax, at, bd, -aq, ba, aw, -au, -bc, ap, -bb, -av, av, bb, -ap, bc, au, -aw, -ba, aq, -bd, -at, ax, az, -ar, be, as, -ay, -ay, as, be, -ar, az, ax, -at, -bd, aq, -ba, -aw, au, bc, -ap, bb, av, -av, -bb, ap, -bc, -au, aw, ba, -aq, bd, at, -ax, -az, ar, -be, -as, ay }, \ + { by, -bk, cj, bn, -bv, -cb, bh, -cg, -bq, bs, ce, -bf, cd, bt, -bp, -ch, bi, -ca, -bw, bm, ck, -bl, bx, bz, -bj, ci, bo, -bu, -cc, bg, -cf, -br, br, cf, -bg, cc, bu, -bo, -ci, bj, -bz, -bx, bl, -ck, -bm, bw, ca, -bi, ch, bp, -bt, -cd, bf, -ce, -bs, bq, cg, -bh, cb, bv, -bn, -cj, bk, -by }, \ + { af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, 
ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af, af, -ad, ag, ae, -ae, -ag, ad, -af, -af, ad, -ag, -ae, ae, ag, -ad, af }, \ + { bz, -bh, ce, bu, -bm, cj, bp, -br, -ch, bk, -bw, -cc, bf, -cb, -bx, bj, -cg, -bs, bo, ck, -bn, bt, cf, -bi, by, ca, -bg, cd, bv, -bl, ci, bq, -bq, -ci, bl, -bv, -cd, bg, -ca, -by, bi, -cf, -bt, bn, -ck, -bo, bs, cg, -bj, bx, cb, -bf, cc, bw, -bk, ch, br, -bp, -cj, bm, -bu, -ce, bh, -bz }, \ + { az, -ap, ba, ay, -aq, bb, ax, -ar, bc, aw, -as, bd, av, -at, be, au, -au, -be, at, -av, -bd, as, -aw, -bc, ar, -ax, -bb, aq, -ay, -ba, ap, -az, -az, ap, -ba, -ay, aq, -bb, -ax, ar, -bc, -aw, as, -bd, -av, at, -be, -au, au, be, -at, av, bd, -as, aw, bc, -ar, ax, bb, -aq, ay, ba, -ap, az }, \ + { ca, -bf, bz, cb, -bg, by, cc, -bh, bx, cd, -bi, bw, ce, -bj, bv, cf, -bk, bu, cg, -bl, bt, ch, -bm, bs, ci, -bn, br, cj, -bo, bq, ck, -bp, bp, -ck, -bq, bo, -cj, -br, bn, -ci, -bs, bm, -ch, -bt, bl, -cg, -bu, bk, -cf, -bv, bj, -ce, -bw, bi, -cd, -bx, bh, -cc, -by, bg, -cb, -bz, bf, -ca }, \ + { am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am, am, -ah, al, an, -ai, ak, ao, -aj, aj, -ao, -ak, ai, -an, -al, ah, -am, -am, ah, -al, -an, ai, -ak, -ao, aj, -aj, ao, ak, -ai, an, al, -ah, am }, \ + { cb, -bi, bu, ci, -bp, bn, -cg, -bw, bg, -bz, -cd, bk, -bs, -ck, br, -bl, ce, by, -bf, bx, cf, -bm, bq, -cj, -bt, bj, -cc, -ca, bh, -bv, -ch, bo, -bo, ch, bv, -bh, ca, cc, -bj, bt, cj, -bq, bm, -cf, -bx, bf, -by, -ce, bl, -br, ck, bs, -bk, cd, bz, -bg, bw, cg, -bn, bp, -ci, -bu, bi, -cb }, \ + { ba, -ar, av, -be, -aw, aq, -az, -bb, as, -au, bd, ax, -ap, ay, bc, -at, at, -bc, -ay, ap, -ax, -bd, au, -as, bb, az, -aq, aw, be, -av, ar, -ba, -ba, ar, -av, be, aw, -aq, az, bb, -as, au, -bd, -ax, ap, -ay, -bc, at, -at, bc, ay, -ap, ax, bd, -au, as, -bb, -az, aq, -aw, -be, av, -ar, ba }, \ + { cc, -bl, bp, -cg, -by, bh, -bt, ck, bu, -bg, bx, ch, -bq, bk, -cb, -cd, bm, -bo, cf, bz, -bi, bs, -cj, -bv, bf, -bw, -ci, br, -bj, ca, ce, -bn, bn, -ce, -ca, bj, -br, ci, bw, -bf, bv, cj, -bs, bi, -bz, -cf, bo, -bm, cd, cb, -bk, bq, -ch, -bx, bg, -bu, -ck, bt, -bh, by, cg, -bp, bl, -cc }, \ + { ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac, ac, -ab, ab, -ac, -ac, ab, -ab, ac }, \ + { cd, -bo, bk, -bz, -ch, bs, -bg, bv, -ck, -bw, bh, -br, cg, ca, -bl, bn, -cc, -ce, bp, -bj, by, ci, -bt, bf, -bu, cj, bx, -bi, bq, -cf, -cb, bm, -bm, cb, cf, -bq, bi, -bx, -cj, bu, -bf, bt, -ci, -by, bj, -bp, ce, cc, -bn, bl, -ca, -cg, br, -bh, bw, ck, -bv, bg, -bs, ch, bz, -bk, bo, -cd }, \ + { bb, -au, aq, -ax, be, ay, -ar, at, -ba, -bc, av, -ap, aw, -bd, -az, as, -as, az, bd, -aw, ap, -av, bc, ba, -at, ar, -ay, -be, ax, -aq, au, -bb, -bb, au, -aq, ax, -be, -ay, ar, -at, ba, bc, -av, ap, -aw, bd, az, -as, as, -az, -bd, aw, -ap, av, -bc, -ba, at, -ar, ay, be, -ax, aq, -au, bb }, \ + { ce, -br, bf, -bs, cf, cd, -bq, bg, -bt, cg, cc, -bp, bh, -bu, ch, cb, -bo, bi, -bv, ci, ca, -bn, bj, -bw, cj, bz, -bm, bk, -bx, ck, by, -bl, bl, -by, -ck, bx, -bk, bm, -bz, -cj, bw, -bj, bn, -ca, -ci, bv, -bi, bo, -cb, -ch, bu, -bh, bp, -cc, -cg, bt, -bg, bq, -cd, -cf, bs, -bf, br, -ce }, \ + { an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, 
ah, -ak, an, an, -ak, ah, -aj, am, ao, -al, ai, -ai, al, -ao, -am, aj, -ah, ak, -an, -an, ak, -ah, aj, -am, -ao, al, -ai, ai, -al, ao, am, -aj, ah, -ak, an }, \ + { cf, -bu, bj, -bl, bw, -ch, -cd, bs, -bh, bn, -by, cj, cb, -bq, bf, -bp, ca, ck, -bz, bo, -bg, br, -cc, -ci, bx, -bm, bi, -bt, ce, cg, -bv, bk, -bk, bv, -cg, -ce, bt, -bi, bm, -bx, ci, cc, -br, bg, -bo, bz, -ck, -ca, bp, -bf, bq, -cb, -cj, by, -bn, bh, -bs, cd, ch, -bw, bl, -bj, bu, -cf }, \ + { bc, -ax, as, -aq, av, -ba, -be, az, -au, ap, -at, ay, -bd, -bb, aw, -ar, ar, -aw, bb, bd, -ay, at, -ap, au, -az, be, ba, -av, aq, -as, ax, -bc, -bc, ax, -as, aq, -av, ba, be, -az, au, -ap, at, -ay, bd, bb, -aw, ar, -ar, aw, -bb, -bd, ay, -at, ap, -au, az, -be, -ba, av, -aq, as, -ax, bc }, \ + { cg, -bx, bo, -bf, bn, -bw, cf, ch, -by, bp, -bg, bm, -bv, ce, ci, -bz, bq, -bh, bl, -bu, cd, cj, -ca, br, -bi, bk, -bt, cc, ck, -cb, bs, -bj, bj, -bs, cb, -ck, -cc, bt, -bk, bi, -br, ca, -cj, -cd, bu, -bl, bh, -bq, bz, -ci, -ce, bv, -bm, bg, -bp, by, -ch, -cf, bw, -bn, bf, -bo, bx, -cg }, \ + { ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag, ag, -af, ae, -ad, ad, -ae, af, -ag, -ag, af, -ae, ad, -ad, ae, -af, ag }, \ + { ch, -ca, bt, -bm, bf, -bl, bs, -bz, cg, ci, -cb, bu, -bn, bg, -bk, br, -by, cf, cj, -cc, bv, -bo, bh, -bj, bq, -bx, ce, ck, -cd, bw, -bp, bi, -bi, bp, -bw, cd, -ck, -ce, bx, -bq, bj, -bh, bo, -bv, cc, -cj, -cf, by, -br, bk, -bg, bn, -bu, cb, -ci, -cg, bz, -bs, bl, -bf, bm, -bt, ca, -ch }, \ + { bd, -ba, ax, -au, ar, -ap, as, -av, ay, -bb, be, bc, -az, aw, -at, aq, -aq, at, -aw, az, -bc, -be, bb, -ay, av, -as, ap, -ar, au, -ax, ba, -bd, -bd, ba, -ax, au, -ar, ap, -as, av, -ay, bb, -be, -bc, az, -aw, at, -aq, aq, -at, aw, -az, bc, be, -bb, ay, -av, as, -ap, ar, -au, ax, -ba, bd }, \ + { ci, -cd, by, -bt, bo, -bj, bf, -bk, bp, -bu, bz, -ce, cj, ch, -cc, bx, -bs, bn, -bi, bg, -bl, bq, -bv, ca, -cf, ck, cg, -cb, bw, -br, bm, -bh, bh, -bm, br, -bw, cb, -cg, -ck, cf, -ca, bv, -bq, bl, -bg, bi, -bn, bs, -bx, cc, -ch, -cj, ce, -bz, bu, -bp, bk, -bf, bj, -bo, bt, -by, cd, -ci }, \ + { ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao, ao, -an, am, -al, ak, -aj, ai, -ah, ah, -ai, aj, -ak, al, -am, an, -ao, -ao, an, -am, al, -ak, aj, -ai, ah, -ah, ai, -aj, ak, -al, am, -an, ao }, \ + { cj, -cg, cd, -ca, bx, -bu, br, -bo, bl, -bi, bf, -bh, bk, -bn, bq, -bt, bw, -bz, cc, -cf, ci, ck, -ch, ce, -cb, by, -bv, bs, -bp, bm, -bj, bg, -bg, bj, -bm, bp, -bs, bv, -by, cb, -ce, ch, -ck, -ci, cf, -cc, bz, -bw, bt, -bq, bn, -bk, bh, -bf, bi, -bl, bo, -br, bu, -bx, ca, -cd, cg, -cj }, \ + { be, -bd, bc, -bb, ba, -az, ay, -ax, aw, -av, au, -at, as, -ar, aq, -ap, ap, -aq, ar, -as, at, -au, av, -aw, ax, -ay, az, -ba, bb, -bc, bd, -be, -be, bd, -bc, bb, -ba, az, -ay, ax, -aw, av, -au, at, -as, ar, -aq, ap, -ap, aq, -ar, as, -at, au, -av, aw, -ax, ay, -az, ba, -bb, bc, -bd, be }, \ + { ck, -cj, ci, -ch, cg, -cf, ce, -cd, cc, -cb, ca, -bz, by, -bx, bw, -bv, bu, -bt, bs, -br, bq, -bp, bo, -bn, bm, -bl, bk, -bj, bi, -bh, bg, -bf, bf, -bg, bh, -bi, bj, -bk, bl, -bm, bn, -bo, bp, -bq, br, -bs, bt, -bu, bv, -bw, bx, -by, bz, -ca, cb, -cc, cd, -ce, cf, -cg, ch, -ci, cj, -ck }, \ + } + +// DCT-8 +#define DEFINE_DCT8_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d,}, \ + { b, 0, -b, -b,}, \ + { c, -b, -d, a,}, \ + { d, -b, 
a, -c,}, \ +} + +#define DEFINE_DCT8_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { b, e, h, -g, -d, -a, -c, -f,}, \ + { c, h, -e, -a, -f, g, b, d,}, \ + { d, -g, -a, -h, c, e, -f, -b,}, \ + { e, -d, -f, c, g, -b, -h, a,}, \ + { f, -a, g, e, -b, h, d, -c,}, \ + { g, -c, b, -f, -h, d, -a, e,}, \ + { h, -f, d, -b, a, -c, e, -g,}, \ +} + +#define DEFINE_DCT8_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { b, e, h, k, n, 0, -n, -k, -h, -e, -b, -b, -e, -h, -k, -n,}, \ + { c, h, m, -p, -k, -f, -a, -e, -j, -o, n, i, d, b, g, l,}, \ + { d, k, -p, -i, -b, -f, -m, n, g, a, h, o, -l, -e, -c, -j,}, \ + { e, n, -k, -b, -h, 0, h, b, k, -n, -e, -e, -n, k, b, h,}, \ + { f, 0, -f, -f, 0, f, f, 0, -f, -f, 0, f, f, 0, -f, -f,}, \ + { g, -n, -a, -m, h, f, -o, -b, -l, i, e, -p, -c, -k, j, d,}, \ + { h, -k, -e, n, b, 0, -b, -n, e, k, -h, -h, k, e, -n, -b,}, \ + { i, -h, -j, g, k, -f, -l, e, m, -d, -n, c, o, -b, -p, a,}, \ + { j, -e, -o, a, -n, -f, i, k, -d, -p, b, -m, -g, h, l, -c,}, \ + { k, -b, n, h, -e, 0, e, -h, -n, b, -k, -k, b, -n, -h, e,}, \ + { l, -b, i, o, -e, f, -p, -h, c, -m, -k, a, -j, -n, d, -g,}, \ + { m, -e, d, -l, -n, f, -c, k, o, -g, b, -j, -p, h, -a, i,}, \ + { n, -h, b, -e, k, 0, -k, e, -b, h, -n, -n, h, -b, e, -k,}, \ + { o, -k, g, -c, b, -f, j, -n, -p, l, -h, d, -a, e, -i, m,}, \ + { p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o,}, \ +} + +#define DEFINE_DCT8_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { b, e, h, k, n, q, t, w, z, C, F, -E, -B, -y, -v, -s, -p, -m, -j, -g, -d, -a, -c, -f, -i, -l, -o, -r, -u, -x, -A, -D,}, \ + { c, h, m, r, w, B, 0, -B, -w, -r, -m, -h, -c, -c, -h, -m, -r, -w, -B, 0, B, w, r, m, h, c, c, h, m, r, w, B,}, \ + { d, k, r, y, F, -A, -t, -m, -f, -b, -i, -p, -w, -D, C, v, o, h, a, g, n, u, B, -E, -x, -q, -j, -c, -e, -l, -s, -z,}, \ + { e, n, w, F, -y, -p, -g, -c, -l, -u, -D, A, r, i, a, j, s, B, -C, -t, -k, -b, -h, -q, -z, E, v, m, d, f, o, x,}, \ + { f, q, B, -A, -p, -e, -g, -r, -C, z, o, d, h, s, D, -y, -n, -c, -i, -t, -E, x, m, b, j, u, F, -w, -l, -a, -k, -v,}, \ + { g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t, 0, -t, -g, -g, -t, 0, t, g, g, t,}, \ + { h, w, -B, -m, -c, -r, 0, r, c, m, B, -w, -h, -h, -w, B, m, c, r, 0, -r, -c, -m, -B, w, h, h, w, -B, -m, -c, -r,}, \ + { i, z, -w, -f, -l, -C, t, c, o, F, -q, -a, -r, E, n, d, u, -B, -k, -g, -x, y, h, j, A, -v, -e, -m, -D, s, b, p,}, \ + { j, C, -r, -b, -u, z, g, m, F, -o, -e, -x, w, d, p, -E, -l, -h, -A, t, a, s, -B, -i, -k, -D, q, c, v, -y, -f, -n,}, \ + { k, F, -m, -i, -D, o, g, B, -q, -e, -z, s, c, x, -u, -a, -v, w, b, t, -y, -d, -r, A, f, p, -C, -h, -n, E, j, l,}, \ + { l, -E, -h, -p, A, d, t, -w, -a, -x, s, e, B, -o, -i, -F, k, m, -D, -g, -q, z, c, u, -v, -b, -y, r, f, C, -n, -j,}, \ + { m, -B, -c, -w, r, h, 0, -h, -r, w, c, B, -m, -m, B, c, w, -r, -h, 0, h, r, -w, -c, -B, m, m, -B, -c, -w, r, h,}, \ + { n, -y, -c, -D, i, s, -t, -h, E, d, x, -o, -m, z, b, C, -j, -r, u, g, -F, -e, -w, p, l, -A, -a, -B, k, q, -v, -f,}, \ + { o, -v, -h, C, a, D, -g, -w, n, p, -u, -i, B, b, E, -f, -x, m, q, -t, -j, A, c, F, -e, -y, l, r, -s, -k, z, d,}, \ + { p, -s, -m, v, j, -y, -g, B, d, -E, -a, -F, c, C, -f, -z, i, w, -l, -t, o, q, -r, -n, u, k, -x, -h, A, e, -D, -b,}, \ + { q, -p, -r, o, s, -n, -t, m, u, -l, -v, k, w, -j, -x, i, y, -h, -z, g, A, -f, -B, e, C, -d, 
-D, c, E, -b, -F, a,}, \ + { r, -m, -w, h, B, -c, 0, c, -B, -h, w, m, -r, -r, m, w, -h, -B, c, 0, -c, B, h, -w, -m, r, r, -m, -w, h, B, -c,}, \ + { s, -j, -B, a, -C, -i, t, r, -k, -A, b, -D, -h, u, q, -l, -z, c, -E, -g, v, p, -m, -y, d, -F, -f, w, o, -n, -x, e,}, \ + { t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g, 0, g, -t, -t, g, 0, -g, t, t, -g,}, \ + { u, -d, B, n, -k, -E, g, -r, -x, a, -y, -q, h, -F, -j, o, A, -c, v, t, -e, C, m, -l, -D, f, -s, -w, b, -z, -p, i,}, \ + { v, -a, w, u, -b, x, t, -c, y, s, -d, z, r, -e, A, q, -f, B, p, -g, C, o, -h, D, n, -i, E, m, -j, F, l, -k,}, \ + { w, -c, r, B, -h, m, 0, -m, h, -B, -r, c, -w, -w, c, -r, -B, h, -m, 0, m, -h, B, r, -c, w, w, -c, r, B, -h, m,}, \ + { x, -f, m, -E, -q, b, -t, -B, j, -i, A, u, -c, p, F, -n, e, -w, -y, g, -l, D, r, -a, s, C, -k, h, -z, -v, d, -o,}, \ + { y, -i, h, -x, -z, j, -g, w, A, -k, f, -v, -B, l, -e, u, C, -m, d, -t, -D, n, -c, s, E, -o, b, -r, -F, p, -a, q,}, \ + { z, -l, c, -q, E, u, -g, h, -v, -D, p, -b, m, -A, -y, k, -d, r, -F, -t, f, -i, w, C, -o, a, -n, B, x, -j, e, -s,}, \ + { A, -o, c, -j, v, F, -t, h, -e, q, -C, -y, m, -a, l, -x, -D, r, -f, g, -s, E, w, -k, b, -n, z, B, -p, d, -i, u,}, \ + { B, -r, h, -c, m, -w, 0, w, -m, c, -h, r, -B, -B, r, -h, c, -m, w, 0, -w, m, -c, h, -r, B, B, -r, h, -c, m, -w,}, \ + { C, -u, m, -e, d, -l, t, -B, -D, v, -n, f, -c, k, -s, A, E, -w, o, -g, b, -j, r, -z, -F, x, -p, h, -a, i, -q, y,}, \ + { D, -x, r, -l, f, -a, g, -m, s, -y, E, C, -w, q, -k, e, -b, h, -n, t, -z, F, B, -v, p, -j, d, -c, i, -o, u, -A,}, \ + { E, -A, w, -s, o, -k, g, -c, b, -f, j, -n, r, -v, z, -D, -F, B, -x, t, -p, l, -h, d, -a, e, -i, m, -q, u, -y, C,}, \ + { F, -D, B, -z, x, -v, t, -r, p, -n, l, -j, h, -f, d, -b, a, -c, e, -g, i, -k, m, -o, q, -s, u, -w, y, -A, C, -E,}, \ +} + + +// DST-7 +#define DEFINE_DST7_P4_MATRIX(a,b,c,d) \ +{ \ + { a, b, c, d }, \ + { c, c, 0, -c }, \ + { d, -a, -c, b }, \ + { b, -d, c, -a }, \ +} + +#define DEFINE_DST7_P8_MATRIX(a,b,c,d,e,f,g,h) \ +{ \ + { a, b, c, d, e, f, g, h,}, \ + { c, f, h, e, b, -a, -d, -g,}, \ + { e, g, b, -c, -h, -d, a, f,}, \ + { g, c, -d, -f, a, h, b, -e,}, \ + { h, -a, -g, b, f, -c, -e, d,}, \ + { f, -e, -a, g, -d, -b, h, -c,}, \ + { d, -h, e, -a, -c, g, -f, b,}, \ + { b, -d, f, -h, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P16_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p,}, \ + { c, f, i, l, o, o, l, i, f, c, 0, -c, -f, -i, -l, -o,}, \ + { e, j, o, m, h, c, -b, -g, -l, -p, -k, -f, -a, d, i, n,}, \ + { g, n, l, e, -b, -i, -p, -j, -c, d, k, o, h, a, -f, -m,}, \ + { i, o, f, -c, -l, -l, -c, f, o, i, 0, -i, -o, -f, c, l,}, \ + { k, k, 0, -k, -k, 0, k, k, 0, -k, -k, 0, k, k, 0, -k,}, \ + { m, g, -f, -n, -a, l, h, -e, -o, -b, k, i, -d, -p, -c, j,}, \ + { o, c, -l, -f, i, i, -f, -l, c, o, 0, -o, -c, l, f, -i,}, \ + { p, -a, -o, b, n, -c, -m, d, l, -e, -k, f, j, -g, -i, h,}, \ + { n, -e, -i, j, d, -o, a, m, -f, -h, k, c, -p, b, l, -g,}, \ + { l, -i, -c, o, -f, -f, o, -c, -i, l, 0, -l, i, c, -o, f,}, \ + { j, -m, c, g, -p, f, d, -n, i, a, -k, l, -b, -h, o, -e,}, \ + { h, -p, i, -a, -g, o, -j, b, f, -n, k, -c, -e, m, -l, d,}, \ + { f, -l, o, -i, c, c, -i, o, -l, f, 0, -f, l, -o, i, -c,}, \ + { d, -h, l, -p, m, -i, e, -a, -c, g, -k, o, -n, j, -f, b,}, \ + { b, -d, f, -h, j, -l, n, -p, o, -m, k, -i, g, -e, c, -a,}, \ +} + +#define DEFINE_DST7_P32_MATRIX(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F) \ +{ \ + { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, 
s, t, u, v, w, x, y, z, A, B, C, D, E, F,}, \ + { c, f, i, l, o, r, u, x, A, D, F, C, z, w, t, q, n, k, h, e, b, -a, -d, -g, -j, -m, -p, -s, -v, -y, -B, -E,}, \ + { e, j, o, t, y, D, D, y, t, o, j, e, 0, -e, -j, -o, -t, -y, -D, -D, -y, -t, -o, -j, -e, 0, e, j, o, t, y, D,}, \ + { g, n, u, B, D, w, p, i, b, -e, -l, -s, -z, -F, -y, -r, -k, -d, c, j, q, x, E, A, t, m, f, -a, -h, -o, -v, -C,}, \ + { i, r, A, C, t, k, b, -g, -p, -y, -E, -v, -m, -d, e, n, w, F, x, o, f, -c, -l, -u, -D, -z, -q, -h, a, j, s, B,}, \ + { k, v, F, u, j, -a, -l, -w, -E, -t, -i, b, m, x, D, s, h, -c, -n, -y, -C, -r, -g, d, o, z, B, q, f, -e, -p, -A,}, \ + { m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z, z, m, 0, -m, -z, -z, -m, 0, m, z,}, \ + { o, D, t, e, -j, -y, -y, -j, e, t, D, o, 0, -o, -D, -t, -e, j, y, y, j, -e, -t, -D, -o, 0, o, D, t, e, -j, -y,}, \ + { q, E, n, -c, -t, -B, -k, f, w, y, h, -i, -z, -v, -e, l, C, s, b, -o, -F, -p, a, r, D, m, -d, -u, -A, -j, g, x,}, \ + { s, A, h, -k, -D, -p, c, v, x, e, -n, -F, -m, f, y, u, b, -q, -C, -j, i, B, r, -a, -t, -z, -g, l, E, o, -d, -w,}, \ + { u, w, b, -s, -y, -d, q, A, f, -o, -C, -h, m, E, j, -k, -F, -l, i, D, n, -g, -B, -p, e, z, r, -c, -x, -t, a, v,}, \ + { w, s, -d, -A, -o, h, E, k, -l, -D, -g, p, z, c, -t, -v, a, x, r, -e, -B, -n, i, F, j, -m, -C, -f, q, y, b, -u,}, \ + { y, o, -j, -D, -e, t, t, -e, -D, -j, o, y, 0, -y, -o, j, D, e, -t, -t, e, D, j, -o, -y, 0, y, o, -j, -D, -e, t,}, \ + { A, k, -p, -v, e, F, f, -u, -q, j, B, a, -z, -l, o, w, -d, -E, -g, t, r, -i, -C, -b, y, m, -n, -x, c, D, h, -s,}, \ + { C, g, -v, -n, o, u, -h, -B, a, D, f, -w, -m, p, t, -i, -A, b, E, e, -x, -l, q, s, -j, -z, c, F, d, -y, -k, r,}, \ + { E, c, -B, -f, y, i, -v, -l, s, o, -p, -r, m, u, -j, -x, g, A, -d, -D, a, F, b, -C, -e, z, h, -w, -k, t, n, -q,}, \ + { F, -a, -E, b, D, -c, -C, d, B, -e, -A, f, z, -g, -y, h, x, -i, -w, j, v, -k, -u, l, t, -m, -s, n, r, -o, -q, p,}, \ + { D, -e, -y, j, t, -o, -o, t, j, -y, -e, D, 0, -D, e, y, -j, -t, o, o, -t, -j, y, e, -D, 0, D, -e, -y, j, t, -o,}, \ + { B, -i, -s, r, j, -A, -a, C, -h, -t, q, k, -z, -b, D, -g, -u, p, l, -y, -c, E, -f, -v, o, m, -x, -d, F, -e, -w, n,}, \ + { z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m, -m, z, 0, -z, m, m, -z, 0, z, -m,}, \ + { x, -q, -g, E, -j, -n, A, -c, -u, t, d, -B, m, k, -D, f, r, -w, -a, y, -p, -h, F, -i, -o, z, -b, -v, s, e, -C, l,}, \ + { v, -u, -a, w, -t, -b, x, -s, -c, y, -r, -d, z, -q, -e, A, -p, -f, B, -o, -g, C, -n, -h, D, -m, -i, E, -l, -j, F, -k,}, \ + { t, -y, e, o, -D, j, j, -D, o, e, -y, t, 0, -t, y, -e, -o, D, -j, -j, D, -o, -e, y, -t, 0, t, -y, e, o, -D, j,}, \ + { r, -C, k, g, -y, v, -d, -n, F, -o, -c, u, -z, h, j, -B, s, -a, -q, D, -l, -f, x, -w, e, m, -E, p, b, -t, A, -i,}, \ + { p, -F, q, -a, -o, E, -r, b, n, -D, s, -c, -m, C, -t, d, l, -B, u, -e, -k, A, -v, f, j, -z, w, -g, -i, y, -x, h,}, \ + { n, -B, w, -i, -e, s, -F, r, -d, -j, x, -A, m, a, -o, C, -v, h, f, -t, E, -q, c, k, -y, z, -l, -b, p, -D, u, -g,}, \ + { l, -x, C, -q, e, g, -s, E, -v, j, b, -n, z, -A, o, -c, -i, u, -F, t, -h, -d, p, -B, y, -m, a, k, -w, D, -r, f,}, \ + { j, -t, D, -y, o, -e, -e, o, -y, D, -t, j, 0, -j, t, -D, y, -o, e, e, -o, y, -D, t, -j, 0, j, -t, D, -y, o, -e,}, \ + { h, -p, x, -F, y, -q, i, -a, -g, o, -w, E, -z, r, -j, b, f, -n, v, -D, A, -s, k, -c, -e, m, -u, C, -B, t, -l, d,}, \ + { f, -l, r, -x, D, -C, w, -q, k, -e, -a, g, -m, s, -y, E, -B, v, -p, j, -d, -b, h, -n, t, -z, F, -A, u, -o, i, -c,}, \ + { d, -h, l, -p, t, -x, B, -F, C, -y, u, -q, m, -i, e, 
-a, -c, g, -k, o, -s, w, -A, E, -D, z, -v, r, -n, j, -f, b,}, \
+ { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \
+}
+
+// Only the forward direction is kept here; the second matrix copy used by vvenc is commented out below.
+#define TRANSFORM_NUMBER_OF_DIRECTIONS 1
+// Portable alignment: __declspec(align()) is MSVC-only, so fall back to
+// __attribute__((aligned())) on GCC/Clang.
+#ifdef _MSC_VER
+#define ALIGN_DATA(nBytes,v) __declspec(align(nBytes)) v
+#else
+#define ALIGN_DATA(nBytes,v) v __attribute__((aligned(nBytes)))
+#endif
+#define MEMORY_ALIGN_DEF_SIZE 32 // for use with avx2 (256 bit)
+//--------------------------------------------------------------------------------------------------
+// DCT-2
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2]) =
+{
+  DEFINE_DCT2_P2_MATRIX(64),
+  //DEFINE_DCT2_P2_MATRIX(64)
+};
+
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
+{
+  DEFINE_DCT2_P4_MATRIX(64, 83, 36),
+  //DEFINE_DCT2_P4_MATRIX(64, 83, 36)
+};
+
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
+{
+  DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18),
+  //DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18)
+};
+
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
+{
+  DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9),
+  //DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9)
+};
+
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
+{
+  DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4),
+  //DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4)
+};
+
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P64[TRANSFORM_NUMBER_OF_DIRECTIONS][64][64]) =
+{
+  DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2),
+  //DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2)
+};
+
+// DCT-8
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
+{
+  DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29),
+  //DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29)
+};
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
+{
+  DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17),
+  //DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17)
+};
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
+{
+  DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8),
+  //DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8)
+};
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
+{
+  DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4),
+  //DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 
56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4)
+};
+
+// DST-7
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
+{
+  DEFINE_DST7_P4_MATRIX(29, 55, 74, 84),
+  //DEFINE_DST7_P4_MATRIX(29, 55, 74, 84)
+};
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
+{
+  DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86),
+  //DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86)
+};
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
+{
+  DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88),
+  //DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88)
+};
+ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
+{
+  DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90),
+  //DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90)
+};
+
+//--------------------------------------------------------------------------------------------------
+
+// Coefficient matrix lookup, indexed as [transform type][log2(size) - 1]:
+// row 0 = DCT-2 (sizes 2..64), row 1 = DCT-8, row 2 = DST-7 (sizes 4..32).
+// NULL marks sizes that are not defined for a transform type.
+static const int16_t* vvenc_matrix_coeffs[3][6] = {
+  {g_trCoreDCT2P2[0][0], g_trCoreDCT2P4[0][0], g_trCoreDCT2P8[0][0], g_trCoreDCT2P16[0][0], g_trCoreDCT2P32[0][0], g_trCoreDCT2P64[0][0]},
+  {NULL, g_trCoreDCT8P4[0][0], g_trCoreDCT8P8[0][0], g_trCoreDCT8P16[0][0], g_trCoreDCT8P32[0][0], NULL},
+  {NULL, g_trCoreDST7P4[0][0], g_trCoreDST7P8[0][0], g_trCoreDST7P16[0][0], g_trCoreDST7P32[0][0], NULL},
+};
+
+//! \}
+
+
+
+#endif // DCT_AVX2_TABLES_H
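
For reference, the product each transform pass computes from these tables can be written as a plain scalar loop. The sketch below is not part of the patch: fwd_pass_ref is a hypothetical helper, the matrix is taken row-major from vvenc_matrix_coeffs, shift is assumed positive, and clipping of the intermediate result to the 16-bit dynamic range is omitted. The AVX2 kernels presumably compute the same sums with _mm256_madd_epi16 over the interleaved *_coeff_ver layouts.

#include <stdint.h>

// Hypothetical scalar reference of one forward transform pass:
// dst = coeff * src, where coeff is a row-major [size][size] matrix
// (e.g. vvenc_matrix_coeffs[0][4] for the 32-point DCT-2) and src
// holds `lines` interleaved columns of `size` input samples each.
static void fwd_pass_ref(const int16_t *coeff, const int16_t *src,
                         int16_t *dst, int size, int lines, int shift)
{
  const int32_t add = 1 << (shift - 1); // rounding offset, shift > 0 assumed
  for (int k = 0; k < size; ++k) {      // output frequency index
    for (int n = 0; n < lines; ++n) {   // sample line
      int32_t sum = 0;
      for (int i = 0; i < size; ++i) {
        sum += (int32_t)coeff[k * size + i] * src[i * lines + n];
      }
      dst[k * lines + n] = (int16_t)((sum + add) >> shift); // 16-bit clip omitted
    }
  }
}

Under these assumptions, fwd_pass_ref(vvenc_matrix_coeffs[0][4], src, tmp, 32, 32, shift) would apply the 32-point DCT-2 to the columns of a 32x32 block, which is the operation the AVX2 paths are meant to accelerate.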