/***************************************************************************** * This file is part of Kvazaar HEVC encoder. * * Copyright (C) 2013-2014 Tampere University of Technology and others (see * COPYING file). * * Kvazaar is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as published * by the Free Software Foundation. * * Kvazaar is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Kvazaar. If not, see . ****************************************************************************/ /* * \file */ #include #include "dct-avx2.h" #include "strategyselector.h" #include "tables.h" #if COMPILE_INTEL_AVX2 #include extern const int16_t g_t4[4][4]; extern const int16_t g_t8[8][8]; extern const int16_t g_t16[16][16]; extern const int16_t g_t32[32][32]; /** * \brief Generic partial butterfly functions * * TODO: description * * \param TODO * * \returns TODO */ // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm // gives identical results static void fast_forward_dst_4_avx2(short *block, short *coeff, int32_t shift) // input block, output coeff { int32_t i, c[4]; int32_t rnd_factor = 1 << (shift - 1); for (i = 0; i < 4; i++) { // int32_termediate Variables c[0] = block[4 * i + 0] + block[4 * i + 3]; c[1] = block[4 * i + 1] + block[4 * i + 3]; c[2] = block[4 * i + 0] - block[4 * i + 1]; c[3] = 74 * block[4 * i + 2]; coeff[i] = (short)((29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift); coeff[4 + i] = (short)((74 * (block[4 * i + 0] + block[4 * i + 1] - block[4 * i + 3]) + rnd_factor) >> shift); coeff[8 + i] = (short)((29 * c[2] + 55 * c[0] - c[3] + rnd_factor) >> shift); coeff[12 + i] = (short)((55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift); } } static void fast_inverse_dst_4_avx2(short *tmp, short *block, int shift) // input tmp, output block { int i, c[4]; int rnd_factor = 1 << (shift - 1); for (i = 0; i < 4; i++) { // Intermediate Variables c[0] = tmp[i] + tmp[8 + i]; c[1] = tmp[8 + i] + tmp[12 + i]; c[2] = tmp[i] - tmp[12 + i]; c[3] = 74 * tmp[4 + i]; block[4 * i + 0] = (short)CLIP(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift); block[4 * i + 1] = (short)CLIP(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift); block[4 * i + 2] = (short)CLIP(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift); block[4 * i + 3] = (short)CLIP(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift); } } static void partial_butterfly_4_avx2(const int16_t * const src, int16_t * const dst, const int32_t shift) { __m256i tmp0, tmp1, coeff, e, o; __m256i add = _mm256_set1_epi32(1 << (shift - 1)); tmp0 = _mm256_loadu_si256( (__m256i *)src ); int32_t a, b, c, d; tmp0 = _mm256_shufflelo_epi16(tmp0, 128 + 16 + 12 + 0); tmp0 = _mm256_shufflehi_epi16(tmp0, 128 + 16 + 12 + 0); tmp1 = _mm256_castsi128_si256(_mm256_extractf128_si256(tmp0, 1)); //Get pairs of coeff a = ((int32_t*)g_t4)[0]; b = ((int32_t*)g_t4)[2]; c = ((int32_t*)g_t4)[4]; d = ((int32_t*)g_t4)[6]; //Copy and set coeffs in the right order for madd coeff = _mm256_castsi128_si256(_mm_set1_epi32(a)); coeff = _mm256_insertf128_si256(coeff, _mm_set1_epi32(b), 1); e = _mm256_hadd_epi16(tmp0, tmp1); o = _mm256_hsub_epi16(tmp0, tmp1); e = _mm256_insertf128_si256(e, _mm256_castsi256_si128(o), 1); //Multiply 16-bit pairs, extends results to 32 bits tmp0 = _mm256_madd_epi16(coeff, e); coeff = _mm256_castsi128_si256(_mm_set1_epi32(c)); coeff = _mm256_insertf128_si256(coeff, _mm_set1_epi32(d), 1); tmp1 = _mm256_madd_epi16(coeff, e); tmp0 = _mm256_add_epi32(tmp0, add); tmp0 = _mm256_srai_epi32(tmp0, shift); tmp1 = _mm256_add_epi32(tmp1, add); tmp1 = _mm256_srai_epi32(tmp1, shift); //32-bit -> 16-bit tmp0 = _mm256_packs_epi32(tmp0, tmp1); tmp0 = _mm256_permute4x64_epi64(tmp0, 8+16+128+64); _mm256_storeu_si256( (__m256i *)dst, tmp0 ); } static void partial_butterfly_inverse_4_avx2(short *src, short *dst, int shift) { int j; int e[2], o[2]; int add = 1 << (shift - 1); const int32_t line = 4; for (j = 0; j < line; j++) { // Utilizing symmetry properties to the maximum to minimize the number of multiplications o[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line]; o[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line]; e[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line]; e[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line]; // Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector dst[0] = (short)CLIP(-32768, 32767, (e[0] + o[0] + add) >> shift); dst[1] = (short)CLIP(-32768, 32767, (e[1] + o[1] + add) >> shift); dst[2] = (short)CLIP(-32768, 32767, (e[1] - o[1] + add) >> shift); dst[3] = (short)CLIP(-32768, 32767, (e[0] - o[0] + add) >> shift); src++; dst += 4; } } static void partial_butterfly_8_avx2(short *src, short *dst, int32_t shift) { int32_t j, k; int32_t e[4], o[4]; int32_t ee[2], eo[2]; int32_t add = 1 << (shift - 1); const int32_t line = 8; for (j = 0; j < line; j++) { // E and O for (k = 0; k < 4; k++) { e[k] = src[k] + src[7 - k]; o[k] = src[k] - src[7 - k]; } // EE and EO ee[0] = e[0] + e[3]; eo[0] = e[0] - e[3]; ee[1] = e[1] + e[2]; eo[1] = e[1] - e[2]; dst[0] = (short)((g_t8[0][0] * ee[0] + g_t8[0][1] * ee[1] + add) >> shift); dst[4 * line] = (short)((g_t8[4][0] * ee[0] + g_t8[4][1] * ee[1] + add) >> shift); dst[2 * line] = (short)((g_t8[2][0] * eo[0] + g_t8[2][1] * eo[1] + add) >> shift); dst[6 * line] = (short)((g_t8[6][0] * eo[0] + g_t8[6][1] * eo[1] + add) >> shift); dst[line] = (short)((g_t8[1][0] * o[0] + g_t8[1][1] * o[1] + g_t8[1][2] * o[2] + g_t8[1][3] * o[3] + add) >> shift); dst[3 * line] = (short)((g_t8[3][0] * o[0] + g_t8[3][1] * o[1] + g_t8[3][2] * o[2] + g_t8[3][3] * o[3] + add) >> shift); dst[5 * line] = (short)((g_t8[5][0] * o[0] + g_t8[5][1] * o[1] + g_t8[5][2] * o[2] + g_t8[5][3] * o[3] + add) >> shift); dst[7 * line] = (short)((g_t8[7][0] * o[0] + g_t8[7][1] * o[1] + g_t8[7][2] * o[2] + g_t8[7][3] * o[3] + add) >> shift); src += 8; dst++; } } static void partial_butterfly_inverse_8_avx2(int16_t *src, int16_t *dst, int32_t shift) { int32_t j, k; int32_t e[4], o[4]; int32_t ee[2], eo[2]; int32_t add = 1 << (shift - 1); const int32_t line = 8; for (j = 0; j < line; j++) { // Utilizing symmetry properties to the maximum to minimize the number of multiplications for (k = 0; k < 4; k++) { o[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line]; } eo[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line]; eo[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line]; ee[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line]; ee[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line]; // Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector e[0] = ee[0] + eo[0]; e[3] = ee[0] - eo[0]; e[1] = ee[1] + eo[1]; e[2] = ee[1] - eo[1]; for (k = 0; k < 4; k++) { dst[k] = (int16_t)MAX(-32768, MIN(32767, (e[k] + o[k] + add) >> shift)); dst[k + 4] = (int16_t)MAX(-32768, MIN(32767, (e[3 - k] - o[3 - k] + add) >> shift)); } src++; dst += 8; } } static void partial_butterfly_16_avx2(short *src, short *dst, int32_t shift) { int32_t j, k; int32_t e[8], o[8]; int32_t ee[4], eo[4]; int32_t eee[2], eeo[2]; int32_t add = 1 << (shift - 1); const int32_t line = 16; for (j = 0; j < line; j++) { // E and O for (k = 0; k < 8; k++) { e[k] = src[k] + src[15 - k]; o[k] = src[k] - src[15 - k]; } // EE and EO for (k = 0; k < 4; k++) { ee[k] = e[k] + e[7 - k]; eo[k] = e[k] - e[7 - k]; } // EEE and EEO eee[0] = ee[0] + ee[3]; eeo[0] = ee[0] - ee[3]; eee[1] = ee[1] + ee[2]; eeo[1] = ee[1] - ee[2]; dst[0] = (short)((g_t16[0][0] * eee[0] + g_t16[0][1] * eee[1] + add) >> shift); dst[8 * line] = (short)((g_t16[8][0] * eee[0] + g_t16[8][1] * eee[1] + add) >> shift); dst[4 * line] = (short)((g_t16[4][0] * eeo[0] + g_t16[4][1] * eeo[1] + add) >> shift); dst[12 * line] = (short)((g_t16[12][0] * eeo[0] + g_t16[12][1] * eeo[1] + add) >> shift); for (k = 2; k < 16; k += 4) { dst[k*line] = (short)((g_t16[k][0] * eo[0] + g_t16[k][1] * eo[1] + g_t16[k][2] * eo[2] + g_t16[k][3] * eo[3] + add) >> shift); } for (k = 1; k < 16; k += 2) { dst[k*line] = (short)((g_t16[k][0] * o[0] + g_t16[k][1] * o[1] + g_t16[k][2] * o[2] + g_t16[k][3] * o[3] + g_t16[k][4] * o[4] + g_t16[k][5] * o[5] + g_t16[k][6] * o[6] + g_t16[k][7] * o[7] + add) >> shift); } src += 16; dst++; } } static void partial_butterfly_inverse_16_avx2(int16_t *src, int16_t *dst, int32_t shift) { int32_t j, k; int32_t e[8], o[8]; int32_t ee[4], eo[4]; int32_t eee[2], eeo[2]; int32_t add = 1 << (shift - 1); const int32_t line = 16; for (j = 0; j < line; j++) { // Utilizing symmetry properties to the maximum to minimize the number of multiplications for (k = 0; k < 8; k++) { o[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] + g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line]; } for (k = 0; k < 4; k++) { eo[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line]; } eeo[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line]; eee[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line]; eeo[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line]; eee[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line]; // Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector for (k = 0; k < 2; k++) { ee[k] = eee[k] + eeo[k]; ee[k + 2] = eee[1 - k] - eeo[1 - k]; } for (k = 0; k < 4; k++) { e[k] = ee[k] + eo[k]; e[k + 4] = ee[3 - k] - eo[3 - k]; } for (k = 0; k < 8; k++) { dst[k] = (short)MAX(-32768, MIN(32767, (e[k] + o[k] + add) >> shift)); dst[k + 8] = (short)MAX(-32768, MIN(32767, (e[7 - k] - o[7 - k] + add) >> shift)); } src++; dst += 16; } } static void partial_butterfly_32_avx2(short *src, short *dst, int32_t shift) { int32_t j, k; int32_t e[16], o[16]; int32_t ee[8], eo[8]; int32_t eee[4], eeo[4]; int32_t eeee[2], eeeo[2]; int32_t add = 1 << (shift - 1); const int32_t line = 32; for (j = 0; j < line; j++) { // E and O for (k = 0; k < 16; k++) { e[k] = src[k] + src[31 - k]; o[k] = src[k] - src[31 - k]; } // EE and EO for (k = 0; k < 8; k++) { ee[k] = e[k] + e[15 - k]; eo[k] = e[k] - e[15 - k]; } // EEE and EEO for (k = 0; k < 4; k++) { eee[k] = ee[k] + ee[7 - k]; eeo[k] = ee[k] - ee[7 - k]; } // EEEE and EEEO eeee[0] = eee[0] + eee[3]; eeeo[0] = eee[0] - eee[3]; eeee[1] = eee[1] + eee[2]; eeeo[1] = eee[1] - eee[2]; dst[0] = (short)((g_t32[0][0] * eeee[0] + g_t32[0][1] * eeee[1] + add) >> shift); dst[16 * line] = (short)((g_t32[16][0] * eeee[0] + g_t32[16][1] * eeee[1] + add) >> shift); dst[8 * line] = (short)((g_t32[8][0] * eeeo[0] + g_t32[8][1] * eeeo[1] + add) >> shift); dst[24 * line] = (short)((g_t32[24][0] * eeeo[0] + g_t32[24][1] * eeeo[1] + add) >> shift); for (k = 4; k < 32; k += 8) { dst[k*line] = (short)((g_t32[k][0] * eeo[0] + g_t32[k][1] * eeo[1] + g_t32[k][2] * eeo[2] + g_t32[k][3] * eeo[3] + add) >> shift); } for (k = 2; k < 32; k += 4) { dst[k*line] = (short)((g_t32[k][0] * eo[0] + g_t32[k][1] * eo[1] + g_t32[k][2] * eo[2] + g_t32[k][3] * eo[3] + g_t32[k][4] * eo[4] + g_t32[k][5] * eo[5] + g_t32[k][6] * eo[6] + g_t32[k][7] * eo[7] + add) >> shift); } for (k = 1; k < 32; k += 2) { dst[k*line] = (short)((g_t32[k][0] * o[0] + g_t32[k][1] * o[1] + g_t32[k][2] * o[2] + g_t32[k][3] * o[3] + g_t32[k][4] * o[4] + g_t32[k][5] * o[5] + g_t32[k][6] * o[6] + g_t32[k][7] * o[7] + g_t32[k][8] * o[8] + g_t32[k][9] * o[9] + g_t32[k][10] * o[10] + g_t32[k][11] * o[11] + g_t32[k][12] * o[12] + g_t32[k][13] * o[13] + g_t32[k][14] * o[14] + g_t32[k][15] * o[15] + add) >> shift); } src += 32; dst++; } } static void partial_butterfly_inverse_32_avx2(int16_t *src, int16_t *dst, int32_t shift) { int32_t j, k; int32_t e[16], o[16]; int32_t ee[8], eo[8]; int32_t eee[4], eeo[4]; int32_t eeee[2], eeeo[2]; int32_t add = 1 << (shift - 1); const int32_t line = 32; for (j = 0; j> shift)); dst[k + 16] = (short)MAX(-32768, MIN(32767, (e[15 - k] - o[15 - k] + add) >> shift)); } src++; dst += 32; } } #define DCT_NXN_AVX2(n) \ static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \ \ int16_t tmp[n*n]; \ int32_t shift_1st = g_convert_to_bit[n] + 1 + (bitdepth - 8); \ int32_t shift_2nd = g_convert_to_bit[n] + 8; \ \ partial_butterfly_ ## n ## _avx2(block, tmp, shift_1st); \ partial_butterfly_ ## n ## _avx2(tmp, coeff, shift_2nd); \ } #define IDCT_NXN_AVX2(n) \ static void idct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \ \ int16_t tmp[ ## n ## * ## n ##]; \ int32_t shift_1st = 7; \ int32_t shift_2nd = 12 - (bitdepth - 8); \ \ partial_butterfly_inverse_ ## n ## _avx2(coeff, tmp, shift_1st); \ partial_butterfly_inverse_ ## n ## _avx2(tmp, block, shift_2nd); \ } DCT_NXN_AVX2(4); DCT_NXN_AVX2(8); DCT_NXN_AVX2(16); DCT_NXN_AVX2(32); IDCT_NXN_AVX2(4); IDCT_NXN_AVX2(8); IDCT_NXN_AVX2(16); IDCT_NXN_AVX2(32); static void fast_forward_dst_4x4_avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { int16_t tmp[4 * 4]; int32_t shift_1st = g_convert_to_bit[4] + 1 + (bitdepth - 8); int32_t shift_2nd = g_convert_to_bit[4] + 8; fast_forward_dst_4_avx2(block, tmp, shift_1st); fast_forward_dst_4_avx2(tmp, coeff, shift_2nd); } static void fast_inverse_dst_4x4_avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { int16_t tmp[4 * 4]; int32_t shift_1st = 7; int32_t shift_2nd = 12 - (bitdepth - 8); fast_inverse_dst_4_avx2(coeff, tmp, shift_1st); fast_inverse_dst_4_avx2(tmp, block, shift_2nd); } #endif //COMPILE_INTEL_AVX2 int strategy_register_dct_avx2(void* opaque) { bool success = true; #if COMPILE_INTEL_AVX2 success &= strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 0, &fast_forward_dst_4x4_avx2); success &= strategyselector_register(opaque, "dct_4x4", "avx2", 0, &dct_4x4_avx2); success &= strategyselector_register(opaque, "dct_8x8", "avx2", 0, &dct_8x8_avx2); success &= strategyselector_register(opaque, "dct_16x16", "avx2", 0, &dct_16x16_avx2); success &= strategyselector_register(opaque, "dct_32x32", "avx2", 0, &dct_32x32_avx2); success &= strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 0, &fast_inverse_dst_4x4_avx2); success &= strategyselector_register(opaque, "idct_4x4", "avx2", 0, &idct_4x4_avx2); success &= strategyselector_register(opaque, "idct_8x8", "avx2", 0, &idct_8x8_avx2); success &= strategyselector_register(opaque, "idct_16x16", "avx2", 0, &idct_16x16_avx2); success &= strategyselector_register(opaque, "idct_32x32", "avx2", 0, &idct_32x32_avx2); #endif //COMPILE_INTEL_AVX2 return success; }