diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c
index d82d6415..566efba3 100644
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@@ -2198,9 +2198,9 @@ void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = log2_height_minus1 + 7;
 
   const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
-  const int16_t* ver_coeff = uvg_g_dct_16;
+  const int16_t* ver_coeff = &uvg_g_dct_16[0][0];
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_16;
+    ver_coeff = &uvg_g_dst7_16[0][0];
   }
   const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle);
   // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
@@ -2389,7 +2389,7 @@ void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = log2_height_minus1 + 7;
 
   const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
-  const int16_t* ver_coeff = uvg_g_dct_32;
+  const int16_t* ver_coeff = &uvg_g_dct_32[0][0];
   // For result shuffling, can use existing shuffle vector
   const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle);
   // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
@@ -2562,7 +2562,7 @@ void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
 
-  const int16_t* ver_coeff = uvg_g_dct_32_t; // rename
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; // rename
   const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename
 
   // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
@@ -2986,16 +2986,16 @@ void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = log2_height_minus1 + 7;
 
   const int16_t* hor_coeff = fast_forward_dct2_b4_coeff;
-  const int16_t* ver_coeff = uvg_g_dct_16;
+  const int16_t* ver_coeff = &uvg_g_dct_16[0][0];
   if (hor == DST7) {
     hor_coeff = fast_forward_dst7_b4_coeff;
   } else if (hor == DCT8) {
     hor_coeff = fast_forward_dct8_b4_coeff;
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_16;
+    ver_coeff = &uvg_g_dst7_16[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_16;
+    ver_coeff = &uvg_g_dct8_16[0][0];
   }
 
   __m256i v_hor_pass_out[4];
@@ -3415,7 +3415,7 @@ void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
 
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename
   if (hor == DST7) {
     hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename
@@ -3423,9 +3423,9 @@ void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
 
   __m256i v_ver_pass_out[8];
@@ -4587,7 +4587,7 @@ void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
 
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table
   if (hor == DST7) {
     hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename
@@ -4595,9 +4595,9 @@ void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
 
   __m256i v_ver_pass_out[16];
@@ -5949,7 +5949,7 @@ void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
 
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor;
   if (hor == DST7) {
     hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs
@@ -5957,9 +5957,9 @@ void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_16x32_coeff_hor;
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
 
   __m256i v_ver_pass_out[32];
@@ -6108,8 +6108,8 @@ static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, in
   // Prepare coeffs
   // TODO: either rename these old coeff tables to be consistent with other new avx2 functions
   // or construct them here in place. Should be ease to accomplish with set1_epi32, just use a int32_t combined from two int16_t
-  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)fast_forward_dct2_b2_coeff[0]);
-  const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)fast_forward_dct2_b2_coeff[16]);
+  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[0]);
+  const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[16]);
 
   // Got data for 4 vectors, 32 lines with 2 samples each
   __m256i v_result_e[4];
@@ -6147,7 +6147,7 @@ static void fast_forward_DCT2_32x4_avx2_ver(const __m256i* src, int16_t* dst, in
   // Got data for 8 vectors, 32 lines with 4 samples each
 
   // Prepare coeffs
-  const int16_t* coeff = uvg_g_dct_4;
+  const int16_t* coeff = &uvg_g_dct_4[0][0];
   const int a = coeff[0];
   const int b = coeff[1 * 4 + 0];
   const int c = coeff[1 * 4 + 1];
@@ -6891,11 +6891,11 @@ void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
 
   const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename
-  const int16_t* hor_coeff = uvg_g_dct_32_t;
+  const int16_t* hor_coeff = &uvg_g_dct_32_t[0][0];
   if (hor == DST7) {
-    hor_coeff = uvg_g_dst7_32_t;
+    hor_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (hor == DCT8) {
-    hor_coeff = uvg_g_dct8_32;
+    hor_coeff = &uvg_g_dct8_32[0][0];
   }
   if (ver == DST7) {
     ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename
@@ -8023,7 +8023,7 @@ void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
 
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor;
   if (hor == DST7) {
     hor_coeff = fi_dst7_32xN_coeff_hor;
@@ -8031,9 +8031,9 @@ void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_32xN_coeff_hor;
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
 
   __m256i v_ver_pass_out[64];
diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h
index 946ab6b8..47900966 100644
--- a/src/strategies/avx2/dct_avx2_tables.h
+++ b/src/strategies/avx2/dct_avx2_tables.h
@@ -4830,101 +4830,5 @@ typedef int16_t TMatrixCoeff;
   { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \
 }
 
-#define TRANSFORM_NUMBER_OF_DIRECTIONS 1
-#define ALIGN_DATA(nBytes,v) __declspec(align(nBytes)) v
-#define MEMORY_ALIGN_DEF_SIZE 32 // for use with avx2 (256 bit)
-//--------------------------------------------------------------------------------------------------
-// DCT-2
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2]) =
-{
-  DEFINE_DCT2_P2_MATRIX(64),
-  //DEFINE_DCT2_P2_MATRIX(64)
-};
-
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
-{
-  DEFINE_DCT2_P4_MATRIX(64, 83, 36),
-  //DEFINE_DCT2_P4_MATRIX(64, 83, 36)
-};
-
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
-{
-  DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18),
-  //DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18)
-};
-
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
-{
-  DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9),
-  //DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9)
-};
-
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
-{
-  DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4),
-  //DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4)
-};
-
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P64[TRANSFORM_NUMBER_OF_DIRECTIONS][64][64]) =
-{
-  DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2),
-  //DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2)
-};
-
-// DCT-8
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
-{
-  DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29),
-  //DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
-{
-  DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17),
-  //DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
-{
-  DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8),
-  //DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
-{
-  DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4),
-  //DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4)
-};
-
-// DST-7
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
-{
-  DEFINE_DST7_P4_MATRIX(29, 55, 74, 84),
-  //DEFINE_DST7_P4_MATRIX(29, 55, 74, 84)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
-{
-  DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86),
-  //DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
-{
-  DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88),
-  //DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
-{
-  DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90),
-  //DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90)
-};
-
-//--------------------------------------------------------------------------------------------------
-
-static const int16_t* vvenc_matrix_coeffs[3][6] = {
-  {g_trCoreDCT2P2[0][0], g_trCoreDCT2P4[0][0], g_trCoreDCT2P8[0][0], g_trCoreDCT2P16[0][0], g_trCoreDCT2P32[0][0], g_trCoreDCT2P64[0][0]},
-  {NULL, g_trCoreDCT8P4[0][0], g_trCoreDCT8P8[0][0], g_trCoreDCT8P16[0][0], g_trCoreDCT8P32[0][0], NULL},
-  {NULL, g_trCoreDST7P4[0][0], g_trCoreDST7P8[0][0], g_trCoreDST7P16[0][0], g_trCoreDST7P32[0][0], NULL},
-};
-
-//! \}
-
-
 
 #endif DCT_AVX2_TABLES_H
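Note on the recurring coefficient-pointer change above: tables such as uvg_g_dct_16, uvg_g_dst7_32_t and uvg_g_dct8_32 are two-dimensional int16_t arrays, so the bare array name decays to int16_t (*)[N] rather than to const int16_t*, and assigning it directly requires a cast or triggers an incompatible-pointer-type diagnostic. Taking &table[0][0] yields the flat element pointer the AVX2 kernels expect. The minimal sketch below illustrates the idea only; it is not part of the patch, and the table name example_coeff_16 is an illustrative stand-in for the real uvg266 tables.

/* Sketch (assumed stand-in table, not uvg266 code): why &table[0][0] is used
 * instead of the bare 2-D array name when initializing a flat int16_t pointer. */
#include <stdint.h>
#include <stdio.h>

static const int16_t example_coeff_16[16][16] = { { 64 } }; /* hypothetical table */

int main(void)
{
  /* A 2-D array decays to int16_t (*)[16], not to const int16_t*.
   * Taking the address of the first element gives the flat pointer
   * without any incompatible-pointer-type warning or cast. */
  const int16_t* coeff = &example_coeff_16[0][0];

  /* Rows are contiguous, so flat indexing row * 16 + col still walks
   * the table the same way the transform kernels do. */
  printf("%d\n", coeff[0 * 16 + 0]);
  return 0;
}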