[avx2] Fix compilation errors

Joose Sainio 2023-07-26 15:20:33 +03:00
parent 13d4313e02
commit 1f9955bdda
2 changed files with 25 additions and 121 deletions
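
The recurring fix in the first file is a pointer-type correction: the coefficient tables are two-dimensional arrays, and a 2-D array name decays to a pointer to its first row (e.g. int16_t (*)[16]), not to const int16_t *, so the old assignments fail to compile under strict settings. Taking &table[0][0] yields a plain pointer to the first element. A minimal sketch of the error and the fix (the [16][16] shape is assumed for illustration):

#include <stdint.h>

static const int16_t table[16][16];   /* stands in for uvg_g_dct_16 */

const int16_t* get_coeffs(void)
{
    /* return table;        error: int16_t (*)[16] is not int16_t *  */
    return &table[0][0];    /* ok: address of the first element      */
}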

@@ -2198,9 +2198,9 @@ void fast_forward_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = log2_height_minus1 + 7;
   const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
-  const int16_t* ver_coeff = uvg_g_dct_16;
+  const int16_t* ver_coeff = &uvg_g_dct_16[0][0];
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_16;
+    ver_coeff = &uvg_g_dst7_16[0][0];
   }
   const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle);
   // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
@@ -2389,7 +2389,7 @@ void fast_forward_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = log2_height_minus1 + 7;
   const int16_t* hor_coeff = ff_dct2_2xN_coeff_hor;
-  const int16_t* ver_coeff = uvg_g_dct_32;
+  const int16_t* ver_coeff = &uvg_g_dct_32[0][0];
   // For result shuffling, can use existing shuffle vector
   const __m256i v_res_shuffle = _mm256_load_si256((const __m256i*)ff_dct2_2x16_ver_result_shuffle);
   // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
@@ -2562,7 +2562,7 @@ void fast_inverse_tr_2x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
-  const int16_t* ver_coeff = uvg_g_dct_32_t; // rename
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0]; // rename
   const int16_t* hor_coeff = fi_dct2_32x2_coeff_ver; // TODO: rename
   // No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
@@ -2986,16 +2986,16 @@ void fast_forward_tr_4x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = log2_height_minus1 + 7;
   const int16_t* hor_coeff = fast_forward_dct2_b4_coeff;
-  const int16_t* ver_coeff = uvg_g_dct_16;
+  const int16_t* ver_coeff = &uvg_g_dct_16[0][0];
   if (hor == DST7) {
     hor_coeff = fast_forward_dst7_b4_coeff;
   } else if (hor == DCT8) {
     hor_coeff = fast_forward_dct8_b4_coeff;
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_16;
+    ver_coeff = &uvg_g_dst7_16[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_16;
+    ver_coeff = &uvg_g_dct8_16[0][0];
   }
   __m256i v_hor_pass_out[4];
@@ -3415,7 +3415,7 @@ void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_32x4_coeff_ver; // TODO: rename
   if (hor == DST7) {
     hor_coeff = fi_dst7_32x4_coeff_ver; // TODO: rename
@@ -3423,9 +3423,9 @@ void fast_inverse_tr_4x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_32x4_coeff_ver; // TODO: rename
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
   __m256i v_ver_pass_out[8];
@@ -4587,7 +4587,7 @@ void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_32x8_coeff_ver; // TODO: rename table
   if (hor == DST7) {
     hor_coeff = fi_dst7_32x8_coeff_ver; // TODO: rename
@@ -4595,9 +4595,9 @@ void fast_inverse_tr_8x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_32x8_coeff_ver; // TODO: rename
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
   __m256i v_ver_pass_out[16];
@@ -5949,7 +5949,7 @@ void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_16x16_coeff_hor;
   if (hor == DST7) {
     hor_coeff = fi_dst7_16x32_coeff_hor; // TODO: coeffs
@@ -5957,9 +5957,9 @@ void fast_inverse_tr_16x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_16x32_coeff_hor;
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
   __m256i v_ver_pass_out[32];
@@ -6108,8 +6108,8 @@ static void fast_forward_DCT2_32x2_avx2_ver(const __m256i* src, int16_t* dst, in
   // Prepare coeffs
   // TODO: either rename these old coeff tables to be consistent with other new avx2 functions
   // or construct them here in place. Should be easy to accomplish with set1_epi32, just use an int32_t combined from two int16_t
-  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)fast_forward_dct2_b2_coeff[0]);
-  const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)fast_forward_dct2_b2_coeff[16]);
+  const __m256i v_coeff_0 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[0]);
+  const __m256i v_coeff_1 = _mm256_load_si256((const __m256i*)&fast_forward_dct2_b2_coeff[16]);
   // Got data for 4 vectors, 32 lines with 2 samples each
   __m256i v_result_e[4];
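
The TODO in the hunk above suggests constructing these coefficient vectors in place instead of loading them from tables. A sketch of that idea, not what the code currently does (the helper name is invented; little-endian lane layout assumed): pack the two 16-bit coefficients into one int32_t and broadcast it, so every 32-bit lane holds the pair that _mm256_madd_epi16-style instructions consume.

#include <stdint.h>
#include <immintrin.h>

/* Hypothetical helper: replicate the coefficient pair (a, b) across all
 * sixteen int16_t lanes of a 256-bit vector with one set1_epi32. */
static inline __m256i broadcast_coeff_pair(int16_t a, int16_t b)
{
    const int32_t packed = (int32_t)(((uint32_t)(uint16_t)b << 16) | (uint16_t)a);
    return _mm256_set1_epi32(packed);   /* lanes: a, b, a, b, ... */
}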
@@ -6147,7 +6147,7 @@ static void fast_forward_DCT2_32x4_avx2_ver(const __m256i* src, int16_t* dst, in
   // Got data for 8 vectors, 32 lines with 4 samples each
   // Prepare coeffs
-  const int16_t* coeff = uvg_g_dct_4;
+  const int16_t* coeff = &uvg_g_dct_4[0][0];
   const int a = coeff[0];
   const int b = coeff[1 * 4 + 0];
   const int c = coeff[1 * 4 + 1];
@@ -6891,11 +6891,11 @@ void fast_inverse_tr_32x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
   const int16_t* ver_coeff = fi_dct2_4x32_coeff_hor; // TODO: rename
-  const int16_t* hor_coeff = uvg_g_dct_32_t;
+  const int16_t* hor_coeff = &uvg_g_dct_32_t[0][0];
   if (hor == DST7) {
-    hor_coeff = uvg_g_dst7_32_t;
+    hor_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (hor == DCT8) {
-    hor_coeff = uvg_g_dct8_32;
+    hor_coeff = &uvg_g_dct8_32[0][0];
   }
   if (ver == DST7) {
     ver_coeff = fi_dst7_4x32_coeff_hor; // TODO: rename
@@ -8023,7 +8023,7 @@ void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   const int32_t shift_1st = INVERSE_SHIFT_1ST;
   const int32_t shift_2nd = INVERSE_SHIFT_2ND;
-  const int16_t* ver_coeff = uvg_g_dct_32_t;
+  const int16_t* ver_coeff = &uvg_g_dct_32_t[0][0];
   const int16_t* hor_coeff = fi_dct2_32xN_coeff_hor;
   if (hor == DST7) {
     hor_coeff = fi_dst7_32xN_coeff_hor;
@@ -8031,9 +8031,9 @@ void fast_inverse_tr_32x32_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
     hor_coeff = fi_dct8_32xN_coeff_hor;
   }
   if (ver == DST7) {
-    ver_coeff = uvg_g_dst7_32_t;
+    ver_coeff = &uvg_g_dst7_32_t[0][0];
   } else if (ver == DCT8) {
-    ver_coeff = uvg_g_dct8_32;
+    ver_coeff = &uvg_g_dct8_32[0][0];
   }
   __m256i v_ver_pass_out[64];

@@ -4830,101 +4830,5 @@ typedef int16_t TMatrixCoeff;
   { b, -d, f, -h, j, -l, n, -p, r, -t, v, -x, z, -B, D, -F, E, -C, A, -y, w, -u, s, -q, o, -m, k, -i, g, -e, c, -a,}, \
 }
-#define TRANSFORM_NUMBER_OF_DIRECTIONS 1
-#define ALIGN_DATA(nBytes,v) __declspec(align(nBytes)) v
-#define MEMORY_ALIGN_DEF_SIZE 32 // for use with avx2 (256 bit)
-//--------------------------------------------------------------------------------------------------
-// DCT-2
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P2[TRANSFORM_NUMBER_OF_DIRECTIONS][2][2]) =
-{
-  DEFINE_DCT2_P2_MATRIX(64),
-  //DEFINE_DCT2_P2_MATRIX(64)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
-{
-  DEFINE_DCT2_P4_MATRIX(64, 83, 36),
-  //DEFINE_DCT2_P4_MATRIX(64, 83, 36)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
-{
-  DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18),
-  //DEFINE_DCT2_P8_MATRIX(64, 83, 36, 89, 75, 50, 18)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
-{
-  DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9),
-  //DEFINE_DCT2_P16_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
-{
-  DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4),
-  //DEFINE_DCT2_P32_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT2P64[TRANSFORM_NUMBER_OF_DIRECTIONS][64][64]) =
-{
-  DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2),
-  //DEFINE_DCT2_P64_MATRIX(64, 83, 36, 89, 75, 50, 18, 90, 87, 80, 70, 57, 43, 25, 9, 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 91, 90, 90, 90, 88, 87, 86, 84, 83, 81, 79, 77, 73, 71, 69, 65, 62, 59, 56, 52, 48, 44, 41, 37, 33, 28, 24, 20, 15, 11, 7, 2)
-};
-// DCT-8
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
-{
-  DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29),
-  //DEFINE_DCT8_P4_MATRIX(84, 74, 55, 29)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
-{
-  DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17),
-  //DEFINE_DCT8_P8_MATRIX(86, 85, 78, 71, 60, 46, 32, 17)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
-{
-  DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8),
-  //DEFINE_DCT8_P16_MATRIX(88, 88, 87, 85, 81, 77, 73, 68, 62, 55, 48, 40, 33, 25, 17, 8)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDCT8P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
-{
-  DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4),
-  //DEFINE_DCT8_P32_MATRIX(90, 90, 89, 88, 87, 86, 85, 84, 82, 80, 78, 77, 74, 72, 68, 66, 63, 60, 56, 53, 50, 46, 42, 38, 34, 30, 26, 21, 17, 13, 9, 4)
-};
-// DST-7
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P4[TRANSFORM_NUMBER_OF_DIRECTIONS][4][4]) =
-{
-  DEFINE_DST7_P4_MATRIX(29, 55, 74, 84),
-  //DEFINE_DST7_P4_MATRIX(29, 55, 74, 84)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P8[TRANSFORM_NUMBER_OF_DIRECTIONS][8][8]) =
-{
-  DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86),
-  //DEFINE_DST7_P8_MATRIX(17, 32, 46, 60, 71, 78, 85, 86)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P16[TRANSFORM_NUMBER_OF_DIRECTIONS][16][16]) =
-{
-  DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88),
-  //DEFINE_DST7_P16_MATRIX(8, 17, 25, 33, 40, 48, 55, 62, 68, 73, 77, 81, 85, 87, 88, 88)
-};
-ALIGN_DATA(MEMORY_ALIGN_DEF_SIZE, const TMatrixCoeff g_trCoreDST7P32[TRANSFORM_NUMBER_OF_DIRECTIONS][32][32]) =
-{
-  DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90),
-  //DEFINE_DST7_P32_MATRIX(4, 9, 13, 17, 21, 26, 30, 34, 38, 42, 46, 50, 53, 56, 60, 63, 66, 68, 72, 74, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 90)
-};
-//--------------------------------------------------------------------------------------------------
-static const int16_t* vvenc_matrix_coeffs[3][6] = {
-  {g_trCoreDCT2P2[0][0], g_trCoreDCT2P4[0][0], g_trCoreDCT2P8[0][0], g_trCoreDCT2P16[0][0], g_trCoreDCT2P32[0][0], g_trCoreDCT2P64[0][0]},
-  {NULL, g_trCoreDCT8P4[0][0], g_trCoreDCT8P8[0][0], g_trCoreDCT8P16[0][0], g_trCoreDCT8P32[0][0], NULL},
-  {NULL, g_trCoreDST7P4[0][0], g_trCoreDST7P8[0][0], g_trCoreDST7P16[0][0], g_trCoreDST7P32[0][0], NULL},
-};
 //! \}
 #endif // DCT_AVX2_TABLES_H
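
For reference, the vvenc_matrix_coeffs table deleted above mapped a transform family and block size to a coefficient matrix, with NULL marking combinations that do not exist. A hedged usage sketch (the accessor name is invented; the initializer suggests the first index is 0 = DCT-2, 1 = DCT-8, 2 = DST-7 and the second is log2(size) - 1):

/* Hypothetical accessor for the removed table; index scheme assumed from
 * the initializer: family 0/1/2 = DCT-2/DCT-8/DST-7, column = log2(size) - 1. */
static const int16_t* get_vvenc_matrix(int family, int log2_size)
{
    return vvenc_matrix_coeffs[family][log2_size - 1]; /* NULL if unsupported */
}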