[avx2] Mostly working

This commit is contained in:
Joose Sainio 2023-07-26 14:05:04 +03:00
parent b78f9aff17
commit 13d4313e02
2 changed files with 156 additions and 6 deletions

View file

@ -2174,6 +2174,9 @@ void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
const int32_t shift_2nd = INVERSE_SHIFT_2ND;
const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename
if (ver == DST7) {
ver_coeff = fi_dst7_8x2_coeff_hor;
}
const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename
// hor_coeff stays DCT2: no DCT8/DST7 coefficients exist for the horizontal pass at this block size (ver_coeff may switch to DST7 above)
@ -2359,6 +2362,9 @@ void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
const int32_t shift_2nd = INVERSE_SHIFT_2ND;
const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename
if (ver == DST7) {
ver_coeff = fi_dst7_16x2_coeff_hor;
}
const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename
// hor_coeff stays DCT2: no DCT8/DST7 coefficients exist for the horizontal pass at this block size (ver_coeff may switch to DST7 above)
@ -3607,7 +3613,7 @@ static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const
_mm256_store_si256((__m256i*)dst, v_result);
}
void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type)
void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
{
const int width = 8;
const int height = 2;
@ -3620,6 +3626,9 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_ty
const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename
const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // rename
if (hor == DST7) {
hor_coeff = fi_dst7_2x8_coeff_ver;
}
// Vertical pass uses the DCT2 table; horizontal pass uses DCT2 unless hor == DST7 (no DCT8 table is selected here)
__m256i v_ver_pass_out;
@ -4810,6 +4819,9 @@ void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename
const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename
if (hor == DST7) {
hor_coeff = fi_dst7_2x16_coeff_ver;
}
// Vertical pass uses the DCT2 table; horizontal pass uses DCT2 unless hor == DST7 (no DCT8 table is selected here)
__m256i v_ver_pass_out[2];
@ -5455,7 +5467,7 @@ static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, con
// dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]);
//}
for (int j = 0; j < 16; ++j) {
for (int j = 0; j < line; ++j) {
__m256i res_0 = _mm256_setzero_si256();
__m256i res_1 = _mm256_setzero_si256();
@ -7219,7 +7231,7 @@ void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
}
__m256i v_ver_pass_out[16];
if(ver == DCT2) {
if(ver == DCT2 || hor == DCT2) {
fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
}
else {
@ -8128,11 +8140,28 @@ static void mts_idct_avx2(
else {
const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
if (height == 1) {
if (width == 16) {
fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0);
_mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0)));
} else if (width == 32) {
fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0);
}
}
else if (width == 1){
if (height == 16) {
fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0);
_mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0)));
} else if (height == 32) {
fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0);
}
}
else {
dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1];
idct_func(input, output, type_hor, type_ver);
}
}
}
#endif // UVG_BIT_DEPTH == 8
#endif //COMPILE_INTEL_AVX2

View file

@ -818,6 +818,41 @@ ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = {
83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9,
};
// DST7 coefficient table that fast_inverse_tr_16x2_avx2 selects as hor_coeff
// when hor == DST7 (the DCT2 counterpart is fi_dct2_2x16_coeff_ver).
// Layout: 512 int16_t values written as 32 rows of 16. Each row is made of two
// 4-value groups, each group stored twice back to back:
//   a b c d  a b c d  e f g h  e f g h
// NOTE(review): the values look like fi_dst7_16x2_coeff_hor regrouped into
// 4-value chunks (spot-checked on a few rows only) - confirm before relying on it.
ALIGNED(32) const int16_t fi_dst7_2x16_coeff_ver[512] = {
8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0
68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17,
17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85,
88, 77, 55, 25, 88, 77, 55, 25, -88, -81, -62, -33, -88, -81, -62, -33,
25, 68, 88, 81, 25, 68, 88, 81, -88, -68, -25, 25, -88, -68, -25, 25,
48, 0, -48, -81, 48, 0, -48, -81, 68, 88, 81, 48, 68, 88, 81, 48,
33, 81, 85, 40, 33, 81, 85, 40, 17, 73, 88, 55, 17, 73, 88, 55,
-25, -77, -87, -48, -25, -77, -87, -48, -8, -68, -88, -62, -8, -68, -88, -62,
40, 88, 62, -17, 40, 88, 62, -17, 87, 33, -48, -88, 87, 33, -48, -88, // 8
-81, -77, -8, 68, -81, -77, -8, 68, -55, 25, 85, 73, -55, 25, 85, 73,
48, 88, 25, -68, 48, 88, 25, -68, -25, -88, -48, 48, -25, -88, -48, 48,
-81, 0, 81, 68, -81, 0, 81, 68, 88, 25, -68, -81, 88, 25, -68, -81,
55, 81, -17, -88, 55, 81, -17, -88, -85, 8, 88, 33, -85, 8, 88, 33,
-25, 77, 62, -48, -25, 77, 62, -48, -73, -68, 40, 87, -73, -68, 40, 87,
62, 68, -55, -73, 62, 68, -55, -73, 33, 85, -25, -87, 33, 85, -25, -87,
48, 77, -40, -81, 48, 77, -40, -81, 17, 88, -8, -88, 17, 88, -8, -88,
68, 48, -81, -25, 68, 48, -81, -25, 81, -48, -68, 68, 81, -48, -68, 68, // 16
88, 0, -88, 25, 88, 0, -88, 25, 48, -81, -25, 88, 48, -81, -25, 88,
73, 25, -88, 33, 73, 25, -88, 33, -40, -62, 81, 8, -40, -62, 81, 8,
68, -77, -17, 88, 68, -77, -17, 88, -87, 48, 55, -85, -87, 48, 55, -85,
77, 0, -77, 77, 77, 0, -77, 77, -77, 77, 0, -77, -77, 77, 0, -77,
0, -77, 77, 0, 0, -77, 77, 0, 77, 0, -77, 77, 77, 0, -77, 77,
81, -25, -48, 88, 81, -25, -48, 88, 48, 25, -81, 81, 48, 25, -81, 81,
-68, 0, 68, -88, -68, 0, 68, -88, -25, -48, 88, -68, -25, -48, 88, -68,
85, -48, -8, 62, 85, -48, -8, 62, 73, -88, 68, -17, 73, -88, 68, -17, // 24
-88, 77, -33, -25, -88, 77, -33, -25, -40, 81, -87, 55, -40, 81, -87, 55,
87, -68, 33, 8, 87, -68, 33, 8, -55, 17, 25, -62, -55, 17, 25, -62,
-48, 77, -88, 81, -48, 77, -88, 81, 85, -88, 73, -40, 85, -88, 73, -40,
88, -81, 68, -48, 88, -81, 68, -48, -68, 81, -88, 88, -68, 81, -88, 88,
25, 0, -25, 48, 25, 0, -25, 48, -81, 68, -48, 25, -81, 68, -48, 25,
88, -88, 87, -85, 88, -88, 87, -85, 62, -55, 48, -40, 62, -55, 48, -40,
81, -77, 73, -68, 81, -77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8,
};
ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = {
64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0
83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67,
@ -1346,6 +1381,17 @@ ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = {
64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18,
};
// DST7 coefficient table that fast_inverse_tr_2x8_avx2 selects as ver_coeff
// when ver == DST7 (the DCT2 counterpart is fi_dct2_8x2_coeff_hor).
// Layout: 128 int16_t values written as 8 rows of 16. Each row holds one
// 8-value sequence stored twice, so the first and second halves of every row
// are identical.
ALIGNED(32) const int16_t fi_dst7_8x2_coeff_hor[128] = {
17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32,
32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60,
46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78,
60, 71, -46, -78, 32, 85, -17, -86, 60, 71, -46, -78, 32, 85, -17, -86,
71, 32, -86, 17, 78, -60, -46, 85, 71, 32, -86, 17, 78, -60, -46, 85,
78, -17, -60, 86, -46, -32, 85, -71, 78, -17, -60, 86, -46, -32, 85, -71,
85, -60, 17, 32, -71, 86, -78, 46, 85, -60, 17, 32, -71, 86, -78, 46,
86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17,
};
const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
@ -2381,6 +2427,43 @@ ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = {
const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
// DST7 coefficient table that fast_inverse_tr_2x16_avx2 selects as ver_coeff
// when ver == DST7 (the DCT2 counterpart is fi_dct2_16x2_coeff_hor).
// Layout: 512 int16_t values written as 32 rows of 16. Each row holds one
// 8-value sequence stored twice, so the first and second halves of every row
// are identical.
ALIGNED(32) const int16_t fi_dst7_16x2_coeff_hor[512] = {
8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0
88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17,
17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25,
-8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33,
25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81,
-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48,
33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48,
17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62,
40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8
87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73,
48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68,
-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81,
55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48,
-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87,
62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81,
33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88,
68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16
81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88,
73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88,
-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85,
77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0,
-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77,
81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88,
48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68,
85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24
73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55,
87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81,
-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40,
88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48,
-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25,
88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68,
62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8,
};
ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = {
90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0
87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87,
@ -2881,6 +2964,44 @@ ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = {
-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8,
};
// DCT2 coefficient table for 16-sample one-dimensional blocks.
// mts_idct_avx2 passes it to fast_forward_DCT2_B16_avx2_hor for both
// width == 16 / height == 1 (when type_hor == DCT2) and
// height == 16 / width == 1 (when type_ver == DCT2).
// Layout: 256 int16_t values written as 16 rows of 16; unlike the Nx2 tables,
// rows are not duplicated halves.
// NOTE(review): the existing "// 8" marker sits on the fifth row, so the
// trailing index markers here do not count 16-value rows - confirm their unit.
ALIGNED(32) const int16_t fi_dct2_16x1_coeff_hor[256] = {
64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0
89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25,
83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43,
75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57,
64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8
50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80,
36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87,
18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90,
64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90,
-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87,
83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80,
-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70,
64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57,
-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43,
36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25,
-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9,
};
// DST7 coefficient table for 16-sample one-dimensional blocks.
// mts_idct_avx2 passes it to fast_forward_DCT2_B16_avx2_hor for
// width == 16 / height == 1 and height == 16 / width == 1 whenever the
// relevant transform type is anything other than DCT2 (the selection is a
// plain "== DCT2 ? dct2 : dst7" ternary, so every non-DCT2 type lands here).
// Layout: 256 int16_t values written as 16 rows of 16, matching the shape of
// fi_dct2_16x1_coeff_hor.
ALIGNED(32) const int16_t fi_dst7_16x1_coeff_hor[256] = {
8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0
40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73,
68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77,
85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81,
88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8
81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87,
62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88,
33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88,
68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88,
-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85,
88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77,
-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68,
81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55,
-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40,
48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25,
-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8,
};
// Alias: fi_dct8_16x16_coeff_hor points at the existing ff_dct8_16x16_coeff_ver
// table instead of carrying its own copy of the coefficients.
// NOTE(review): ALIGNED(32) on a pointer declaration presumably aligns the
// pointer object, not the table it points at - confirm against the macro.
ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver;