mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-23 18:14:06 +00:00
[avx2] Mostly working
This commit is contained in:
parent
b78f9aff17
commit
13d4313e02
|
@ -2174,6 +2174,9 @@ void fast_inverse_tr_2x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
|
|||
const int32_t shift_2nd = INVERSE_SHIFT_2ND;
|
||||
|
||||
const int16_t* ver_coeff = fi_dct2_8x2_coeff_hor; // TODO: rename
|
||||
if (ver == DST7) {
|
||||
ver_coeff = fi_dst7_8x2_coeff_hor;
|
||||
}
|
||||
const int16_t* hor_coeff = fi_dct2_8x2_coeff_ver; // rename
|
||||
// No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
|
||||
|
||||
|
@ -2359,6 +2362,9 @@ void fast_inverse_tr_2x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
|
|||
const int32_t shift_2nd = INVERSE_SHIFT_2ND;
|
||||
|
||||
const int16_t* ver_coeff = fi_dct2_16x2_coeff_hor; // TODO: rename
|
||||
if (ver == DST7) {
|
||||
ver_coeff = fi_dst7_16x2_coeff_hor;
|
||||
}
|
||||
const int16_t* hor_coeff = fi_dct2_16x2_coeff_ver; // rename
|
||||
// No coeffs for DCT8 and DST7 transforms since they do not exist for this block size
|
||||
|
||||
|
@ -3607,7 +3613,7 @@ static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const
|
|||
_mm256_store_si256((__m256i*)dst, v_result);
|
||||
}
|
||||
|
||||
void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type)
|
||||
void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
|
||||
{
|
||||
const int width = 8;
|
||||
const int height = 2;
|
||||
|
@ -3620,6 +3626,9 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_ty
|
|||
|
||||
const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename
|
||||
const int16_t* hor_coeff = fi_dct2_2x8_coeff_ver; // rename
|
||||
if (hor == DST7) {
|
||||
hor_coeff = fi_dst7_2x8_coeff_ver;
|
||||
}
|
||||
// ver_coeff is always the DCT2 table here; hor_coeff is DCT2 unless hor == DST7 (selected above)
|
||||
|
||||
__m256i v_ver_pass_out;
|
||||
|
@ -4810,6 +4819,9 @@ void fast_inverse_tr_16x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
|
|||
|
||||
const int16_t* ver_coeff = ff_dct2_2xN_coeff_hor; // TODO: rename
|
||||
const int16_t* hor_coeff = fi_dct2_2x16_coeff_ver; // rename
|
||||
if (hor == DST7) {
|
||||
hor_coeff = fi_dst7_2x16_coeff_ver;
|
||||
}
|
||||
// DCT8 is not defined for this block size; DST7 is only applied to hor_coeff (selected above)
|
||||
|
||||
__m256i v_ver_pass_out[2];
|
||||
|
@ -5455,7 +5467,7 @@ static void fast_inverse_tr_16x16_avx2_hor(const int16_t* src, __m256i* dst, con
|
|||
// dst[i] = _mm256_packs_epi32(v_trunc_lo[i], v_trunc_hi[i]);
|
||||
//}
|
||||
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
for (int j = 0; j < line; ++j) {
|
||||
__m256i res_0 = _mm256_setzero_si256();
|
||||
__m256i res_1 = _mm256_setzero_si256();
|
||||
|
||||
|
@ -7219,7 +7231,7 @@ void fast_inverse_tr_32x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
|
|||
}
|
||||
|
||||
__m256i v_ver_pass_out[16];
|
||||
if(ver == DCT2) {
|
||||
if(ver == DCT2 || hor == DCT2) {
|
||||
fast_inverse_tr_32x8_avx2_ver(src, v_ver_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
|
||||
}
|
||||
else {
|
||||
|
@ -8128,10 +8140,27 @@ static void mts_idct_avx2(
|
|||
else {
|
||||
const int log2_width_minus1 = uvg_g_convert_to_log2[width] - 1;
|
||||
const int log2_height_minus1 = uvg_g_convert_to_log2[height] - 1;
|
||||
|
||||
if (height == 1) {
|
||||
if (width == 16) {
|
||||
fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_hor == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0);
|
||||
_mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0)));
|
||||
} else if (width == 32) {
|
||||
fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0);
|
||||
}
|
||||
}
|
||||
else if (width == 1){
|
||||
if (height == 16) {
|
||||
fast_forward_DCT2_B16_avx2_hor(input, (__m256i*)output, type_ver == DCT2 ? fi_dct2_16x1_coeff_hor : fi_dst7_16x1_coeff_hor, 13, 1, 0, 0);
|
||||
_mm256_store_si256((__m256i*)output, _mm256_permute4x64_epi64(_mm256_load_si256((__m256i*)output), _MM_SHUFFLE(3, 1, 2, 0)));
|
||||
} else if (height == 32) {
|
||||
fast_forward_DCT2_B32_avx2_hor(input, (__m256i*)output, fi_dct2_32xN_coeff_hor, 13, 1, 0, 0);
|
||||
}
|
||||
}
|
||||
else {
|
||||
dct_full_pass* idct_func = idct_function_table[log2_width_minus1][log2_height_minus1];
|
||||
idct_func(input, output, type_hor, type_ver);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // UVG_BIT_DEPTH == 8
|
||||
|
|
|
@ -818,6 +818,41 @@ ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = {
|
|||
83, -80, 75, -70, 83, -80, 75, -70, 36, -25, 18, -9, 36, -25, 18, -9,
|
||||
};
|
||||
|
||||
// DST7 coefficient table selected as hor_coeff by fast_inverse_tr_16x2_avx2
// when hor == DST7 (the DCT2 counterpart is fi_dct2_2x16_coeff_ver).
// Layout: 32 rows of 16 int16_t values (512 total). In every row the first
// four values are repeated in positions 4..7 and the values in positions
// 8..11 are repeated in positions 12..15.
// The trailing "// N" markers give the row index of that row.
// NOTE(review): the first and last row pairs hold the same values as
// fi_dst7_16x2_coeff_hor, regrouped in chunks of four; presumably this holds
// for the whole table - confirm before relying on it.
ALIGNED(32) const int16_t fi_dst7_2x16_coeff_ver[512] = {
|
||||
8, 25, 40, 55, 8, 25, 40, 55, 88, 87, 81, 73, 88, 87, 81, 73, // 0
|
||||
68, 77, 85, 88, 68, 77, 85, 88, 62, 48, 33, 17, 62, 48, 33, 17,
|
||||
17, 48, 73, 87, 17, 48, 73, 87, -8, -40, -68, -85, -8, -40, -68, -85,
|
||||
88, 77, 55, 25, 88, 77, 55, 25, -88, -81, -62, -33, -88, -81, -62, -33,
|
||||
25, 68, 88, 81, 25, 68, 88, 81, -88, -68, -25, 25, -88, -68, -25, 25,
|
||||
48, 0, -48, -81, 48, 0, -48, -81, 68, 88, 81, 48, 68, 88, 81, 48,
|
||||
33, 81, 85, 40, 33, 81, 85, 40, 17, 73, 88, 55, 17, 73, 88, 55,
|
||||
-25, -77, -87, -48, -25, -77, -87, -48, -8, -68, -88, -62, -8, -68, -88, -62,
|
||||
40, 88, 62, -17, 40, 88, 62, -17, 87, 33, -48, -88, 87, 33, -48, -88, // 8
|
||||
-81, -77, -8, 68, -81, -77, -8, 68, -55, 25, 85, 73, -55, 25, 85, 73,
|
||||
48, 88, 25, -68, 48, 88, 25, -68, -25, -88, -48, 48, -25, -88, -48, 48,
|
||||
-81, 0, 81, 68, -81, 0, 81, 68, 88, 25, -68, -81, 88, 25, -68, -81,
|
||||
55, 81, -17, -88, 55, 81, -17, -88, -85, 8, 88, 33, -85, 8, 88, 33,
|
||||
-25, 77, 62, -48, -25, 77, 62, -48, -73, -68, 40, 87, -73, -68, 40, 87,
|
||||
62, 68, -55, -73, 62, 68, -55, -73, 33, 85, -25, -87, 33, 85, -25, -87,
|
||||
48, 77, -40, -81, 48, 77, -40, -81, 17, 88, -8, -88, 17, 88, -8, -88,
|
||||
68, 48, -81, -25, 68, 48, -81, -25, 81, -48, -68, 68, 81, -48, -68, 68, // 16
|
||||
88, 0, -88, 25, 88, 0, -88, 25, 48, -81, -25, 88, 48, -81, -25, 88,
|
||||
73, 25, -88, 33, 73, 25, -88, 33, -40, -62, 81, 8, -40, -62, 81, 8,
|
||||
68, -77, -17, 88, 68, -77, -17, 88, -87, 48, 55, -85, -87, 48, 55, -85,
|
||||
77, 0, -77, 77, 77, 0, -77, 77, -77, 77, 0, -77, -77, 77, 0, -77,
|
||||
0, -77, 77, 0, 0, -77, 77, 0, 77, 0, -77, 77, 77, 0, -77, 77,
|
||||
81, -25, -48, 88, 81, -25, -48, 88, 48, 25, -81, 81, 48, 25, -81, 81,
|
||||
-68, 0, 68, -88, -68, 0, 68, -88, -25, -48, 88, -68, -25, -48, 88, -68,
|
||||
85, -48, -8, 62, 85, -48, -8, 62, 73, -88, 68, -17, 73, -88, 68, -17, // 24
|
||||
-88, 77, -33, -25, -88, 77, -33, -25, -40, 81, -87, 55, -40, 81, -87, 55,
|
||||
87, -68, 33, 8, 87, -68, 33, 8, -55, 17, 25, -62, -55, 17, 25, -62,
|
||||
-48, 77, -88, 81, -48, 77, -88, 81, 85, -88, 73, -40, 85, -88, 73, -40,
|
||||
88, -81, 68, -48, 88, -81, 68, -48, -68, 81, -88, 88, -68, 81, -88, 88,
|
||||
25, 0, -25, 48, 25, 0, -25, 48, -81, 68, -48, 25, -81, 68, -48, 25,
|
||||
88, -88, 87, -85, 88, -88, 87, -85, 62, -55, 48, -40, 62, -55, 48, -40,
|
||||
81, -77, 73, -68, 81, -77, 73, -68, 33, -25, 17, -8, 33, -25, 17, -8,
|
||||
};
|
||||
|
||||
ALIGNED(32) const int16_t fi_dct2_2x32_coeff_ver[2048] = {
|
||||
64, 90, 90, 90, 89, 88, 87, 85, 64, 90, 90, 90, 89, 88, 87, 85, // 0
|
||||
83, 82, 80, 78, 75, 73, 70, 67, 83, 82, 80, 78, 75, 73, 70, 67,
|
||||
|
@ -1346,6 +1381,17 @@ ALIGNED(32) const int16_t fi_dct2_8x2_coeff_hor[128] = {
|
|||
64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18,
|
||||
};
|
||||
|
||||
// DST7 coefficient table selected as ver_coeff by fast_inverse_tr_2x8_avx2
// when ver == DST7 (the DCT2 counterpart is fi_dct2_8x2_coeff_hor).
// Layout: 8 rows of 16 int16_t values (128 total); each row is one set of
// eight coefficients written twice back to back.
// NOTE(review): the duplication presumably lets one 256-bit load feed both
// 128-bit lanes with the same coefficients - confirm against the kernel.
ALIGNED(32) const int16_t fi_dst7_8x2_coeff_hor[128] = {
|
||||
17, 46, 71, 85, 86, 78, 60, 32, 17, 46, 71, 85, 86, 78, 60, 32,
|
||||
32, 78, 85, 46, -17, -71, -86, -60, 32, 78, 85, 46, -17, -71, -86, -60,
|
||||
46, 86, 32, -60, -85, -17, 71, 78, 46, 86, 32, -60, -85, -17, 71, 78,
|
||||
60, 71, -46, -78, 32, 85, -17, -86, 60, 71, -46, -78, 32, 85, -17, -86,
|
||||
71, 32, -86, 17, 78, -60, -46, 85, 71, 32, -86, 17, 78, -60, -46, 85,
|
||||
78, -17, -60, 86, -46, -32, 85, -71, 78, -17, -60, 86, -46, -32, 85, -71,
|
||||
85, -60, 17, 32, -71, 86, -78, 46, 85, -60, 17, 32, -71, 86, -78, 46,
|
||||
86, -85, 78, -71, 60, -46, 32, -17, 86, -85, 78, -71, 60, -46, 32, -17,
|
||||
};
|
||||
|
||||
// Alias rather than a separate table: points at ff_dct2_2xN_coeff_hor.
// fast_inverse_tr_2x8_avx2 uses it as the initial value of hor_coeff.
const int16_t* fi_dct2_8x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
|
||||
|
||||
|
||||
|
@ -2381,6 +2427,43 @@ ALIGNED(32) const int16_t fi_dct2_16x2_coeff_hor[512] = {
|
|||
// Alias rather than a separate table: points at ff_dct2_2xN_coeff_hor.
// fast_inverse_tr_2x16_avx2 uses it as the initial value of hor_coeff.
const int16_t* fi_dct2_16x2_coeff_ver = ff_dct2_2xN_coeff_hor; // This is identical to existing table
|
||||
|
||||
|
||||
|
||||
// DST7 coefficient table selected as ver_coeff by fast_inverse_tr_2x16_avx2
// when ver == DST7 (the DCT2 counterpart is fi_dct2_16x2_coeff_hor).
// Layout: 32 rows of 16 int16_t values (512 total); each row is one set of
// eight coefficients written twice back to back.
// The trailing "// N" markers give the row index of that row.
ALIGNED(32) const int16_t fi_dst7_16x2_coeff_hor[512] = {
|
||||
8, 25, 40, 55, 68, 77, 85, 88, 8, 25, 40, 55, 68, 77, 85, 88, // 0
|
||||
88, 87, 81, 73, 62, 48, 33, 17, 88, 87, 81, 73, 62, 48, 33, 17,
|
||||
17, 48, 73, 87, 88, 77, 55, 25, 17, 48, 73, 87, 88, 77, 55, 25,
|
||||
-8, -40, -68, -85, -88, -81, -62, -33, -8, -40, -68, -85, -88, -81, -62, -33,
|
||||
25, 68, 88, 81, 48, 0, -48, -81, 25, 68, 88, 81, 48, 0, -48, -81,
|
||||
-88, -68, -25, 25, 68, 88, 81, 48, -88, -68, -25, 25, 68, 88, 81, 48,
|
||||
33, 81, 85, 40, -25, -77, -87, -48, 33, 81, 85, 40, -25, -77, -87, -48,
|
||||
17, 73, 88, 55, -8, -68, -88, -62, 17, 73, 88, 55, -8, -68, -88, -62,
|
||||
40, 88, 62, -17, -81, -77, -8, 68, 40, 88, 62, -17, -81, -77, -8, 68, // 8
|
||||
87, 33, -48, -88, -55, 25, 85, 73, 87, 33, -48, -88, -55, 25, 85, 73,
|
||||
48, 88, 25, -68, -81, 0, 81, 68, 48, 88, 25, -68, -81, 0, 81, 68,
|
||||
-25, -88, -48, 48, 88, 25, -68, -81, -25, -88, -48, 48, 88, 25, -68, -81,
|
||||
55, 81, -17, -88, -25, 77, 62, -48, 55, 81, -17, -88, -25, 77, 62, -48,
|
||||
-85, 8, 88, 33, -73, -68, 40, 87, -85, 8, 88, 33, -73, -68, 40, 87,
|
||||
62, 68, -55, -73, 48, 77, -40, -81, 62, 68, -55, -73, 48, 77, -40, -81,
|
||||
33, 85, -25, -87, 17, 88, -8, -88, 33, 85, -25, -87, 17, 88, -8, -88,
|
||||
68, 48, -81, -25, 88, 0, -88, 25, 68, 48, -81, -25, 88, 0, -88, 25, // 16
|
||||
81, -48, -68, 68, 48, -81, -25, 88, 81, -48, -68, 68, 48, -81, -25, 88,
|
||||
73, 25, -88, 33, 68, -77, -17, 88, 73, 25, -88, 33, 68, -77, -17, 88,
|
||||
-40, -62, 81, 8, -87, 48, 55, -85, -40, -62, 81, 8, -87, 48, 55, -85,
|
||||
77, 0, -77, 77, 0, -77, 77, 0, 77, 0, -77, 77, 0, -77, 77, 0,
|
||||
-77, 77, 0, -77, 77, 0, -77, 77, -77, 77, 0, -77, 77, 0, -77, 77,
|
||||
81, -25, -48, 88, -68, 0, 68, -88, 81, -25, -48, 88, -68, 0, 68, -88,
|
||||
48, 25, -81, 81, -25, -48, 88, -68, 48, 25, -81, 81, -25, -48, 88, -68,
|
||||
85, -48, -8, 62, -88, 77, -33, -25, 85, -48, -8, 62, -88, 77, -33, -25, // 24
|
||||
73, -88, 68, -17, -40, 81, -87, 55, 73, -88, 68, -17, -40, 81, -87, 55,
|
||||
87, -68, 33, 8, -48, 77, -88, 81, 87, -68, 33, 8, -48, 77, -88, 81,
|
||||
-55, 17, 25, -62, 85, -88, 73, -40, -55, 17, 25, -62, 85, -88, 73, -40,
|
||||
88, -81, 68, -48, 25, 0, -25, 48, 88, -81, 68, -48, 25, 0, -25, 48,
|
||||
-68, 81, -88, 88, -81, 68, -48, 25, -68, 81, -88, 88, -81, 68, -48, 25,
|
||||
88, -88, 87, -85, 81, -77, 73, -68, 88, -88, 87, -85, 81, -77, 73, -68,
|
||||
62, -55, 48, -40, 33, -25, 17, -8, 62, -55, 48, -40, 33, -25, 17, -8,
|
||||
};
|
||||
|
||||
|
||||
ALIGNED(32) const int16_t ff_dct2_16x8_butterfly_o_row_coeff_hor[1024] = {
|
||||
90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, 90, -90, // 0
|
||||
87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87, 87, -87,
|
||||
|
@ -2881,6 +2964,44 @@ ALIGNED(32) const int16_t fi_dst7_16x16_coeff_hor[256] = {
|
|||
-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8,
|
||||
};
|
||||
|
||||
// DCT2 coefficient table for the one-dimensional 16-sample inverse case.
// mts_idct_avx2 passes it to fast_forward_DCT2_B16_avx2_hor when
// (height == 1 && width == 16 && type_hor == DCT2) or
// (width == 1 && height == 16 && type_ver == DCT2).
// Layout: 16 rows of 16 int16_t values (256 total).
// NOTE(review): the values look interleaved in pairs, presumably to match a
// multiply-add style kernel - confirm against fast_forward_DCT2_B16_avx2_hor.
ALIGNED(32) const int16_t fi_dct2_16x1_coeff_hor[256] = {
|
||||
64, 90, 64, 87, 64, 80, 64, 70, 64, 57, 64, 43, 64, 25, 64, 9, // 0
|
||||
89, 87, 75, 57, 50, 9, 18, -43, -18, -80, -50, -90, -75, -70, -89, -25,
|
||||
83, 80, 36, 9, -36, -70, -83, -87, -83, -25, -36, 57, 36, 90, 83, 43,
|
||||
75, 70, -18, -43, -89, -87, -50, 9, 50, 90, 89, 25, 18, -80, -75, -57,
|
||||
64, 57, -64, -80, -64, -25, 64, 90, 64, -9, -64, -87, -64, 43, 64, 70, // 8
|
||||
50, 43, -89, -90, 18, 57, 75, 25, -75, -87, -18, 70, 89, 9, -50, -80,
|
||||
36, 25, -83, -70, 83, 90, -36, -80, -36, 43, 83, 9, -83, -57, 36, 87,
|
||||
18, 9, -50, -25, 75, 43, -89, -57, 89, 70, -75, -80, 50, 87, -18, -90,
|
||||
64, -9, 64, -25, 64, -43, 64, -57, 64, -70, 64, -80, 64, -87, 64, -90,
|
||||
-89, 25, -75, 70, -50, 90, -18, 80, 18, 43, 50, -9, 75, -57, 89, -87,
|
||||
83, -43, 36, -90, -36, -57, -83, 25, -83, 87, -36, 70, 36, -9, 83, -80,
|
||||
-75, 57, 18, 80, 89, -25, 50, -90, -50, -9, -89, 87, -18, 43, 75, -70,
|
||||
64, -70, -64, -43, -64, 87, 64, 9, 64, -90, -64, 25, -64, 80, 64, -57,
|
||||
-50, 80, 89, -9, -18, -70, -75, 87, 75, -25, 18, -57, -89, 90, 50, -43,
|
||||
36, -87, -83, 57, 83, -9, -36, -43, -36, 80, 83, -90, -83, 70, 36, -25,
|
||||
-18, 90, 50, -87, -75, 80, 89, -70, -89, 57, 75, -43, -50, 25, 18, -9,
|
||||
};
|
||||
|
||||
// DST7 coefficient table for the one-dimensional 16-sample inverse case.
// mts_idct_avx2 passes it to fast_forward_DCT2_B16_avx2_hor when
// (height == 1 && width == 16) or (width == 1 && height == 16) and the
// corresponding transform type is not DCT2.
// Layout: 16 rows of 16 int16_t values (256 total).
// NOTE(review): the last row equals the last visible row of
// fi_dst7_16x16_coeff_hor; if the two tables are fully identical this one
// could alias the existing table instead of duplicating it - confirm.
ALIGNED(32) const int16_t fi_dst7_16x1_coeff_hor[256] = {
|
||||
8, 25, 17, 48, 25, 68, 33, 81, 40, 88, 48, 88, 55, 81, 62, 68, // 0
|
||||
40, 55, 73, 87, 88, 81, 85, 40, 62, -17, 25, -68, -17, -88, -55, -73,
|
||||
68, 77, 88, 77, 48, 0, -25, -77, -81, -77, -81, 0, -25, 77, 48, 77,
|
||||
85, 88, 55, 25, -48, -81, -87, -48, -8, 68, 81, 68, 62, -48, -40, -81,
|
||||
88, 87, -8, -40, -88, -68, 17, 73, 87, 33, -25, -88, -85, 8, 33, 85, // 8
|
||||
81, 73, -68, -85, -25, 25, 88, 55, -48, -88, -48, 48, 88, 33, -25, -87,
|
||||
62, 48, -88, -81, 68, 88, -8, -68, -55, 25, 88, 25, -73, -68, 17, 88,
|
||||
33, 17, -62, -33, 81, 48, -88, -62, 85, 73, -68, -81, 40, 87, -8, -88,
|
||||
68, 48, 73, 25, 77, 0, 81, -25, 85, -48, 87, -68, 88, -81, 88, -88,
|
||||
-81, -25, -88, 33, -77, 77, -48, 88, -8, 62, 33, 8, 68, -48, 87, -85,
|
||||
88, 0, 68, -77, 0, -77, -68, 0, -88, 77, -48, 77, 25, 0, 81, -77,
|
||||
-88, 25, -17, 88, 77, 0, 68, -88, -33, -25, -88, 81, -25, 48, 73, -68,
|
||||
81, -48, -40, -62, -77, 77, 48, 25, 73, -88, -55, 17, -68, 81, 62, -55,
|
||||
-68, 68, 81, 8, 0, -77, -81, 81, 68, -17, 25, -62, -88, 88, 48, -40,
|
||||
48, -81, -87, 48, 77, 0, -25, -48, -40, 81, 85, -88, -81, 68, 33, -25,
|
||||
-25, 88, 55, -85, -77, 77, 88, -68, -87, 55, 73, -40, -48, 25, 17, -8,
|
||||
};
|
||||
|
||||
// Alias rather than a separate table: points at ff_dct8_16x16_coeff_ver.
// NOTE(review): ALIGNED(32) is attached to the pointer variable here, not to
// the table it points to; presumably unnecessary on an alias - confirm.
ALIGNED(32) const int16_t* fi_dct8_16x16_coeff_hor = ff_dct8_16x16_coeff_ver;
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue