[avx2] Inverses work when ISP is not enabled

Joose Sainio 2023-07-26 10:45:39 +03:00
parent 4dccbcc30d
commit b78f9aff17
2 changed files with 64 additions and 34 deletions


@@ -2752,9 +2752,9 @@ void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
}
__m256i v_hor_pass_out;
- fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+ fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
- fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+ fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
}
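
The hunk above (and the matching 8x8 and 16x16 hunks further down) swaps which coefficient table each inverse pass receives. A minimal scalar sketch of why, assuming the usual separable-transform identity; the 2x2 bases and values here are toy numbers for illustration, not uvg266 code:

  #include <stdio.h>

  /* Toy model: forward C = V*X*H^T, inverse X = V^T*C*H. Here V*V^T = 2*I
   * and H*H^T = 5*I, so the inverse round trip returns 10*X exactly. The
   * point: the first inverse pass consumes the VERTICAL basis and the
   * second the HORIZONTAL one, mirroring the coeff-pointer swap above. */
  static void mul2(int a[2][2], int b[2][2], int r[2][2]) {
    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
        r[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j];
  }

  int main(void) {
    int V[2][2] = { { 1, 1 }, { 1, -1 } };  /* symmetric, so V^T == V */
    int H[2][2] = { { 2, 1 }, { 1, -2 } };  /* symmetric, so H^T == H */
    int X[2][2] = { { 3, 7 }, { -2, 5 } };
    int t[2][2], C[2][2], Y[2][2];

    mul2(V, X, t); mul2(t, H, C);  /* forward: C = V * X * H^T */
    mul2(V, C, t); mul2(t, H, Y);  /* inverse: Y = V^T * C * H  */

    for (int i = 0; i < 2; ++i)
      printf("%d %d\n", Y[i][0] / 10, Y[i][1] / 10);  /* recovers X */
    return 0;
  }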
@@ -3568,39 +3568,46 @@ static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const
static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
{
- const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+ const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0;
const __m256i debias = _mm256_set1_epi32(add);
const __m256i* v_coeff = (const __m256i*)coeff;
const __m256i v_shuffle1 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle1_ver);
const __m256i v_shuffle2 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle2_ver);
- __m256i v_madd_0 = _mm256_madd_epi16(src[0], v_coeff[0]);
- __m256i v_madd_1 = _mm256_madd_epi16(src[0], v_coeff[1]);
- __m256i v_madd_2 = _mm256_madd_epi16(src[0], v_coeff[2]);
- __m256i v_madd_3 = _mm256_madd_epi16(src[0], v_coeff[3]);
- __m256i v_madd_4 = _mm256_madd_epi16(src[0], v_coeff[4]);
- __m256i v_madd_5 = _mm256_madd_epi16(src[0], v_coeff[5]);
- __m256i v_madd_6 = _mm256_madd_epi16(src[0], v_coeff[6]);
- __m256i v_madd_7 = _mm256_madd_epi16(src[0], v_coeff[7]);
+ // Duplicate sources to enable vertical addition
+ __m256i v_src_0 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(1, 1, 0, 0));
+ __m256i v_src_1 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(3, 3, 2, 2));
- __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1);
- __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3);
- __m256i v_add_2 = _mm256_add_epi32(v_madd_4, v_madd_5);
- __m256i v_add_3 = _mm256_add_epi32(v_madd_6, v_madd_7);
+ __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]);
+ __m256i v_madd_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]);
+ __m256i v_madd_10 = _mm256_madd_epi16(v_src_0, v_coeff[2]);
+ __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]);
+ __m256i v_madd_20 = _mm256_madd_epi16(v_src_0, v_coeff[4]);
+ __m256i v_madd_21 = _mm256_madd_epi16(v_src_1, v_coeff[5]);
+ __m256i v_madd_30 = _mm256_madd_epi16(v_src_0, v_coeff[6]);
+ __m256i v_madd_31 = _mm256_madd_epi16(v_src_1, v_coeff[7]);
+ __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_01);
+ __m256i v_add_1 = _mm256_add_epi32(v_madd_10, v_madd_11);
+ __m256i v_add_2 = _mm256_add_epi32(v_madd_20, v_madd_21);
+ __m256i v_add_3 = _mm256_add_epi32(v_madd_30, v_madd_31);
__m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift);
__m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift);
__m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
- v_result = _mm256_shuffle_epi8(v_result, v_shuffle1);
- v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
- v_result = _mm256_shuffle_epi8(v_result, v_shuffle2);
+ //v_result = _mm256_shuffle_epi8(v_result, v_shuffle1);
+ //v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+ //v_result = _mm256_shuffle_epi8(v_result, v_shuffle2);
_mm256_store_si256((__m256i*)dst, v_result);
}
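
For reference, a standalone sketch (compile with -mavx2; not project code) of what the new source-duplication step does: _mm256_permute4x64_epi64 repeats 64-bit lanes so the two madd results for each output row line up and can be combined with a plain vertical _mm256_add_epi32.

  #include <immintrin.h>
  #include <stdio.h>

  int main(void)
  {
    __m256i v  = _mm256_setr_epi64x(0, 1, 2, 3);  /* 64-bit lanes q0..q3 */
    __m256i lo = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 1, 0, 0));
    __m256i hi = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 3, 2, 2));
    long long a[4], b[4];
    _mm256_storeu_si256((__m256i*)a, lo);
    _mm256_storeu_si256((__m256i*)b, hi);
    printf("lo: %lld %lld %lld %lld\n", a[0], a[1], a[2], a[3]);  /* 0 0 1 1 */
    printf("hi: %lld %lld %lld %lld\n", b[0], b[1], b[2], b[3]);  /* 2 2 3 3 */
    return 0;
  }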
- void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type)
{
const int width = 8;
const int height = 2;
@@ -3621,7 +3628,6 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width);
}
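
The signature change from a (hor, ver) pair to a single mts_type index matches how VVC signals MTS: one index selects both 1-D transforms. A hedged sketch of the standard spec mapping; the enumerator and function names are illustrative, not necessarily uvg266's:

  /* Hypothetical names; uvg266's tr_type_t may differ. */
  typedef enum { TR_DCT2 = 0, TR_DST7 = 1, TR_DCT8 = 2 } tr_type_sketch_t;

  static void mts_to_pair(int mts_type, tr_type_sketch_t *hor, tr_type_sketch_t *ver)
  {
    static const tr_type_sketch_t tab[5][2] = {
      { TR_DCT2, TR_DCT2 },   /* 0: DCT2 / DCT2 */
      { TR_DST7, TR_DST7 },   /* 1: DST7 / DST7 */
      { TR_DCT8, TR_DST7 },   /* 2: DCT8 / DST7 */
      { TR_DST7, TR_DCT8 },   /* 3: DST7 / DCT8 */
      { TR_DCT8, TR_DCT8 },   /* 4: DCT8 / DCT8 */
    };
    *hor = tab[mts_type][0];
    *ver = tab[mts_type][1];
  }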
void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
{
const int width = 8;
@@ -4062,9 +4068,9 @@ void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
}
__m256i v_hor_pass_out[4];
- fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+ fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
- fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+ fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
}
@@ -5636,9 +5642,9 @@ void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
}
__m256i v_hor_pass_out[16];
- fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+ fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
- fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+ fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
}
@@ -8152,7 +8158,7 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth)
success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2);
success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2);
- //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2);
+ success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2);
}
#endif // UVG_BIT_DEPTH == 8
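
The registrations above follow a priority-based strategy pattern: each backend registers its function pointer under a strategy name, and the selector picks the highest-priority implementation the host CPU supports. A generic sketch of the idea with hypothetical types; uvg266's real selector differs in detail:

  #include <string.h>

  typedef struct {
    const char *name;   /* e.g. "mts_idct"            */
    const char *impl;   /* e.g. "avx2" or "generic"   */
    int priority;       /* higher wins when supported */
    void *fptr;
  } strategy_sketch_t;

  /* Return the highest-priority registered implementation for a name. */
  static void *select_strategy(const strategy_sketch_t *tab, int n, const char *name)
  {
    void *best = 0;
    int best_prio = -1;
    for (int i = 0; i < n; ++i) {
      if (strcmp(tab[i].name, name) == 0 && tab[i].priority > best_prio) {
        best = tab[i].fptr;
        best_prio = tab[i].priority;
      }
    }
    return best;
  }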


@@ -749,16 +749,40 @@ const int16_t ff_dst7_2x8_coeff_ver[128] = {
ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = {
- 64, 89, 83, 75, 64, 89, 83, 75, 64, 75, 36, -18, 64, 75, 36, -18,
- 64, 50, 36, 18, 64, 50, 36, 18, -64, -89, -83, -50, -64, -89, -83, -50,
- 64, 50, -36, -89, 64, 50, -36, -89, 64, 18, -83, -50, 64, 18, -83, -50,
- -64, 18, 83, 75, -64, 18, 83, 75, 64, 75, -36, -89, 64, 75, -36, -89,
- 64, -18, -83, 50, 64, -18, -83, 50, 64, -50, -36, 89, 64, -50, -36, 89,
- 64, -75, -36, 89, 64, -75, -36, 89, -64, -18, 83, -75, -64, -18, 83, -75,
- 64, -75, 36, 18, 64, -75, 36, 18, 64, -89, 83, -75, 64, -89, 83, -75,
- -64, 89, -83, 50, -64, 89, -83, 50, 64, -50, 36, -18, 64, -50, 36, -18,
+ 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18,
+ 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50,
+ 64, 50, -36, -89, 64, 18, -83, -50, 64, 50, -36, -89, 64, 18, -83, -50,
+ -64, 18, 83, 75, 64, 75, -36, -89, -64, 18, 83, 75, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -50, -36, 89, 64, -18, -83, 50, 64, -50, -36, 89,
+ 64, -75, -36, 89, -64, -18, 83, -75, 64, -75, -36, 89, -64, -18, 83, -75,
+ 64, -75, 36, 18, 64, -89, 83, -75, 64, -75, 36, 18, 64, -89, 83, -75,
+ -64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18,
};
+ ALIGNED(32) const int16_t fi_dst7_2x8_coeff_ver[128] = {
+ 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46,
+ 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60,
+ 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78,
+ -85, -17, 71, 78, 32, 85, -17, -86, -85, -17, 71, 78, 32, 85, -17, -86,
+ 71, 32, -86, 17, 78, -17, -60, 86, 71, 32, -86, 17, 78, -17, -60, 86,
+ 78, -60, -46, 85, -46, -32, 85, -71, 78, -60, -46, 85, -46, -32, 85, -71,
+ 85, -60, 17, 32, 86, -85, 78, -71, 85, -60, 17, 32, 86, -85, 78, -71,
+ -71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17,
+ };
+ ALIGNED(32) const int16_t fi_dct8_2x8_coeff_ver[128] = {
+ 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32,
+ 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46,
+ 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17,
+ -46, 32, 85, 71, 78, 60, -46, -85, -46, 32, 85, 71, 78, 60, -46, -85,
+ 60, -71, -46, 78, 46, -86, 32, 60, 60, -71, -46, 78, 46, -86, 32, 60,
+ 32, -85, -17, 86, -85, 17, 71, -78, 32, -85, -17, 86, -85, 17, 71, -78,
+ 32, -78, 85, -46, 17, -46, 71, -85, 32, -78, 85, -46, 17, -46, 71, -85,
+ -17, 71, -86, 60, 86, -78, 60, -32, -17, 71, -86, 60, 86, -78, 60, -32,
+ };
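
These fi_* tables are laid out for _mm256_madd_epi16, which multiplies adjacent int16 pairs and sums each pair into one int32, so each 32-bit slot of a table row holds two basis coefficients. A standalone illustration using the first row of the new fi_dct2_2x8_coeff_ver (compile with -mavx2; not project code):

  #include <immintrin.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
    /* every 32-bit slot holds the int16 pair (1, 2) */
    __m256i src = _mm256_set1_epi32((2 << 16) | 1);
    __m256i cf  = _mm256_setr_epi16(64, 89, 83, 75, 64, 75, 36, -18,
                                    64, 89, 83, 75, 64, 75, 36, -18);
    __m256i dot = _mm256_madd_epi16(src, cf);  /* 1*c_even + 2*c_odd per slot */
    int32_t out[8];
    _mm256_storeu_si256((__m256i*)out, dot);
    for (int i = 0; i < 8; ++i)
      printf("%d ", out[i]);  /* 242 233 214 0 242 233 214 0 */
    printf("\n");
    return 0;
  }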
ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = {
64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0
83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9,