From 8bbf01c37682ca01c380d1e82a2ce08f52e85829 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 12 Sep 2023 14:07:30 +0300 Subject: [PATCH] Change the right shift in pred_planar_avx2 to use the 128-bit register version of the right-shift intrinsic. When the shift count passed to the integer version is not a compile-time constant, the compiler is forced to generate the 128-bit register version anyway, but it also has to convert the integer to a 128-bit register; the compiler does not optimize this properly and instead does the conversion on every iteration of the loop. ***THIS NEEDS TO BE DONE FOR ALL SHIFTS THAT DO NOT USE COMPILE TIME CONSTANT SHIFTS*** --- src/strategies/avx2/intra-avx2.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index e935e8a2..17f63414 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -980,6 +980,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, }*/ // New loop + __m128i shift_r_v = _mm_setzero_si128(); + shift_r_v = _mm_insert_epi32(shift_r_v, shift_r, 0); for (int i = 0, d = 0; i < samples; i += 16, ++d) { __m256i v_lo = _mm256_unpacklo_epi16(v_pred_hor[d], v_pred_ver[d]); __m256i v_hi = _mm256_unpackhi_epi16(v_pred_hor[d], v_pred_ver[d]); @@ -991,8 +993,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc, v_madd_lo = _mm256_add_epi32(v_madd_lo, v_samples); v_madd_hi = _mm256_add_epi32(v_madd_hi, v_samples); - v_madd_lo = _mm256_srli_epi32(v_madd_lo, shift_r); - v_madd_hi = _mm256_srli_epi32(v_madd_hi, shift_r); + v_madd_lo = _mm256_srl_epi32(v_madd_lo, shift_r_v); + v_madd_hi = _mm256_srl_epi32(v_madd_hi, shift_r_v); v_res[d] = _mm256_packs_epi32(v_madd_lo, v_madd_hi); }