From 8bbf01c37682ca01c380d1e82a2ce08f52e85829 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Tue, 12 Sep 2023 14:07:30 +0300
Subject: [PATCH] Change the right shift in pred_planar_avx2 to use the
 128-bit register version of the right shift intrinsic. When the integer
 version is given a shift count that is not a compile-time constant, the
 compiler is forced to generate the 128-bit register version anyway, but it
 also has to convert the integer to a 128-bit register; the compiler does not
 optimize this properly and instead does the conversion on every iteration of
 the loop. ***THIS NEEDS TO BE DONE FOR ALL SHIFTS THAT DO NOT USE A
 COMPILE-TIME CONSTANT SHIFT COUNT***

---
 src/strategies/avx2/intra-avx2.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index e935e8a2..17f63414 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -980,6 +980,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc,
   }*/
 
   // New loop
+  __m128i shift_r_v = _mm_setzero_si128();
+  shift_r_v = _mm_insert_epi32(shift_r_v, shift_r, 0);
   for (int i = 0, d = 0; i < samples; i += 16, ++d) {
     __m256i v_lo = _mm256_unpacklo_epi16(v_pred_hor[d], v_pred_ver[d]);
     __m256i v_hi = _mm256_unpackhi_epi16(v_pred_hor[d], v_pred_ver[d]);
@@ -991,8 +993,8 @@ void uvg_intra_pred_planar_avx2(const cu_loc_t* const cu_loc,
     v_madd_lo = _mm256_add_epi32(v_madd_lo, v_samples);
     v_madd_hi = _mm256_add_epi32(v_madd_hi, v_samples);
 
-    v_madd_lo = _mm256_srli_epi32(v_madd_lo, shift_r);
-    v_madd_hi = _mm256_srli_epi32(v_madd_hi, shift_r);
+    v_madd_lo = _mm256_srl_epi32(v_madd_lo, shift_r_v);
+    v_madd_hi = _mm256_srl_epi32(v_madd_hi, shift_r_v);
 
     v_res[d] = _mm256_packs_epi32(v_madd_lo, v_madd_hi);
   }