Use _mm_storel_epi64 instead of _mm_cvtsi128_si64

Fix 32-bit builds, which tend to lack the _mm_cvtsi128_si64 intrinsic. Hopefully the store will still be optimized to a movq r64, xmm on modern platforms.
2024-11-27 19:24:06 +00:00 · 2020-04-07 23:51:54 +03:00 · 2020-04-07 23:51:54 +03:00 · 8617530b13
parent a82966c0f5
commit 8617530b13
1 changed files with 5 additions and 2 deletions
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@ -1056,6 +1056,7 @@ static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len)
  const float len_f  = (float)len;
  const __m256i zero = _mm256_setzero_si256();

+  int64_t sum;
  size_t i;
  __m256i sums = zero;
  for (i = 0; i + 31 < len; i += 32) {
@ -1069,7 +1070,7 @@ static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len)
  __m128i sum_4  = _mm_shuffle_epi32       (sum_3,  _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sum_5  = _mm_add_epi64           (sum_3,  sum_4);

-  int64_t sum    = _mm_cvtsi128_si64(sum_5);
+  _mm_storel_epi64((__m128i *)&sum, sum_5);

  // Remaining len mod 32 pixels
  for (; i < len; ++i) {
@ -1172,6 +1173,7 @@ static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
  const __m256i zero      = _mm256_setzero_si256();
  const __m256i himask_15 = _mm256_set1_epi16(0x7f00);

+  uint64_t vars;
  size_t i;
  __m256i sums = zero;
  for (i = 0; i < len; i += 32) {
@ -1213,7 +1215,8 @@ static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len)
  __m256i accum4 = _mm256_permute4x64_epi64(accum3, _MM_SHUFFLE(2, 3, 1, 0));
  __m256i v_tot  = _mm256_add_epi64        (accum3, accum4);
  __m128i vt128  = _mm256_castsi256_si128  (v_tot);
-  uint64_t vars  = _mm_cvtsi128_si64       (vt128);
+
+  _mm_storel_epi64((__m128i *)&vars, vt128);

  return (float)vars * varsum_to_f;
 }