Work around the ancient Win32 calling convention hassle

See if this'll work now
2024-12-18 03:04:06 +00:00 · 2019-09-06 12:53:45 +03:00 · 2019-09-06 12:53:45 +03:00 · 99597b828a
parent c5ca18950c
commit 99597b828a
1 changed files with 36 additions and 25 deletions
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@ -34,12 +34,23 @@
 #include "sao.h"
 #include "strategyselector.h"

+// The calling convention used by MSVC on 32-bit builds essentially
+// prevents functions from having more than 3 XMM/YMM parameters, because
+// it does not provide more than 8-byte param alignment, and only the first
+// three vector params are carried in SIMD registers. The vectorcall
+// convention could be problematic in globally visible functions, but
+// is likely fine in static ones.
+#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
+  #define FIX_W32 __vectorcall
+#else
+  #define FIX_W32
+#endif

 // These optimizations are based heavily on sao-generic.c.
 // Might be useful to check that if (when) this file
 // is difficult to understand.

-static int32_t hsum_8x32b(const __m256i v)
+static int32_t FIX_W32 hsum_8x32b(const __m256i v)
 {
  __m256i sum1 = v;
  __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
@ -69,7 +80,7 @@ static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b)
 }

 // Mapping of edge_idx values to eo-classes, 32x8b at once
-static __m256i calc_eo_cat(const __m256i a,
+static __m256i FIX_W32 calc_eo_cat(const __m256i a,
                                   const __m256i b,
                                   const __m256i c)
 {
@ -222,7 +233,7 @@ static INLINE __m256i broadcast_xmm2ymm(const __m128i v)
 }

 // Used for edge_ddistortion and band_ddistortion
-static __m256i calc_diff_off_delta(const __m256i diff_lo,
+static __m256i FIX_W32 calc_diff_off_delta(const __m256i diff_lo,
                                           const __m256i diff_hi,
                                           const __m256i offsets,
                                           const __m256i orig)
@ -266,7 +277,7 @@ static __m256i calc_diff_off_delta(const __m256i diff_lo,
  return                 _mm256_add_epi32     (sum0,         sum1);
 }

-static INLINE __m256i do_one_edge_ymm(const __m256i a,
+static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a,
                                              const __m256i b,
                                              const __m256i c,
                                              const __m256i orig,
@ -387,7 +398,7 @@ static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,
  return hsum_8x32b(sum);
 }

-static void calc_edge_dir_one_ymm(const __m256i  a,
+static void FIX_W32 calc_edge_dir_one_ymm(const __m256i  a,
                                          const __m256i  b,
                                          const __m256i  c,
                                          const __m256i  orig,
@ -684,7 +695,7 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
  }
 }

-static __m256i do_one_nonband_ymm(const __m256i a,
+static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a,
                                          const __m256i b,
                                          const __m256i c,
                                          const __m256i sao_offs)