From 99597b828abbba6e606a0bf70fddf19c9a488bca Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Fri, 6 Sep 2019 12:53:45 +0300
Subject: [PATCH] Work around the ancient Win32 calling convention hassle

See if this'll work now
---
 src/strategies/avx2/sao-avx2.c | 61 ++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c
index 6afc30a8..c0b4d723 100644
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@@ -34,12 +34,23 @@
 #include "sao.h"
 #include "strategyselector.h"
 
+// The calling convention used by MSVC on 32-bit builds effectively
+// prevents functions from taking more than 3 XMM/YMM parameters, because
+// it does not provide more than 8-byte parameter alignment, and only the
+// first three vector params are carried in SIMD registers. The vectorcall
+// convention could probably be problematic in globally visible
+// functions, but likely not in static ones.
+#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
+  #define FIX_W32 __vectorcall
+#else
+  #define FIX_W32
+#endif
 
 // These optimizations are based heavily on sao-generic.c.
 // Might be useful to check that if (when) this file
 // is difficult to understand.
 
-static int32_t hsum_8x32b(const __m256i v)
+static int32_t FIX_W32 hsum_8x32b(const __m256i v)
 {
   __m256i sum1 = v;
   __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
@@ -69,9 +80,9 @@ static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b)
 }
 
 // Mapping of edge_idx values to eo-classes, 32x8b at once
-static __m256i calc_eo_cat(const __m256i a,
-                           const __m256i b,
-                           const __m256i c)
+static __m256i FIX_W32 calc_eo_cat(const __m256i a,
+                                   const __m256i b,
+                                   const __m256i c)
 {
   const __m256i twos       = _mm256_set1_epi8  (0x02);
   const __m256i idx_to_cat = _mm256_setr_epi64x(0x0403000201, 0,
@@ -222,10 +233,10 @@ static INLINE __m256i broadcast_xmm2ymm(const __m128i v)
 }
 
 // Used for edge_ddistortion and band_ddistortion
-static __m256i calc_diff_off_delta(const __m256i diff_lo,
-                                   const __m256i diff_hi,
-                                   const __m256i offsets,
-                                   const __m256i orig)
+static __m256i FIX_W32 calc_diff_off_delta(const __m256i diff_lo,
+                                           const __m256i diff_hi,
+                                           const __m256i offsets,
+                                           const __m256i orig)
 {
   const __m256i zero          = _mm256_setzero_si256();
   const __m256i negate_hiword = _mm256_set1_epi32(0xffff0001);
@@ -266,12 +277,12 @@ static __m256i calc_diff_off_delta(const __m256i diff_lo,
   return                 _mm256_add_epi32     (sum0,         sum1);
 }
 
-static INLINE __m256i do_one_edge_ymm(const __m256i a,
-                                      const __m256i b,
-                                      const __m256i c,
-                                      const __m256i orig,
-                                      const __m256i badbyte_mask,
-                                      const __m256i offsets_256)
+static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a,
+                                              const __m256i b,
+                                              const __m256i c,
+                                              const __m256i orig,
+                                              const __m256i badbyte_mask,
+                                              const __m256i offsets_256)
 {
   __m256i eo_cat = calc_eo_cat(a, b, c);
           eo_cat = _mm256_or_si256    (eo_cat,      badbyte_mask);
@@ -387,13 +398,13 @@ static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,
   return hsum_8x32b(sum);
 }
 
-static void calc_edge_dir_one_ymm(const __m256i  a,
-                                  const __m256i  b,
-                                  const __m256i  c,
-                                  const __m256i  orig,
-                                  const __m256i  badbyte_mask,
-                                        __m256i *diff_accum,
-                                        int32_t *hit_cnt)
+static void FIX_W32 calc_edge_dir_one_ymm(const __m256i  a,
+                                          const __m256i  b,
+                                          const __m256i  c,
+                                          const __m256i  orig,
+                                          const __m256i  badbyte_mask,
+                                                __m256i *diff_accum,
+                                                int32_t *hit_cnt)
 {
   const __m256i ones_16 = _mm256_set1_epi16(1);
         __m256i eo_cat  = calc_eo_cat      (a, b, c);
@@ -684,10 +695,10 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
   }
 }
 
-static __m256i do_one_nonband_ymm(const __m256i a,
-                                  const __m256i b,
-                                  const __m256i c,
-                                  const __m256i sao_offs)
+static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a,
+                                          const __m256i b,
+                                          const __m256i c,
+                                          const __m256i sao_offs)
 {
   const __m256i zero = _mm256_setzero_si256();