From 99597b828abbba6e606a0bf70fddf19c9a488bca Mon Sep 17 00:00:00 2001 From: Pauli Oikkonen Date: Fri, 6 Sep 2019 12:53:45 +0300 Subject: [PATCH] Work around the ancient Win32 calling convention hassle See if this'll work now --- src/strategies/avx2/sao-avx2.c | 61 ++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c index 6afc30a8..c0b4d723 100644 --- a/src/strategies/avx2/sao-avx2.c +++ b/src/strategies/avx2/sao-avx2.c @@ -34,12 +34,23 @@ #include "sao.h" #include "strategyselector.h" +// The calling convention used by MSVC on 32-bit builds will essentially +// disallow functions to have more than 3 XMM/YMM parameters, because it +// will not provide more than 8-byte param alignment, and only the first +// three vector params will be carried in SIMD registers. Now the +// vectorcall convention could probably be problematic in globally visible +// functions, but likely not in static ones. +#if defined _MSC_VER && defined _WIN32 && !defined _WIN64 + #define FIX_W32 __vectorcall +#else + #define FIX_W32 +#endif // These optimizations are based heavily on sao-generic.c. // Might be useful to check that if (when) this file // is difficult to understand. 
-static int32_t hsum_8x32b(const __m256i v) +static int32_t FIX_W32 hsum_8x32b(const __m256i v) { __m256i sum1 = v; __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2)); @@ -69,9 +80,9 @@ static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b) } // Mapping of edge_idx values to eo-classes, 32x8b at once -static __m256i calc_eo_cat(const __m256i a, - const __m256i b, - const __m256i c) +static __m256i FIX_W32 calc_eo_cat(const __m256i a, + const __m256i b, + const __m256i c) { const __m256i twos = _mm256_set1_epi8 (0x02); const __m256i idx_to_cat = _mm256_setr_epi64x(0x0403000201, 0, @@ -222,10 +233,10 @@ static INLINE __m256i broadcast_xmm2ymm(const __m128i v) } // Used for edge_ddistortion and band_ddistortion -static __m256i calc_diff_off_delta(const __m256i diff_lo, - const __m256i diff_hi, - const __m256i offsets, - const __m256i orig) +static __m256i FIX_W32 calc_diff_off_delta(const __m256i diff_lo, + const __m256i diff_hi, + const __m256i offsets, + const __m256i orig) { const __m256i zero = _mm256_setzero_si256(); const __m256i negate_hiword = _mm256_set1_epi32(0xffff0001); @@ -266,12 +277,12 @@ static __m256i calc_diff_off_delta(const __m256i diff_lo, return _mm256_add_epi32 (sum0, sum1); } -static INLINE __m256i do_one_edge_ymm(const __m256i a, - const __m256i b, - const __m256i c, - const __m256i orig, - const __m256i badbyte_mask, - const __m256i offsets_256) +static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a, + const __m256i b, + const __m256i c, + const __m256i orig, + const __m256i badbyte_mask, + const __m256i offsets_256) { __m256i eo_cat = calc_eo_cat(a, b, c); eo_cat = _mm256_or_si256 (eo_cat, badbyte_mask); @@ -387,13 +398,13 @@ static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, return hsum_8x32b(sum); } -static void calc_edge_dir_one_ymm(const __m256i a, - const __m256i b, - const __m256i c, - const __m256i orig, - const __m256i badbyte_mask, - __m256i *diff_accum, - int32_t 
*hit_cnt) +static void FIX_W32 calc_edge_dir_one_ymm(const __m256i a, + const __m256i b, + const __m256i c, + const __m256i orig, + const __m256i badbyte_mask, + __m256i *diff_accum, + int32_t *hit_cnt) { const __m256i ones_16 = _mm256_set1_epi16(1); __m256i eo_cat = calc_eo_cat (a, b, c); @@ -684,10 +695,10 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder, } } -static __m256i do_one_nonband_ymm(const __m256i a, - const __m256i b, - const __m256i c, - const __m256i sao_offs) +static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a, + const __m256i b, + const __m256i c, + const __m256i sao_offs) { const __m256i zero = _mm256_setzero_si256();