mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-12-18 03:04:06 +00:00
Work around the ancient Win32 calling convention hassle
See if this'll work now
This commit is contained in:
parent
c5ca18950c
commit
99597b828a
|
@ -34,12 +34,23 @@
|
||||||
#include "sao.h"
|
#include "sao.h"
|
||||||
#include "strategyselector.h"
|
#include "strategyselector.h"
|
||||||
|
|
||||||
|
// The calling convention used by MSVC on 32-bit builds will essentially
|
||||||
|
// prevent functions from having more than 3 XMM/YMM parameters, because it
|
||||||
|
// will not provide more than 8-byte param alignment, and only the first
|
||||||
|
// three vector params will be carried in SIMD registers. Now the
|
||||||
|
// vectorcall convention could probably be problematic in globally visible
|
||||||
|
// functions, but likely not in static ones.
|
||||||
|
#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
|
||||||
|
#define FIX_W32 __vectorcall
|
||||||
|
#else
|
||||||
|
#define FIX_W32
|
||||||
|
#endif
|
||||||
|
|
||||||
// These optimizations are based heavily on sao-generic.c.
|
// These optimizations are based heavily on sao-generic.c.
|
||||||
// Might be useful to check that if (when) this file
|
// Might be useful to check that if (when) this file
|
||||||
// is difficult to understand.
|
// is difficult to understand.
|
||||||
|
|
||||||
static int32_t hsum_8x32b(const __m256i v)
|
static int32_t FIX_W32 hsum_8x32b(const __m256i v)
|
||||||
{
|
{
|
||||||
__m256i sum1 = v;
|
__m256i sum1 = v;
|
||||||
__m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
|
__m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
@ -69,9 +80,9 @@ static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mapping of edge_idx values to eo-classes, 32x8b at once
|
// Mapping of edge_idx values to eo-classes, 32x8b at once
|
||||||
static __m256i calc_eo_cat(const __m256i a,
|
static __m256i FIX_W32 calc_eo_cat(const __m256i a,
|
||||||
const __m256i b,
|
const __m256i b,
|
||||||
const __m256i c)
|
const __m256i c)
|
||||||
{
|
{
|
||||||
const __m256i twos = _mm256_set1_epi8 (0x02);
|
const __m256i twos = _mm256_set1_epi8 (0x02);
|
||||||
const __m256i idx_to_cat = _mm256_setr_epi64x(0x0403000201, 0,
|
const __m256i idx_to_cat = _mm256_setr_epi64x(0x0403000201, 0,
|
||||||
|
@ -222,10 +233,10 @@ static INLINE __m256i broadcast_xmm2ymm(const __m128i v)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used for edge_ddistortion and band_ddistortion
|
// Used for edge_ddistortion and band_ddistortion
|
||||||
static __m256i calc_diff_off_delta(const __m256i diff_lo,
|
static __m256i FIX_W32 calc_diff_off_delta(const __m256i diff_lo,
|
||||||
const __m256i diff_hi,
|
const __m256i diff_hi,
|
||||||
const __m256i offsets,
|
const __m256i offsets,
|
||||||
const __m256i orig)
|
const __m256i orig)
|
||||||
{
|
{
|
||||||
const __m256i zero = _mm256_setzero_si256();
|
const __m256i zero = _mm256_setzero_si256();
|
||||||
const __m256i negate_hiword = _mm256_set1_epi32(0xffff0001);
|
const __m256i negate_hiword = _mm256_set1_epi32(0xffff0001);
|
||||||
|
@ -266,12 +277,12 @@ static __m256i calc_diff_off_delta(const __m256i diff_lo,
|
||||||
return _mm256_add_epi32 (sum0, sum1);
|
return _mm256_add_epi32 (sum0, sum1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE __m256i do_one_edge_ymm(const __m256i a,
|
static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a,
|
||||||
const __m256i b,
|
const __m256i b,
|
||||||
const __m256i c,
|
const __m256i c,
|
||||||
const __m256i orig,
|
const __m256i orig,
|
||||||
const __m256i badbyte_mask,
|
const __m256i badbyte_mask,
|
||||||
const __m256i offsets_256)
|
const __m256i offsets_256)
|
||||||
{
|
{
|
||||||
__m256i eo_cat = calc_eo_cat(a, b, c);
|
__m256i eo_cat = calc_eo_cat(a, b, c);
|
||||||
eo_cat = _mm256_or_si256 (eo_cat, badbyte_mask);
|
eo_cat = _mm256_or_si256 (eo_cat, badbyte_mask);
|
||||||
|
@ -387,13 +398,13 @@ static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,
|
||||||
return hsum_8x32b(sum);
|
return hsum_8x32b(sum);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void calc_edge_dir_one_ymm(const __m256i a,
|
static void FIX_W32 calc_edge_dir_one_ymm(const __m256i a,
|
||||||
const __m256i b,
|
const __m256i b,
|
||||||
const __m256i c,
|
const __m256i c,
|
||||||
const __m256i orig,
|
const __m256i orig,
|
||||||
const __m256i badbyte_mask,
|
const __m256i badbyte_mask,
|
||||||
__m256i *diff_accum,
|
__m256i *diff_accum,
|
||||||
int32_t *hit_cnt)
|
int32_t *hit_cnt)
|
||||||
{
|
{
|
||||||
const __m256i ones_16 = _mm256_set1_epi16(1);
|
const __m256i ones_16 = _mm256_set1_epi16(1);
|
||||||
__m256i eo_cat = calc_eo_cat (a, b, c);
|
__m256i eo_cat = calc_eo_cat (a, b, c);
|
||||||
|
@ -684,10 +695,10 @@ static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m256i do_one_nonband_ymm(const __m256i a,
|
static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a,
|
||||||
const __m256i b,
|
const __m256i b,
|
||||||
const __m256i c,
|
const __m256i c,
|
||||||
const __m256i sao_offs)
|
const __m256i sao_offs)
|
||||||
{
|
{
|
||||||
const __m256i zero = _mm256_setzero_si256();
|
const __m256i zero = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue