diff --git a/src/strategies/avx2/avx2_common_functions.h b/src/strategies/avx2/avx2_common_functions.h index 3492abf1..3b6063ad 100644 --- a/src/strategies/avx2/avx2_common_functions.h +++ b/src/strategies/avx2/avx2_common_functions.h @@ -123,6 +123,7 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1; } +/* MOVED TO SAO-AVX2.C WHERE THIS IS USED int32_t FIX_W32 kvz_hsum_8x32b(const __m256i v) { __m256i sum1 = v; @@ -137,5 +138,5 @@ int32_t FIX_W32 kvz_hsum_8x32b(const __m256i v) int32_t sum9 = _mm_cvtsi128_si32 (sum8); return sum9; } - +*/ #endif diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c index 5dc1a4da..5bd2dd91 100644 --- a/src/strategies/avx2/sao-avx2.c +++ b/src/strategies/avx2/sao-avx2.c @@ -271,6 +271,21 @@ static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a, return calc_diff_off_delta(diff_lo, diff_hi, offset, orig); } +int32_t FIX_W32 kvz_hsum_8x32b(const __m256i v) +{ + __m256i sum1 = v; + __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sum3 = _mm256_add_epi32(sum1, sum2); + __m256i sum4 = _mm256_shuffle_epi32(sum3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sum5 = _mm256_add_epi32(sum3, sum4); + __m256i sum6 = _mm256_shuffle_epi32(sum5, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i sum7 = _mm256_add_epi32(sum5, sum6); + + __m128i sum8 = _mm256_castsi256_si128(sum7); + int32_t sum9 = _mm_cvtsi128_si32(sum8); + return sum9; +} + static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data, int32_t block_width,