From ce77bfa15b20eebd89e450e514c735868efd4d38 Mon Sep 17 00:00:00 2001
From: Ari Lemmetti
Date: Mon, 22 Aug 2016 19:08:46 +0300
Subject: [PATCH] Replace KVZ_PERMUTE with _MM_SHUFFLE

An equivalent macro already exists in the standard x86 intrinsics
headers. _MM_SHUFFLE packs the same four 2-bit element indices into an
8-bit immediate but takes its arguments in the opposite order (highest
element first), so every converted call site lists its indices mirrored.
---
Note: a short standalone check of the index mapping follows the patch.

 src/strategies/avx2/intra-avx2.c   | 33 ++++++++++++++++-----------------
 src/strategies/avx2/ipol-avx2.c    | 21 ++++++++++-----------
 src/strategies/avx2/picture-avx2.c | 37 ++++++++++++++++++-------------------
 src/strategies/avx2/quant-avx2.c   |  5 ++---
 src/strategies/avx2/sao-avx2.c     | 17 ++++++++---------
 src/strategies/strategies-common.h |  4 ----
 6 files changed, 54 insertions(+), 63 deletions(-)

diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c
index c619b6fe..151971ec 100644
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@@ -25,7 +25,6 @@
 #include <immintrin.h>
 
 #include "kvazaar.h"
-#include "strategies/strategies-common.h"
 #include "strategyselector.h"
 
 
@@ -252,14 +251,14 @@ static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sam
     int rx = 0;
     int ry = y;
 
-    row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-    row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-    row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-    row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-    row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-    row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-    row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-    row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+    row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
+    row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
+    row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
+    row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
+    row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
+    row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
+    row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
+    row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
 
     _mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0));
     _mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1));
@@ -343,14 +342,14 @@ static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl
     }
     else {
       //Move all filtered pixels to the lower lane to reduce memory accesses
-      row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-      row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-      row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-      row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-      row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-      row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-      row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-      row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+      row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
+      row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
+      row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
+      row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
+      row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
+      row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
+      row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
+      row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
 
       _mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0));
       _mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1));
diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c
index 349064e2..b80d2d05 100644
--- a/src/strategies/avx2/ipol-avx2.c
+++ b/src/strategies/avx2/ipol-avx2.c
@@ -34,7 +34,6 @@
 #include "strategies/generic/picture-generic.h"
 #include "strategies/strategies-ipol.h"
 #include "strategyselector.h"
-#include "strategies/strategies-common.h"
 #include "strategies/generic/ipol-generic.h"
 
 
@@ -99,12 +98,12 @@ static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *fil
   temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]);
   temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]);
   temp[0] = _mm_add_epi32(temp_lo, temp_hi);
-  temp[0] = _mm_shuffle_epi32(temp[0], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0));
 
   temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]);
   temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]);
   temp[4] = _mm_add_epi32(temp_lo, temp_hi);
-  temp[4] = _mm_shuffle_epi32(temp[4], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0));
 
   __m128i add = _mm_set1_epi32(offset23);
   temp[0] = _mm_add_epi32(temp[0], add);
@@ -152,12 +151,12 @@ static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t
   temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]);
   temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]);
   temp[0] = _mm256_add_epi32(temp_lo, temp_hi);
-  temp[0] = _mm256_shuffle_epi32(temp[0], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0));
 
   temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]);
   temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]);
   temp[4] = _mm256_add_epi32(temp_lo, temp_hi);
-  temp[4] = _mm256_shuffle_epi32(temp[4], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0));
 
   __m256i add = _mm256_set1_epi32(offset23);
   temp[0] = _mm256_add_epi32(temp[0], add);
@@ -205,7 +204,7 @@ static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filt
 {
   __m256i temp[4];
   __m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1);
-  fir = _mm256_shuffle_epi32(fir, KVZ_PERMUTE(0, 1, 0, 1));
+  fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(1, 0, 1, 0));
 
   temp[0] = _mm256_unpacklo_epi64(row[0], row[1]);
   temp[0] = _mm256_maddubs_epi16(temp[0], fir);
@@ -398,8 +397,8 @@ int16_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data)
   __m128i packed_filter = _mm_loadl_epi64((__m128i*)filter);
 
   sample = _mm_maddubs_epi16(packed_data, packed_filter);
-  sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(1, 0, 1, 0)));
-  sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, KVZ_PERMUTE(1, 0, 1, 0)));
+  sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1)));
+  sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, _MM_SHUFFLE(0, 1, 0, 1)));
 
   return (int16_t)_mm_cvtsi128_si32(sample);
 }
@@ -413,8 +412,8 @@ int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data)
   __m128i packed_filter = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter));
 
   sample = _mm_madd_epi16(packed_data, packed_filter);
-  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(2, 3, 0, 1)));
-  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(1, 0, 1, 0)));
+  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(1, 0, 3, 2)));
+  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1)));
 
   return _mm_extract_epi32(sample, 0);
 }
@@ -533,7 +532,7 @@ void kvz_eight_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift
 
   temp0 = _mm256_srai_epi16(temp0, shift);
 
-  temp0 = _mm256_permute4x64_epi64(temp0, KVZ_PERMUTE(0, 2, 1, 3));
+  temp0 = _mm256_permute4x64_epi64(temp0, _MM_SHUFFLE(3, 1, 2, 0));
 
   _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(temp0));
 }
diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index af22be64..033fec6d 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -30,7 +30,6 @@
 #include "kvazaar.h"
 #include "strategies/strategies-picture.h"
 #include "strategyselector.h"
-#include "strategies/strategies-common.h"
 #include "strategies/generic/picture-generic.h"
 
 
@@ -175,9 +174,9 @@ static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur)
 
   row3 = _mm_add_epi16(row2, row3);
 
-  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
-  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
-  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
+  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) ));
+  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
+  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
 
   unsigned sum = _mm_extract_epi16(row3, 0);
   unsigned satd = (sum + 1) >> 1;
@@ -222,9 +221,9 @@ static void satd_8bit_4x4_dual_avx2(
 
   row3 = _mm256_add_epi16(row2, row3);
 
-  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
-  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
-  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
 
   unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
   sum1 = (sum1 + 1) >> 1;
@@ -241,18 +240,18 @@ static INLINE void hor_transform_row_avx2(__m128i* row){
   __m128i mask_pos = _mm_set1_epi16(1);
   __m128i mask_neg = _mm_set1_epi16(-1);
   __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
-  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
+  __m128i temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 
   sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
-  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
+  temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 
   sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
-  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
-  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
+  temp = _mm_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1));
+  temp = _mm_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 }
@@ -262,18 +261,18 @@ static INLINE void hor_transform_row_dual_avx2(__m256i* row){
   __m256i mask_pos = _mm256_set1_epi16(1);
   __m256i mask_neg = _mm256_set1_epi16(-1);
   __m256i sign_mask = _mm256_unpacklo_epi64(mask_pos, mask_neg);
-  __m256i temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
+  __m256i temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 
   sign_mask = _mm256_unpacklo_epi32(mask_pos, mask_neg);
-  temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
+  temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 
   sign_mask = _mm256_unpacklo_epi16(mask_pos, mask_neg);
-  temp = _mm256_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
-  temp = _mm256_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
+  temp = _mm256_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1));
+  temp = _mm256_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 }
@@ -357,8 +356,8 @@ INLINE static unsigned sum_block_avx2(__m128i *ver_row)
   haddwd_accumulate_avx2(&sad, ver_row + 6);
   haddwd_accumulate_avx2(&sad, ver_row + 7);
 
-  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
-  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
+  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2)));
+  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1)));
 
   return _mm_cvtsi128_si32(sad);
 }
@@ -375,8 +374,8 @@ INLINE static void sum_block_dual_avx2(__m256i *ver_row, unsigned *sum0, unsigne
   haddwd_accumulate_dual_avx2(&sad, ver_row + 6);
   haddwd_accumulate_dual_avx2(&sad, ver_row + 7);
 
-  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
-  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
+  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2)));
+  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1)));
 
   *sum0 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 0));
   *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1));
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 6b6e2f83..66535750 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -35,7 +35,6 @@
 #include "rdo.h"
 #include "scalinglist.h"
 #include "strategies/generic/quant-generic.h"
-#include "strategies/strategies-common.h"
 #include "strategies/strategies-quant.h"
 #include "strategyselector.h"
 #include "tables.h"
@@ -101,8 +100,8 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   }
 
   __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1));
-  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1)));
-  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0)));
+  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(1, 0, 3, 2)));
+  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1)));
   ac_sum += _mm_cvtsi128_si32(temp);
 
   if (!(encoder->sign_hiding && ac_sum >= 2)) return;
diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c
index 57d01673..14d0c062 100644
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@@ -28,7 +28,6 @@
 #include "encoderstate.h"
 #include "kvazaar.h"
 #include "sao.h"
-#include "strategies/strategies-common.h"
 #include "strategyselector.h"
 
 
@@ -132,8 +131,8 @@ int kvz_sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *r
 
   //Full horizontal sum
   v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1)));
   sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
 
   return sum;
@@ -224,14 +223,14 @@ void kvz_calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec
 
     //Full horizontal sum of accumulated values
     v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_diff_accum[eo_cat], 1)));
-    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], KVZ_PERMUTE(2, 3, 0, 1)));
-    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], KVZ_PERMUTE(1, 0, 1, 0)));
+    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(1, 0, 3, 2)));
+    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(0, 1, 0, 1)));
     accum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_diff_accum[eo_cat]));
 
     //Full horizontal sum of accumulated values
     v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_count[eo_cat], 1)));
-    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], KVZ_PERMUTE(2, 3, 0, 1)));
-    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], KVZ_PERMUTE(1, 0, 1, 0)));
+    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(1, 0, 3, 2)));
+    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(0, 1, 0, 1)));
     count += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_count[eo_cat]));
 
     cat_sum_cnt[0][eo_cat] += accum;
@@ -335,8 +334,8 @@ int kvz_sao_band_ddistortion_avx2(const encoder_state_t * const state, const kvz
 
   //Full horizontal sum
   v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1)));
   sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
 
   return sum;
diff --git a/src/strategies/strategies-common.h b/src/strategies/strategies-common.h
index 0a6d5c98..760c9e7f 100644
--- a/src/strategies/strategies-common.h
+++ b/src/strategies/strategies-common.h
@@ -10,8 +10,4 @@
 
 #include "global.h" // IWYU pragma: keep
 
-//Use with shuffle and permutation intrinsics.
-//Parameters are indices to packed elements. Each must be 0, 1, 2 or 3.
-#define KVZ_PERMUTE(a, b, c, d) ( (a << 0) | (b << 2) | (c << 4) | (d << 6) )
-
 #endif //STRATEGIES_COMMON_H_
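
The conversion relies on the two macros packing the same four 2-bit element indices while taking their arguments in opposite orders. The snippet below is a minimal standalone sanity check of that mapping, not part of the patch itself: the file name check_shuffle.c is made up for the example, and KVZ_PERMUTE is re-declared locally only so the removed macro can be compared against _MM_SHUFFLE from the system headers.

/* check_shuffle.c -- standalone sanity check (hypothetical helper, not part
 * of the patch). Verifies that KVZ_PERMUTE(a, b, c, d) and the mirrored
 * _MM_SHUFFLE(d, c, b, a) produce the same 8-bit shuffle immediate.
 * Build on an x86 host with e.g.:  cc check_shuffle.c -o check_shuffle
 */
#include <assert.h>
#include <stdio.h>
#include <immintrin.h>  /* pulls in _MM_SHUFFLE */

/* Local copy of the macro removed from strategies-common.h. */
#define KVZ_PERMUTE(a, b, c, d) ( (a << 0) | (b << 2) | (c << 4) | (d << 6) )

int main(void)
{
  /* The same argument list gives different immediates... */
  assert(KVZ_PERMUTE(0, 2, 1, 3) != _MM_SHUFFLE(0, 2, 1, 3));

  /* ...while mirroring the list makes the macros agree. These cover the
   * index patterns that appear in the patch. */
  assert(KVZ_PERMUTE(0, 2, 1, 3) == _MM_SHUFFLE(3, 1, 2, 0));
  assert(KVZ_PERMUTE(1, 3, 0, 2) == _MM_SHUFFLE(2, 0, 3, 1));
  assert(KVZ_PERMUTE(2, 3, 0, 1) == _MM_SHUFFLE(1, 0, 3, 2));
  assert(KVZ_PERMUTE(1, 0, 3, 2) == _MM_SHUFFLE(2, 3, 0, 1));
  assert(KVZ_PERMUTE(1, 0, 1, 0) == _MM_SHUFFLE(0, 1, 0, 1));
  assert(KVZ_PERMUTE(0, 1, 0, 1) == _MM_SHUFFLE(1, 0, 1, 0));

  printf("KVZ_PERMUTE(0,2,1,3) = 0x%02X, _MM_SHUFFLE(3,1,2,0) = 0x%02X\n",
         KVZ_PERMUTE(0, 2, 1, 3), _MM_SHUFFLE(3, 1, 2, 0));
  return 0;
}

Every index pattern touched by the patch appears in the asserts above; in each case the mirrored argument list is the one that keeps the generated immediate, and therefore the encoder output, unchanged.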