Replace KVZ_PERMUTE with _MM_SHUFFLE

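An equivalent macro, _MM_SHUFFLE, already exists in the intrinsics headers. Note that it takes its selectors in the opposite order (highest element first), so KVZ_PERMUTE(a, b, c, d) corresponds to _MM_SHUFFLE(d, c, b, a).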
This commit is contained in:
Ari Lemmetti 2016-08-22 19:08:46 +03:00
parent 68eef660bd
commit ce77bfa15b
6 changed files with 54 additions and 63 deletions

View file

@ -25,7 +25,6 @@
#include <stdlib.h>
#include "kvazaar.h"
#include "strategies/strategies-common.h"
#include "strategyselector.h"
@ -252,14 +251,14 @@ static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sam
int rx = 0;
int ry = y;
row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
// Even rows gather quadwords 0 and 2 into the low 128-bit lane (imm 0xD8),
// odd rows gather quadwords 1 and 3 (imm 0x8D).
row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
_mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0));
_mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1));
@ -343,14 +342,14 @@ static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl
} else {
//Move all filtered pixels to the lower lane to reduce memory accesses
row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
// Even rows gather quadwords 0 and 2 into the low 128-bit lane (imm 0xD8),
// odd rows gather quadwords 1 and 3 (imm 0x8D).
row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
_mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0));
_mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1));

View file

@ -34,7 +34,6 @@
#include "strategies/generic/picture-generic.h"
#include "strategies/strategies-ipol.h"
#include "strategyselector.h"
#include "strategies/strategies-common.h"
#include "strategies/generic/ipol-generic.h"
@ -99,12 +98,12 @@ static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *fil
temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]);
temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]);
temp[0] = _mm_add_epi32(temp_lo, temp_hi);
temp[0] = _mm_shuffle_epi32(temp[0], KVZ_PERMUTE(0, 2, 1, 3));
// Swap the two middle 32-bit elements (imm 0xD8); _MM_SHUFFLE lists selectors highest-first.
temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0));
temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]);
temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]);
temp[4] = _mm_add_epi32(temp_lo, temp_hi);
temp[4] = _mm_shuffle_epi32(temp[4], KVZ_PERMUTE(0, 2, 1, 3));
// Swap the two middle 32-bit elements (imm 0xD8); _MM_SHUFFLE lists selectors highest-first.
temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0));
__m128i add = _mm_set1_epi32(offset23);
temp[0] = _mm_add_epi32(temp[0], add);
@ -152,12 +151,12 @@ static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t
temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]);
temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]);
temp[0] = _mm256_add_epi32(temp_lo, temp_hi);
temp[0] = _mm256_shuffle_epi32(temp[0], KVZ_PERMUTE(0, 2, 1, 3));
// Swap the two middle 32-bit elements of each lane (imm 0xD8); _MM_SHUFFLE lists selectors highest-first.
temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0));
temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]);
temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]);
temp[4] = _mm256_add_epi32(temp_lo, temp_hi);
temp[4] = _mm256_shuffle_epi32(temp[4], KVZ_PERMUTE(0, 2, 1, 3));
// Swap the two middle 32-bit elements of each lane (imm 0xD8); _MM_SHUFFLE lists selectors highest-first.
temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0));
__m256i add = _mm256_set1_epi32(offset23);
temp[0] = _mm256_add_epi32(temp[0], add);
@ -205,7 +204,7 @@ static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filt
{
__m256i temp[4];
__m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1);
fir = _mm256_shuffle_epi32(fir, KVZ_PERMUTE(0, 1, 0, 1));
// Duplicate the low 64 bits of each 128-bit lane into both halves (imm 0x44).
// _MM_SHUFFLE lists selectors highest-first, so KVZ_PERMUTE(0,1,0,1) == _MM_SHUFFLE(1,0,1,0).
fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(1, 0, 1, 0));
temp[0] = _mm256_unpacklo_epi64(row[0], row[1]);
temp[0] = _mm256_maddubs_epi16(temp[0], fir);
@ -398,8 +397,8 @@ int16_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data)
__m128i packed_filter = _mm_loadl_epi64((__m128i*)filter);
sample = _mm_maddubs_epi16(packed_data, packed_filter);
sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(1, 0, 1, 0)));
sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, KVZ_PERMUTE(1, 0, 1, 0)));
// Horizontal add: fold 32-bit element 1 into element 0, then 16-bit element 1 into element 0 (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first, so KVZ_PERMUTE(1,0,1,0) == _MM_SHUFFLE(0,1,0,1).
sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1)));
sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, _MM_SHUFFLE(0, 1, 0, 1)));
return (int16_t)_mm_cvtsi128_si32(sample);
}
@ -413,8 +412,8 @@ int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data)
__m128i packed_filter = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter));
sample = _mm_madd_epi16(packed_data, packed_filter);
sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(2, 3, 0, 1)));
sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(1, 0, 1, 0)));
// Horizontal sum of four 32-bit elements: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(1, 0, 3, 2)));
sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1)));
return _mm_extract_epi32(sample, 0);
}
@ -533,7 +532,7 @@ void kvz_eight_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift
temp0 = _mm256_srai_epi16(temp0, shift);
temp0 = _mm256_permute4x64_epi64(temp0, KVZ_PERMUTE(0, 2, 1, 3));
// Gather quadwords 0 and 2 into the low 128-bit lane (imm 0xD8); _MM_SHUFFLE lists selectors highest-first.
temp0 = _mm256_permute4x64_epi64(temp0, _MM_SHUFFLE(3, 1, 2, 0));
_mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(temp0));
}

View file

@ -30,7 +30,6 @@
#include "kvazaar.h"
#include "strategies/strategies-picture.h"
#include "strategyselector.h"
#include "strategies/strategies-common.h"
#include "strategies/generic/picture-generic.h"
@ -175,9 +174,9 @@ static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur)
row3 = _mm_add_epi16(row2, row3);
row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
// Horizontal sum of eight 16-bit elements into element 0: swap 64-bit halves (0x4E),
// fold 32-bit element 1 into element 0 (0x11), then 16-bit element 1 into element 0 (0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) ));
row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
unsigned sum = _mm_extract_epi16(row3, 0);
unsigned satd = (sum + 1) >> 1;
@ -222,9 +221,9 @@ static void satd_8bit_4x4_dual_avx2(
row3 = _mm256_add_epi16(row2, row3);
row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
// Per-lane horizontal sum of eight 16-bit elements into element 0: swap 64-bit halves (0x4E),
// fold 32-bit element 1 into element 0 (0x11), then 16-bit element 1 into element 0 (0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) ));
row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) ));
unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
sum1 = (sum1 + 1) >> 1;
@ -241,18 +240,18 @@ static INLINE void hor_transform_row_avx2(__m128i* row){
__m128i mask_pos = _mm_set1_epi16(1);
__m128i mask_neg = _mm_set1_epi16(-1);
__m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
__m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
// Swap the 64-bit halves (imm 0x4E); _MM_SHUFFLE lists selectors highest-first.
__m128i temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
*row = _mm_sign_epi16(*row, sign_mask);
*row = _mm_add_epi16(*row, temp);
sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
// Swap adjacent 32-bit elements (imm 0xB1); _MM_SHUFFLE lists selectors highest-first.
temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
*row = _mm_sign_epi16(*row, sign_mask);
*row = _mm_add_epi16(*row, temp);
sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
// Swap adjacent 16-bit elements in both the low and high halves (imm 0xB1).
// _MM_SHUFFLE lists selectors highest-first, so KVZ_PERMUTE(1,0,3,2) == _MM_SHUFFLE(2,3,0,1).
temp = _mm_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1));
temp = _mm_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1));
*row = _mm_sign_epi16(*row, sign_mask);
*row = _mm_add_epi16(*row, temp);
}
@ -262,18 +261,18 @@ static INLINE void hor_transform_row_dual_avx2(__m256i* row){
__m256i mask_pos = _mm256_set1_epi16(1);
__m256i mask_neg = _mm256_set1_epi16(-1);
__m256i sign_mask = _mm256_unpacklo_epi64(mask_pos, mask_neg);
__m256i temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
// Swap the 64-bit halves of each lane (imm 0x4E); _MM_SHUFFLE lists selectors highest-first.
__m256i temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
*row = _mm256_sign_epi16(*row, sign_mask);
*row = _mm256_add_epi16(*row, temp);
sign_mask = _mm256_unpacklo_epi32(mask_pos, mask_neg);
temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
// Swap adjacent 32-bit elements in each lane (imm 0xB1); _MM_SHUFFLE lists selectors highest-first.
temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
*row = _mm256_sign_epi16(*row, sign_mask);
*row = _mm256_add_epi16(*row, temp);
sign_mask = _mm256_unpacklo_epi16(mask_pos, mask_neg);
temp = _mm256_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
temp = _mm256_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
// Swap adjacent 16-bit elements in the low and high halves of each lane (imm 0xB1).
// _MM_SHUFFLE lists selectors highest-first, so KVZ_PERMUTE(1,0,3,2) == _MM_SHUFFLE(2,3,0,1).
temp = _mm256_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1));
temp = _mm256_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1));
*row = _mm256_sign_epi16(*row, sign_mask);
*row = _mm256_add_epi16(*row, temp);
}
@ -357,8 +356,8 @@ INLINE static unsigned sum_block_avx2(__m128i *ver_row)
haddwd_accumulate_avx2(&sad, ver_row + 6);
haddwd_accumulate_avx2(&sad, ver_row + 7);
sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
// Horizontal sum of four 32-bit elements: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2)));
sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1)));
return _mm_cvtsi128_si32(sad);
}
@ -375,8 +374,8 @@ INLINE static void sum_block_dual_avx2(__m256i *ver_row, unsigned *sum0, unsigne
haddwd_accumulate_dual_avx2(&sad, ver_row + 6);
haddwd_accumulate_dual_avx2(&sad, ver_row + 7);
sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
// Per-lane horizontal sum of four 32-bit elements: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2)));
sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1)));
*sum0 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 0));
*sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1));

View file

@ -35,7 +35,6 @@
#include "rdo.h"
#include "scalinglist.h"
#include "strategies/generic/quant-generic.h"
#include "strategies/strategies-common.h"
#include "strategies/strategies-quant.h"
#include "strategyselector.h"
#include "tables.h"
@ -101,8 +100,8 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
}
__m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1));
temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1)));
temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0)));
// Horizontal sum of four 32-bit elements: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(1, 0, 3, 2)));
temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1)));
ac_sum += _mm_cvtsi128_si32(temp);
if (!(encoder->sign_hiding && ac_sum >= 2)) return;

View file

@ -28,7 +28,6 @@
#include "encoderstate.h"
#include "kvazaar.h"
#include "sao.h"
#include "strategies/strategies-common.h"
#include "strategyselector.h"
@ -132,8 +131,8 @@ int kvz_sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *r
//Full horizontal sum
v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1)));
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0)));
// Finish the horizontal sum in the low lane: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2)));
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1)));
sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
return sum;
@ -224,14 +223,14 @@ void kvz_calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec
//Full horizontal sum of accumulated values
v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_diff_accum[eo_cat], 1)));
v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], KVZ_PERMUTE(2, 3, 0, 1)));
v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], KVZ_PERMUTE(1, 0, 1, 0)));
// Finish the horizontal sum in the low lane: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(1, 0, 3, 2)));
v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(0, 1, 0, 1)));
accum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_diff_accum[eo_cat]));
//Full horizontal sum of accumulated values
v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_count[eo_cat], 1)));
v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], KVZ_PERMUTE(2, 3, 0, 1)));
v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], KVZ_PERMUTE(1, 0, 1, 0)));
// Finish the horizontal sum in the low lane: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(1, 0, 3, 2)));
v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(0, 1, 0, 1)));
count += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_count[eo_cat]));
cat_sum_cnt[0][eo_cat] += accum;
@ -335,8 +334,8 @@ int kvz_sao_band_ddistortion_avx2(const encoder_state_t * const state, const kvz
//Full horizontal sum
v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1)));
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0)));
// Finish the horizontal sum in the low lane: swap 64-bit halves (imm 0x4E), then swap adjacent elements (imm 0x11).
// _MM_SHUFFLE lists selectors highest-first: KVZ_PERMUTE(a,b,c,d) == _MM_SHUFFLE(d,c,b,a).
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2)));
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1)));
sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
return sum;

View file

@ -10,8 +10,4 @@
#include "global.h" // IWYU pragma: keep
//Use with shuffle and permutation intrinsics.
//Builds the 8-bit immediate from four 2-bit selectors: a selects the source for
//destination element 0 (bits 1:0), b for element 1, c for element 2 and d for
//element 3 (bits 7:6). Each must be 0, 1, 2 or 3.
//NOTE: the argument order is the reverse of the standard _MM_SHUFFLE macro,
//i.e. KVZ_PERMUTE(a, b, c, d) == _MM_SHUFFLE(d, c, b, a).
//Arguments are parenthesized so that expressions such as (x | y) expand correctly.
#define KVZ_PERMUTE(a, b, c, d) ( ((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6) )
#endif //STRATEGIES_COMMON_H_