Replace KVZ_PERMUTE with _MM_SHUFFLE

The exact same macro already exists.

parent 68eef660bd
commit ce77bfa15b
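For context, both macros pack four 2-bit element indices into the 8-bit immediate taken by the x86 shuffle/permute intrinsics. A minimal sketch of the correspondence, using the KVZ_PERMUTE definition from the last hunk below and _MM_SHUFFLE as defined in xmmintrin.h; note that the two macros list their indices in opposite argument order:

#include <xmmintrin.h> /* _MM_SHUFFLE(fp3, fp2, fp1, fp0) */

/* KVZ_PERMUTE as removed from strategies-common.h by this commit. */
#define KVZ_PERMUTE(a, b, c, d) ( (a << 0) | (b << 2) | (c << 4) | (d << 6) )

/* _MM_SHUFFLE lists element indices from highest to lowest, KVZ_PERMUTE
 * from lowest to highest, so the same immediate is obtained by reversing
 * the arguments: KVZ_PERMUTE(a, b, c, d) == _MM_SHUFFLE(d, c, b, a). */
_Static_assert(KVZ_PERMUTE(0, 2, 1, 3) == _MM_SHUFFLE(3, 1, 2, 0), "same imm");
_Static_assert(KVZ_PERMUTE(1, 0, 1, 0) == _MM_SHUFFLE(0, 1, 0, 1), "same imm");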
@@ -25,7 +25,6 @@
 #include <stdlib.h>
 #include "kvazaar.h"
-#include "strategies/strategies-common.h"
 #include "strategyselector.h"
 
 
 
@@ -252,14 +251,14 @@ static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sam
     int rx = 0;
     int ry = y;
 
-    row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-    row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-    row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-    row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-    row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-    row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-    row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-    row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+    row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(0,2,1,3));
+    row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(1,3,0,2));
+    row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(0,2,1,3));
+    row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(1,3,0,2));
+    row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(0,2,1,3));
+    row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(1,3,0,2));
+    row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(0,2,1,3));
+    row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(1,3,0,2));
 
     _mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0));
     _mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1));
@@ -343,14 +342,14 @@ static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl
   } else {
 
     //Move all filtered pixels to the lower lane to reduce memory accesses
-    row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-    row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-    row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-    row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-    row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-    row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-    row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-    row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+    row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(0,2,1,3));
+    row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(1,3,0,2));
+    row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(0,2,1,3));
+    row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(1,3,0,2));
+    row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(0,2,1,3));
+    row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(1,3,0,2));
+    row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(0,2,1,3));
+    row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(1,3,0,2));
 
     _mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0));
     _mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1));
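In both hunks above, the permutes implement the in-code comment "Move all filtered pixels to the lower lane to reduce memory accesses": the useful 64-bit chunks of each 256-bit row are gathered into the low 128-bit lane so a single 128-bit store can write them out. A sketch of the idiom, assuming AVX2 and that the useful data sits in 64-bit elements 0 and 2 (store_low_qwords is a hypothetical helper, not part of the codebase):

#include <immintrin.h>
#include <stdint.h>

/* Gather 64-bit elements 0 and 2 of v into the low 128-bit lane and
 * store them with one unaligned 128-bit store. The immediate selects
 * 64-bit elements here because the intrinsic is permute4x64. */
static void store_low_qwords(uint8_t *dst, __m256i v)
{
  v = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); /* 0,2 -> low lane */
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(v));
}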
@@ -34,7 +34,6 @@
 #include "strategies/generic/picture-generic.h"
 #include "strategies/strategies-ipol.h"
 #include "strategyselector.h"
-#include "strategies/strategies-common.h"
 #include "strategies/generic/ipol-generic.h"
 
 
@@ -99,12 +98,12 @@ static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *fil
   temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]);
   temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]);
   temp[0] = _mm_add_epi32(temp_lo, temp_hi);
-  temp[0] = _mm_shuffle_epi32(temp[0], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(0, 2, 1, 3));
 
   temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]);
   temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]);
   temp[4] = _mm_add_epi32(temp_lo, temp_hi);
-  temp[4] = _mm_shuffle_epi32(temp[4], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(0, 2, 1, 3));
 
   __m128i add = _mm_set1_epi32(offset23);
   temp[0] = _mm_add_epi32(temp[0], add);
@@ -152,12 +151,12 @@ static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t
   temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]);
   temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]);
   temp[0] = _mm256_add_epi32(temp_lo, temp_hi);
-  temp[0] = _mm256_shuffle_epi32(temp[0], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(0, 2, 1, 3));
 
   temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]);
   temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]);
   temp[4] = _mm256_add_epi32(temp_lo, temp_hi);
-  temp[4] = _mm256_shuffle_epi32(temp[4], KVZ_PERMUTE(0, 2, 1, 3));
+  temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(0, 2, 1, 3));
 
   __m256i add = _mm256_set1_epi32(offset23);
   temp[0] = _mm256_add_epi32(temp[0], add);
@@ -205,7 +204,7 @@ static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filt
 {
   __m256i temp[4];
   __m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1);
-  fir = _mm256_shuffle_epi32(fir, KVZ_PERMUTE(0, 1, 0, 1));
+  fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(0, 1, 0, 1));
 
   temp[0] = _mm256_unpacklo_epi64(row[0], row[1]);
   temp[0] = _mm256_maddubs_epi16(temp[0], fir);
@@ -398,8 +397,8 @@ int16_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data)
   __m128i packed_filter = _mm_loadl_epi64((__m128i*)filter);
 
   sample = _mm_maddubs_epi16(packed_data, packed_filter);
-  sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(1, 0, 1, 0)));
-  sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, KVZ_PERMUTE(1, 0, 1, 0)));
+  sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(1, 0, 1, 0)));
+  sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, _MM_SHUFFLE(1, 0, 1, 0)));
 
   return (int16_t)_mm_cvtsi128_si32(sample);
 }
@@ -413,8 +412,8 @@ int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data)
   __m128i packed_filter = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter));
 
   sample = _mm_madd_epi16(packed_data, packed_filter);
-  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(2, 3, 0, 1)));
-  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, KVZ_PERMUTE(1, 0, 1, 0)));
+  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(2, 3, 0, 1)));
+  sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(1, 0, 1, 0)));
 
   return _mm_extract_epi32(sample, 0);
 }
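The pairs of shuffle+add lines in the two hunks above are the standard log2-step horizontal reduction: each add folds the upper half of the remaining elements onto the lower half, leaving the total in element 0. A self-contained sketch for four 32-bit lanes, assuming SSE2 (hsum_epi32 is a hypothetical helper, not part of the codebase):

#include <emmintrin.h>

/* Horizontal sum of the four 32-bit elements of v. */
static int hsum_epi32(__m128i v)
{
  v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2))); /* fold elements 2,3 onto 0,1 */
  v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1))); /* fold element 1 onto 0 */
  return _mm_cvtsi128_si32(v);                                         /* element 0 holds the total */
}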
@@ -533,7 +532,7 @@ void kvz_eight_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift
 
   temp0 = _mm256_srai_epi16(temp0, shift);
 
-  temp0 = _mm256_permute4x64_epi64(temp0, KVZ_PERMUTE(0, 2, 1, 3));
+  temp0 = _mm256_permute4x64_epi64(temp0, _MM_SHUFFLE(0, 2, 1, 3));
 
   _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(temp0));
 }
@@ -30,7 +30,6 @@
 #include "kvazaar.h"
 #include "strategies/strategies-picture.h"
 #include "strategyselector.h"
-#include "strategies/strategies-common.h"
 #include "strategies/generic/picture-generic.h"
 
 
@@ -175,9 +174,9 @@ static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur)
 
   row3 = _mm_add_epi16(row2, row3);
 
-  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
-  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
-  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
+  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(2, 3, 0, 1) ));
+  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 1, 0) ));
+  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, _MM_SHUFFLE(1, 0, 1, 0) ));
 
   unsigned sum = _mm_extract_epi16(row3, 0);
   unsigned satd = (sum + 1) >> 1;
@@ -222,9 +221,9 @@ static void satd_8bit_4x4_dual_avx2(
 
   row3 = _mm256_add_epi16(row2, row3);
 
-  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
-  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
-  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(2, 3, 0, 1) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 1, 0) ));
+  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, _MM_SHUFFLE(1, 0, 1, 0) ));
 
   unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
   sum1 = (sum1 + 1) >> 1;
@@ -241,18 +240,18 @@ static INLINE void hor_transform_row_avx2(__m128i* row){
   __m128i mask_pos = _mm_set1_epi16(1);
   __m128i mask_neg = _mm_set1_epi16(-1);
   __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
-  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
+  __m128i temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 
   sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
-  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
+  temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 
   sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
-  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
-  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
+  temp = _mm_shufflelo_epi16(*row, _MM_SHUFFLE(1,0,3,2));
+  temp = _mm_shufflehi_epi16(temp, _MM_SHUFFLE(1,0,3,2));
   *row = _mm_sign_epi16(*row, sign_mask);
   *row = _mm_add_epi16(*row, temp);
 }
@@ -262,18 +261,18 @@ static INLINE void hor_transform_row_dual_avx2(__m256i* row){
   __m256i mask_pos = _mm256_set1_epi16(1);
   __m256i mask_neg = _mm256_set1_epi16(-1);
   __m256i sign_mask = _mm256_unpacklo_epi64(mask_pos, mask_neg);
-  __m256i temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
+  __m256i temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 
   sign_mask = _mm256_unpacklo_epi32(mask_pos, mask_neg);
-  temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
+  temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 
   sign_mask = _mm256_unpacklo_epi16(mask_pos, mask_neg);
-  temp = _mm256_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
-  temp = _mm256_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
+  temp = _mm256_shufflelo_epi16(*row, _MM_SHUFFLE(1,0,3,2));
+  temp = _mm256_shufflehi_epi16(temp, _MM_SHUFFLE(1,0,3,2));
   *row = _mm256_sign_epi16(*row, sign_mask);
   *row = _mm256_add_epi16(*row, temp);
 }
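Each of the three stages in hor_transform_row_avx2 / hor_transform_row_dual_avx2 above is one butterfly level of the horizontal Hadamard transform: a sign mask negates the second element of every pair, and adding the pair-swapped copy yields the sum and the difference in a single add. A sketch of the innermost 16-bit stage, assuming SSSE3 for _mm_sign_epi16 (butterfly_epi16 is a hypothetical helper, not part of the codebase):

#include <tmmintrin.h>

/* For each adjacent 16-bit pair (a, b), produce (a + b, a - b). */
static __m128i butterfly_epi16(__m128i row)
{
  const __m128i sign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);      /* +,-,+,-,... from element 0 up */
  __m128i swapped = _mm_shufflelo_epi16(row, _MM_SHUFFLE(2, 3, 0, 1)); /* swap pairs, low half */
  swapped = _mm_shufflehi_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));     /* swap pairs, high half */
  /* (a, -b) + (b, a) = (a + b, a - b) */
  return _mm_add_epi16(_mm_sign_epi16(row, sign), swapped);
}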
@@ -357,8 +356,8 @@ INLINE static unsigned sum_block_avx2(__m128i *ver_row)
   haddwd_accumulate_avx2(&sad, ver_row + 6);
   haddwd_accumulate_avx2(&sad, ver_row + 7);
 
-  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
-  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
+  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(2, 3, 0, 1)));
+  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 1, 0)));
 
   return _mm_cvtsi128_si32(sad);
 }
@@ -375,8 +374,8 @@ INLINE static void sum_block_dual_avx2(__m256i *ver_row, unsigned *sum0, unsigne
   haddwd_accumulate_dual_avx2(&sad, ver_row + 6);
   haddwd_accumulate_dual_avx2(&sad, ver_row + 7);
 
-  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
-  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));
+  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(2, 3, 0, 1)));
+  sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 1, 0)));
 
   *sum0 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 0));
   *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1));
@@ -35,7 +35,6 @@
 #include "rdo.h"
 #include "scalinglist.h"
 #include "strategies/generic/quant-generic.h"
-#include "strategies/strategies-common.h"
 #include "strategies/strategies-quant.h"
 #include "strategyselector.h"
 #include "tables.h"
@@ -101,8 +100,8 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   }
 
   __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1));
-  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1)));
-  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0)));
+  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(2, 3, 0, 1)));
+  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(1, 0, 1, 0)));
   ac_sum += _mm_cvtsi128_si32(temp);
 
   if (!(encoder->sign_hiding && ac_sum >= 2)) return;
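The reduction in the hunk above first combines the two 128-bit lanes of the 256-bit accumulator (the castsi256_si128 + extracti128 add), then finishes with the usual 4-element shuffle reduction. A sketch assuming AVX2 (hsum_epi32_avx2 is a hypothetical helper, not part of the codebase):

#include <immintrin.h>

/* Horizontal sum of the eight 32-bit elements of v. */
static int hsum_epi32_avx2(__m256i v)
{
  /* Fold the high 128-bit lane onto the low one. */
  __m128i s = _mm_add_epi32(_mm256_castsi256_si128(v),
                            _mm256_extracti128_si256(v, 1));
  /* Finish with the 128-bit reduction. */
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2)));
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtsi128_si32(s);
}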
@@ -28,7 +28,6 @@
 #include "encoderstate.h"
 #include "kvazaar.h"
 #include "sao.h"
-#include "strategies/strategies-common.h"
 #include "strategyselector.h"
 
 
@@ -132,8 +131,8 @@ int kvz_sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *r
 
   //Full horizontal sum
   v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(2, 3, 0, 1)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 1, 0)));
   sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
 
   return sum;
@@ -224,14 +223,14 @@ void kvz_calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec
 
     //Full horizontal sum of accumulated values
     v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_diff_accum[eo_cat], 1)));
-    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], KVZ_PERMUTE(2, 3, 0, 1)));
-    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], KVZ_PERMUTE(1, 0, 1, 0)));
+    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(2, 3, 0, 1)));
+    v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(1, 0, 1, 0)));
     accum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_diff_accum[eo_cat]));
 
     //Full horizontal sum of accumulated values
     v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_count[eo_cat], 1)));
-    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], KVZ_PERMUTE(2, 3, 0, 1)));
-    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], KVZ_PERMUTE(1, 0, 1, 0)));
+    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(2, 3, 0, 1)));
+    v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(1, 0, 1, 0)));
     count += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_count[eo_cat]));
 
     cat_sum_cnt[0][eo_cat] += accum;
@@ -335,8 +334,8 @@ int kvz_sao_band_ddistortion_avx2(const encoder_state_t * const state, const kvz
 
   //Full horizontal sum
   v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1)));
-  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(2, 3, 0, 1)));
+  v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 1, 0)));
   sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum));
 
   return sum;
@@ -10,8 +10,4 @@
 #include "global.h" // IWYU pragma: keep
 
 
-//Use with shuffle and permutation intrinsics.
-//Parameters are indices to packed elements. Each must be 0, 1, 2 or 3.
-#define KVZ_PERMUTE(a, b, c, d) ( (a << 0) | (b << 2) | (c << 4) | (d << 6) )
-
 #endif //STRATEGIES_COMMON_H_