mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00
Optimize rearrange_512 function
This commit is contained in:
parent
cb8209d1b3
commit
6bbd3e5a44
|
@@ -88,13 +88,10 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t
|
||||||
// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp)
|
// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp)
|
||||||
static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
|
static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
|
||||||
{
|
{
|
||||||
__m256i tmphi, tmplo;
|
const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
|
||||||
|
|
||||||
tmphi = _mm256_shuffle_epi32(*hi, _MM_SHUFFLE(3, 1, 2, 0));
|
__m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask);
|
||||||
tmplo = _mm256_shuffle_epi32(*lo, _MM_SHUFFLE(3, 1, 2, 0));
|
__m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask);
|
||||||
|
|
||||||
tmphi = _mm256_permute4x64_epi64(tmphi, _MM_SHUFFLE(3, 1, 2, 0));
|
|
||||||
tmplo = _mm256_permute4x64_epi64(tmplo, _MM_SHUFFLE(3, 1, 2, 0));
|
|
||||||
|
|
||||||
*hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
|
*hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
|
||||||
*lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
|
*lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
|
||||||
|
@@ -116,6 +113,7 @@ static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
|
||||||
// Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
|
// Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
|
||||||
// to have the same data layout as in costs. Zero extend to 32b width, shift
|
// to have the same data layout as in costs. Zero extend to 32b width, shift
|
||||||
// changes 16 bits to the left, and store them into the same vectors.
|
// changes 16 bits to the left, and store them into the same vectors.
|
||||||
|
// TODO: unpack instead of this
|
||||||
tmp1 = _mm256_cvtepu16_epi32(nslo);
|
tmp1 = _mm256_cvtepu16_epi32(nslo);
|
||||||
tmp2 = _mm256_cvtepu16_epi32(chlo);
|
tmp2 = _mm256_cvtepu16_epi32(chlo);
|
||||||
tmp2 = _mm256_bslli_epi128(tmp2, 2);
|
tmp2 = _mm256_bslli_epi128(tmp2, 2);
|
||||||
|
|
Loading…
Reference in a new issue