From 6bbd3e5a4470a60bce46449307e415a8648e3357 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 29 Nov 2018 15:22:34 +0200
Subject: [PATCH] Optimize rearrange_512 function

---
 src/strategies/avx2/quant-avx2.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index b98f648b..a6ebf373 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -88,13 +88,10 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t
 // (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp)
 static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
 {
-  __m256i tmphi, tmplo;
+  const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
 
-  tmphi = _mm256_shuffle_epi32(*hi, _MM_SHUFFLE(3, 1, 2, 0));
-  tmplo = _mm256_shuffle_epi32(*lo, _MM_SHUFFLE(3, 1, 2, 0));
-
-  tmphi = _mm256_permute4x64_epi64(tmphi, _MM_SHUFFLE(3, 1, 2, 0));
-  tmplo = _mm256_permute4x64_epi64(tmplo, _MM_SHUFFLE(3, 1, 2, 0));
+  __m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask);
+  __m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask);
 
   *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
   *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
@@ -116,6 +113,7 @@ static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
   // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
   // to have the same data layout as in costs. Zero extend to 32b width, shift
   // changes 16 bits to the left, and store them into the same vectors.
+  // TODO: Interleave with unpack instructions instead of zero-extend + shift
   tmp1 = _mm256_cvtepu16_epi32(nslo);
   tmp2 = _mm256_cvtepu16_epi32(chlo);
   tmp2 = _mm256_bslli_epi128(tmp2, 2);