From c5cd03497e7462e77aeed60a9139b0e345776bf5 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Tue, 13 Nov 2018 18:22:56 +0200
Subject: [PATCH 01/11] Require BMI and ABM instruction sets for AVX2 build

AVX2 support on a processor should always imply BMI and ABM support.
The lzcnt and tzcnt instructions have more suitable semantics in the
corner case that source word is 0, and allow us to even handle that
scenario without a branch. Apparently Visual Studio will already
include this support when building with AVX2 enabled, so only the
automake files need to be tweaked.
---
 configure.ac    | 4 +++-
 src/Makefile.am | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index c481bb03..3b1c9393 100644
--- a/configure.ac
+++ b/configure.ac
@@ -49,9 +49,11 @@ AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"])
 AX_CHECK_COMPILE_FLAG([-mavx2],   [flag_avx2="true"])
 AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"])
 AX_CHECK_COMPILE_FLAG([-msse2],   [flag_sse2="true"])
+AX_CHECK_COMPILE_FLAG([-mbmi],   [flag_bmi="true"])
+AX_CHECK_COMPILE_FLAG([-mabm],   [flag_abm="true"])
 
 AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"])
-AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"])
+AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true"])
 AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"])
 AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"])
 
diff --git a/src/Makefile.am b/src/Makefile.am
index 55d6d115..b10e18c3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -207,7 +207,7 @@ endif #HAVE_PPC
 if HAVE_X86
 
 if HAVE_AVX2
-libavx2_la_CFLAGS = -mavx2
+libavx2_la_CFLAGS = -mavx2 -mbmi -mabm
 endif
 if HAVE_SSE4_1
 libsse41_la_CFLAGS = -msse4.1

From 1befc69a4cefd87b5b80336b8cf710dc9b988053 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 8 Nov 2018 12:57:30 +0200
Subject: [PATCH 02/11] Implement sign bit hiding in AVX2

---
 src/strategies/avx2/quant-avx2.c | 379 +++++++++++++++++++++++--------
 1 file changed, 287 insertions(+), 92 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 651fbeee..029be093 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -40,6 +40,273 @@
 #include "tables.h"
 #include "transform.h"
 
+static INLINE int32_t reduce_mm256i(__m256i src)
+{
+  __m128i a = _mm256_extracti128_si256(src, 0);
+  __m128i b = _mm256_extracti128_si256(src, 1);
+
+  a = _mm_add_epi32(a, b);
+  b = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+
+  a = _mm_add_epi32(a, b);
+  b = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+
+  a = _mm_add_epi32(a, b);
+  return _mm_cvtsi128_si32(a);
+}
+
+static INLINE int32_t reduce_16x_i16(__m256i src)
+{
+  __m128i a = _mm256_extracti128_si256(src, 0);
+  __m128i b = _mm256_extracti128_si256(src, 1);
+  __m256i c = _mm256_cvtepi16_epi32(a);
+  __m256i d = _mm256_cvtepi16_epi32(b);
+
+  c = _mm256_add_epi32(c, d);
+  return reduce_mm256i(c);
+}
+
+// If ints is completely zero, returns 16 in *first and -1 in *last
+static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
+{
+  // Note that nonzero_bytes will always have both bytes set for a set word
+  // even if said word only had one of its bytes set, because we're doing 16
+  // bit wide comparisons. No big deal, just shift results to the right by one
+  // bit to have the results represent indexes of first set words, not bytes.
+  // Another note, it has to use right shift instead of division to preserve
+  // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
+  const __m256i zero = _mm256_setzero_si256();
+
+  __m256i zeros = _mm256_cmpeq_epi16(ints, zero);
+  uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
+  *first = (    (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
+  *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
+}
+
+// Rearranges a 16x32b double vector into a format suitable for a stable SIMD
+// max algorithm:
+// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp)
+static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
+{
+  __m256i tmphi, tmplo;
+
+  tmphi = _mm256_shuffle_epi32(*hi, _MM_SHUFFLE(3, 1, 2, 0));
+  tmplo = _mm256_shuffle_epi32(*lo, _MM_SHUFFLE(3, 1, 2, 0));
+
+  tmphi = _mm256_permute4x64_epi64(tmphi, _MM_SHUFFLE(3, 1, 2, 0));
+  tmplo = _mm256_permute4x64_epi64(tmplo, _MM_SHUFFLE(3, 1, 2, 0));
+
+  *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
+  *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
+}
+
+static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
+    __m256i ns, __m256i changes,
+    int16_t *final_change, int32_t *min_pos)
+{
+  __m128i nslo, nshi, chlo, chhi;
+  __m256i pllo, plhi; // Payload
+  __m256i tmp1, tmp2;
+
+  nshi = _mm256_extracti128_si256(ns, 1);
+  nslo = _mm256_extracti128_si256(ns, 0);
+  chhi = _mm256_extracti128_si256(changes, 1);
+  chlo = _mm256_extracti128_si256(changes, 0);
+
+  // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
+  // to have the same data layout as in costs. Zero extend to 32b width, shift
+  // changes 16 bits to the left, and store them into the same vectors.
+  tmp1 = _mm256_cvtepu16_epi32(nslo);
+  tmp2 = _mm256_cvtepu16_epi32(chlo);
+  tmp2 = _mm256_bslli_epi128(tmp2, 2);
+  pllo = _mm256_or_si256(tmp1, tmp2);
+
+  tmp1 = _mm256_cvtepu16_epi32(nshi);
+  tmp2 = _mm256_cvtepu16_epi32(chhi);
+  tmp2 = _mm256_bslli_epi128(tmp2, 2);
+  plhi = _mm256_or_si256(tmp1, tmp2);
+
+  // Reorder to afford result stability (if multiple atoms tie for cheapest,
+  // rightmost ie. the highest is the wanted one)
+  rearrange_512(&costs_hi, &costs_lo);
+  rearrange_512(&plhi, &pllo);
+
+  // 0: pick hi, 1: pick lo (equality evaluates as 0)
+  __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo);
+  __m256i cost1    = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1);
+  __m256i pl1      = _mm256_blendv_epi8(plhi, pllo, cmpmask1);
+
+  __m256i cost2    = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256i pl2      = _mm256_shuffle_epi32(pl1,   _MM_SHUFFLE(2, 3, 0, 1));
+
+  __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1);
+  __m256i cost3    = _mm256_blendv_epi8(cost2, cost1, cmpmask2);
+  __m256i pl3      = _mm256_blendv_epi8(pl2,   pl1,   cmpmask2);
+
+  __m256i cost4    = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl4      = _mm256_shuffle_epi32(pl3,   _MM_SHUFFLE(1, 0, 3, 2));
+
+  __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3);
+  __m256i cost5    = _mm256_blendv_epi8(cost4, cost3, cmpmask3);
+  __m256i pl5      = _mm256_blendv_epi8(pl4,   pl3,   cmpmask3);
+
+  __m256i cost6    = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl6      = _mm256_permute4x64_epi64(pl5,   _MM_SHUFFLE(1, 0, 3, 2));
+
+  __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5);
+  __m256i pl7      = _mm256_blendv_epi8(pl6,   pl5,   cmpmask4);
+
+  __m128i res128 = _mm256_castsi256_si128(pl7);
+  uint32_t tmp = (uint32_t)_mm_extract_epi32(res128, 0);
+  uint16_t n = (uint16_t)(tmp & 0xffff);
+  uint16_t chng = (uint16_t)(tmp >> 16);
+
+  *final_change = (int16_t)chng;
+  *min_pos = (int32_t)n;
+}
+
+#define VEC_WIDTH 16
+#define SCAN_SET_SIZE 16
+#define LOG2_SCAN_SET_SIZE 4
+
+static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict q_coef_reord, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg)
+{
+  // Ensure that the block is 256 bit (32 byte) aligned
+  assert(subpos % (32 / sizeof(coeff_t)) == 0);
+  assert(((size_t)q_coef_reord) % 32 == 0);
+  assert(SCAN_SET_SIZE == 16);
+
+  __m256i q_coefs = _mm256_load_si256((__m256i *)(q_coef_reord + subpos));
+
+  int32_t first_nz_pos_in_cg, last_nz_pos_in_cg;
+  int32_t abssum = 0;
+
+  // Find first and last nonzero coeffs
+  get_first_last_nz_int16(q_coefs, &first_nz_pos_in_cg, &last_nz_pos_in_cg);
+
+  // Sum all kvz_quant coeffs between first and last
+  abssum = reduce_16x_i16(q_coefs);
+
+  if (last_nz_pos_in_cg >= 0 && last_cg == -1) {
+    last_cg = 1;
+  }
+
+  if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) {
+
+    uint32_t q_coef_signbits = _mm256_movemask_epi8(q_coefs);
+    int32_t signbit = (q_coef_signbits >> (2 * first_nz_pos_in_cg + 1)) & 0x1;
+
+    if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity
+      int32_t min_pos = -1;
+      int16_t final_change = 0;
+
+      const int32_t mask_max = (last_cg == 1) ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1;
+
+      const __m256i zero = _mm256_setzero_si256();
+      const __m256i ones = _mm256_set1_epi16(1);
+      const __m256i maxiters = _mm256_set1_epi16(mask_max);
+      const __m256i ff = _mm256_set1_epi8(0xff);
+
+      const __m256i fnpics = _mm256_set1_epi16((int16_t)first_nz_pos_in_cg);
+      const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+      __m256i block_signbit = _mm256_set1_epi16(((int16_t)signbit) * -1);
+      __m256i coef_signbits = _mm256_cmpgt_epi16(zero, coefs);
+      __m256i signbits_equal_block = _mm256_cmpeq_epi16(coef_signbits, block_signbit);
+
+      __m256i q_coefs_zero = _mm256_cmpeq_epi16(q_coefs, zero);
+
+      __m256i dus_packed = _mm256_packs_epi32(deltas_l, deltas_h);
+      __m256i dus_ordered = _mm256_permute4x64_epi64(dus_packed, _MM_SHUFFLE(3, 1, 2, 0));
+      __m256i dus_positive = _mm256_cmpgt_epi16(dus_ordered, zero);
+
+      __m256i q_coef_abss = _mm256_abs_epi16(q_coefs);
+      __m256i q_coefs_plusminus_one = _mm256_cmpeq_epi16(q_coef_abss, ones);
+
+      __m256i eq_fnpics = _mm256_cmpeq_epi16(fnpics, ns);
+      __m256i lt_fnpics = _mm256_cmpgt_epi16(fnpics, ns);
+
+      __m256i maxcost_subcond1s = _mm256_and_si256(eq_fnpics, q_coefs_plusminus_one);
+      __m256i maxcost_subcond2s = _mm256_andnot_si256(signbits_equal_block, lt_fnpics);
+      __m256i elsecond1s_inv = _mm256_or_si256(dus_positive, maxcost_subcond1s);
+      __m256i elsecond1s = _mm256_andnot_si256(elsecond1s_inv, ff);
+
+      __m256i outside_maxiters = _mm256_cmpgt_epi16(ns, maxiters);
+
+      __m256i negdelta_cond1s = _mm256_andnot_si256(q_coefs_zero, dus_positive);
+      __m256i negdelta_cond2s = _mm256_andnot_si256(maxcost_subcond2s, q_coefs_zero);
+      __m256i negdelta_mask16s_part1 = _mm256_or_si256(negdelta_cond1s, negdelta_cond2s);
+      __m256i negdelta_mask16s = _mm256_andnot_si256(outside_maxiters, negdelta_mask16s_part1);
+
+      __m256i posdelta_mask16s_part1 = _mm256_andnot_si256(q_coefs_zero, elsecond1s);
+      __m256i posdelta_mask16s = _mm256_andnot_si256(outside_maxiters, posdelta_mask16s_part1);
+
+      __m256i maxcost_cond1_parts = _mm256_andnot_si256(dus_positive, maxcost_subcond1s);
+      __m256i maxcost_cond1s = _mm256_andnot_si256(q_coefs_zero, maxcost_cond1_parts);
+      __m256i maxcost_cond2s = _mm256_and_si256(q_coefs_zero, maxcost_subcond2s);
+      __m256i maxcost_mask16s_parts = _mm256_or_si256(maxcost_cond1s, maxcost_cond2s);
+      __m256i maxcost_mask16s = _mm256_or_si256(maxcost_mask16s_parts, outside_maxiters);
+
+      __m128i tmp_l, tmp_h;
+      tmp_l = _mm256_extracti128_si256(negdelta_mask16s, 0);
+      tmp_h = _mm256_extracti128_si256(negdelta_mask16s, 1);
+      __m256i negdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l);
+      __m256i negdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h);
+
+      tmp_l = _mm256_extracti128_si256(posdelta_mask16s, 0);
+      tmp_h = _mm256_extracti128_si256(posdelta_mask16s, 1);
+      __m256i posdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l);
+      __m256i posdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h);
+
+      tmp_l = _mm256_extracti128_si256(maxcost_mask16s, 0);
+      tmp_h = _mm256_extracti128_si256(maxcost_mask16s, 1);
+      __m256i maxcost_mask32s_l = _mm256_cvtepi16_epi32(tmp_l);
+      __m256i maxcost_mask32s_h = _mm256_cvtepi16_epi32(tmp_h);
+
+      // Output value generation
+      // cur_change_max: zero
+      // cur_change_negdelta: ff
+      // cur_change_posdelta: ones
+      __m256i costs_negdelta_h = _mm256_sub_epi32(zero, deltas_h);
+      __m256i costs_negdelta_l = _mm256_sub_epi32(zero, deltas_l);
+      // costs_posdelta_l and _h: deltas_l and _h
+      __m256i costs_max_lh = _mm256_set1_epi32(0x7fffffff);
+
+      __m256i change_neg = _mm256_and_si256(negdelta_mask16s, ones);
+      __m256i change_pos = _mm256_and_si256(posdelta_mask16s, ff);
+      __m256i change_max = _mm256_and_si256(maxcost_mask16s, zero);
+
+      __m256i cost_neg_l = _mm256_and_si256(negdelta_mask32s_l, costs_negdelta_l);
+      __m256i cost_neg_h = _mm256_and_si256(negdelta_mask32s_h, costs_negdelta_h);
+      __m256i cost_pos_l = _mm256_and_si256(posdelta_mask32s_l, deltas_l);
+      __m256i cost_pos_h = _mm256_and_si256(posdelta_mask32s_h, deltas_h);
+      __m256i cost_max_l = _mm256_and_si256(maxcost_mask32s_l, costs_max_lh);
+      __m256i cost_max_h = _mm256_and_si256(maxcost_mask32s_h, costs_max_lh);
+
+      __m256i changes = _mm256_or_si256(change_neg, _mm256_or_si256(change_pos, change_max));
+      __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l));
+      __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h));
+
+      get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos);
+
+      coeff_t cheapest_q = q_coef_reord[min_pos + subpos];
+      if (cheapest_q == 32767 || cheapest_q == -32768)
+        final_change = -1;
+
+      uint32_t coef_signs = _mm256_movemask_epi8(coef_signbits);
+      uint32_t cheapest_coef_sign_mask = (uint32_t)(1 << (2 * min_pos));
+
+      if (!(coef_signs & cheapest_coef_sign_mask))
+        q_coef[scan[min_pos + subpos]] += final_change;
+      else
+        q_coef[scan[min_pos + subpos]] -= final_change;
+    } // Hide
+  }
+  if (last_cg == 1)
+    last_cg = 0;
+
+  return last_cg;
+}
 
 /**
  * \brief quantize transformed coefficents
@@ -64,11 +331,12 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t
 
   uint32_t ac_sum = 0;
+  int32_t last_cg = -1;
 
   __m256i v_ac_sum = _mm256_setzero_si256();
   __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]);
 
-  for (int32_t n = 0; n < width * height; n += 16) {
+  for (int32_t n = 0; n < width * height; n += VEC_WIDTH) {
 
     __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
     __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level);
@@ -104,15 +372,23 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1)));
   ac_sum += _mm_cvtsi128_si32(temp);
 
-  if (!encoder->cfg.signhide_enable || ac_sum < 2) return;
+  if (!encoder->cfg.signhide_enable || ac_sum < 2)
+    return;
 
-  int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2];
+  coeff_t coef_reord[LCU_WIDTH * LCU_WIDTH >> 2] ALIGNED(32);
+  coeff_t q_coef_reord[LCU_WIDTH * LCU_WIDTH >> 2] ALIGNED(32);
 
-  for (int32_t n = 0; n < width * height; n += 16) {
+  // Reorder coef and q_coef for sequential access
+  for (int32_t n = 0; n < width * height; n++) {
+    coef_reord[n] = coef[scan[n]];
+    q_coef_reord[n] = q_coef[scan[n]];
+  }
 
-    __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
+  assert(VEC_WIDTH == SCAN_SET_SIZE);
+  for (int32_t subpos = (width * height - 1) & (~(VEC_WIDTH - 1)); subpos >= 0; subpos -= VEC_WIDTH) {
 
-    v_level = _mm256_abs_epi16(v_level);
+    __m256i v_coef = _mm256_load_si256((__m256i *)(coef_reord + subpos));
+    __m256i v_level = _mm256_abs_epi16(v_coef);
     __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
     __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));
 
@@ -130,7 +406,6 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
 
     v_level = _mm256_packs_epi32(v_level32_a, v_level32_b);
 
-    __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n]));
     __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
     __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
     __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
@@ -142,95 +417,15 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
     v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8);
     v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8);
     
-    _mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a));
-    _mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1));
-    _mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b));
-    _mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1));
+    __m256i deltas_h = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x31);
+    __m256i deltas_l = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x20);
+
+    last_cg = hide_block_sign(v_coef, q_coef_reord, deltas_h, deltas_l, q_coef, scan, subpos, last_cg);
   }
 
-  if (ac_sum >= 2) {
-#define SCAN_SET_SIZE 16
-#define LOG2_SCAN_SET_SIZE 4
-    int32_t n, last_cg = -1, abssum = 0, subset, subpos;
-    for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) {
-      int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1;
-      subpos = subset << LOG2_SCAN_SET_SIZE;
-      abssum = 0;
-
-      // Find last coeff pos
-      for (n = SCAN_SET_SIZE - 1; n >= 0; n--)  {
-        if (q_coef[scan[n + subpos]])  {
-          last_nz_pos_in_cg = n;
-          break;
-        }
-      }
-
-      // First coeff pos
-      for (n = 0; n <SCAN_SET_SIZE; n++) {
-        if (q_coef[scan[n + subpos]]) {
-          first_nz_pos_in_cg = n;
-          break;
-        }
-      }
-
-      // Sum all kvz_quant coeffs between first and last
-      for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) {
-        abssum += q_coef[scan[n + subpos]];
-      }
-
-      if (last_nz_pos_in_cg >= 0 && last_cg == -1) {
-        last_cg = 1;
-      }
-
-      if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) {
-        int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1);
-        if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity
-          int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff;
-          int16_t final_change = 0, cur_change = 0;
-          for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) {
-            uint32_t blkPos = scan[n + subpos];
-            if (q_coef[blkPos] != 0) {
-              if (delta_u[blkPos] > 0) {
-                cur_cost = -delta_u[blkPos];
-                cur_change = 1;
-              }
-              else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) {
-                cur_cost = 0x7fffffff;
-              }
-              else {
-                cur_cost = delta_u[blkPos];
-                cur_change = -1;
-              }
-            }
-            else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 0 : 1) != signbit) {
-              cur_cost = 0x7fffffff;
-            }
-            else {
-              cur_cost = -delta_u[blkPos];
-              cur_change = 1;
-            }
-
-            if (cur_cost < min_cost_inc) {
-              min_cost_inc = cur_cost;
-              final_change = cur_change;
-              min_pos = blkPos;
-            }
-          } // CG loop
-
-          if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) {
-            final_change = -1;
-          }
-
-          if (coef[min_pos] >= 0) q_coef[min_pos] += final_change;
-          else q_coef[min_pos] -= final_change;
-        } // Hide
-      }
-      if (last_cg == 1) last_cg = 0;
-    }
-
+#undef VEC_WIDTH
 #undef SCAN_SET_SIZE
 #undef LOG2_SCAN_SET_SIZE
-  }
 }
 
 static INLINE __m128i get_residual_4x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){

From 316cd8a846538c6d77ed1ce2becaacb553ba4e19 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 29 Nov 2018 14:25:33 +0200
Subject: [PATCH 03/11] Fix ALIGNED keyword and grow alignment to 64B

---
 src/strategies/avx2/quant-avx2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 029be093..00f57a44 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -375,8 +375,8 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   if (!encoder->cfg.signhide_enable || ac_sum < 2)
     return;
 
-  coeff_t coef_reord[LCU_WIDTH * LCU_WIDTH >> 2] ALIGNED(32);
-  coeff_t q_coef_reord[LCU_WIDTH * LCU_WIDTH >> 2] ALIGNED(32);
+  ALIGNED(64) coeff_t coef_reord[LCU_WIDTH * LCU_WIDTH >> 2];
+  ALIGNED(64) coeff_t q_coef_reord[LCU_WIDTH * LCU_WIDTH >> 2];
 
   // Reorder coef and q_coef for sequential access
   for (int32_t n = 0; n < width * height; n++) {

From 7cf4c7ae5fbc398cd9e65a83c8a49ae75be6a2c9 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 29 Nov 2018 15:21:09 +0200
Subject: [PATCH 04/11] Rename "reduce" functions to hsum

That's what the functions fundamendally do anyway
---
 src/strategies/avx2/quant-avx2.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 00f57a44..9fc81342 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -40,7 +40,7 @@
 #include "tables.h"
 #include "transform.h"
 
-static INLINE int32_t reduce_mm256i(__m256i src)
+static INLINE int32_t hsum32_8x32i(__m256i src)
 {
   __m128i a = _mm256_extracti128_si256(src, 0);
   __m128i b = _mm256_extracti128_si256(src, 1);
@@ -55,7 +55,7 @@ static INLINE int32_t reduce_mm256i(__m256i src)
   return _mm_cvtsi128_si32(a);
 }
 
-static INLINE int32_t reduce_16x_i16(__m256i src)
+static INLINE int32_t hsum32_16x16i(__m256i src)
 {
   __m128i a = _mm256_extracti128_si256(src, 0);
   __m128i b = _mm256_extracti128_si256(src, 1);
@@ -63,7 +63,7 @@ static INLINE int32_t reduce_16x_i16(__m256i src)
   __m256i d = _mm256_cvtepi16_epi32(b);
 
   c = _mm256_add_epi32(c, d);
-  return reduce_mm256i(c);
+  return hsum32_8x32i(c);
 }
 
 // If ints is completely zero, returns 16 in *first and -1 in *last
@@ -185,7 +185,7 @@ static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict
   get_first_last_nz_int16(q_coefs, &first_nz_pos_in_cg, &last_nz_pos_in_cg);
 
   // Sum all kvz_quant coeffs between first and last
-  abssum = reduce_16x_i16(q_coefs);
+  abssum = hsum32_16x16i(q_coefs);
 
   if (last_nz_pos_in_cg >= 0 && last_cg == -1) {
     last_cg = 1;

From cb8209d1b35cbd407d874596c7f169927168a266 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 29 Nov 2018 15:22:50 +0200
Subject: [PATCH 05/11] Vectorize transform coefficient reordering loop

---
 src/strategies/avx2/quant-avx2.c | 101 +++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 6 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 9fc81342..b98f648b 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -312,12 +312,12 @@ static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict
  * \brief quantize transformed coefficents
  *
  */
-void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
+void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restrict coef, coeff_t * __restrict q_coef, int32_t width,
   int32_t height, int8_t type, int8_t scan_idx, int8_t block_type)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
-  const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
+  const uint32_t * const  __restrict scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
   int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
   const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
@@ -328,6 +328,31 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
   const int32_t q_bits8 = q_bits - 8;
 
+  // For vectorized reordering of coef and q_coef
+  const __m128i low128_shuffle_masks[3] = {
+    _mm_setr_epi8(10,11,  4, 5, 12,13,  0, 1,  6, 7, 14,15,  8, 9,  2, 3),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5,  6, 7,  0, 1,  2, 3, 12,13, 14,15,  8, 9, 10,11),
+  };
+
+  const __m128i blend_masks[3] = {
+    _mm_setr_epi16( 0,  0,  0, -1,  0,  0, -1, -1),
+    _mm_setr_epi16( 0,  0,  0,  0,  0,  0,  0,  0),
+    _mm_setr_epi16( 0,  0, -1, -1,  0,  0, -1, -1),
+  };
+
+  const __m128i invec_rearr_masks_upper[3] = {
+    _mm_setr_epi8( 0, 1,  8, 9,  2, 3,  6, 7, 10,11,  4, 5, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1,  8, 9,  4, 5, 12,13,  2, 3, 10,11,  6, 7, 14,15),
+  };
+
+  const __m128i invec_rearr_masks_lower[3] = {
+    _mm_setr_epi8(12,13,  6, 7,  0, 1,  2, 3, 14,15,  4, 5,  8, 9, 10,11),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5, 12,13,  0, 1,  8, 9,  6, 7, 14,15,  2, 3, 10,11),
+  };
+
   assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t
 
   uint32_t ac_sum = 0;
@@ -378,10 +403,74 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   ALIGNED(64) coeff_t coef_reord[LCU_WIDTH * LCU_WIDTH >> 2];
   ALIGNED(64) coeff_t q_coef_reord[LCU_WIDTH * LCU_WIDTH >> 2];
 
-  // Reorder coef and q_coef for sequential access
-  for (int32_t n = 0; n < width * height; n++) {
-    coef_reord[n] = coef[scan[n]];
-    q_coef_reord[n] = q_coef[scan[n]];
+  /*
+   * Reorder coef and q_coef for sequential access
+   * Fun fact: Once upon a time, this loop looked like this:
+   * for (int32_t n = 0; n < width * height; n++) {
+   *   coef_reord[n] = coef[scan[n]];
+   *   q_coef_reord[n] = q_coef[scan[n]];
+   * }
+   */
+  for (int32_t i = 0; i < width * height; i += SCAN_SET_SIZE) {
+    const size_t row_offsets[4] = {
+      scan[i] + width * 0,
+      scan[i] + width * 1,
+      scan[i] + width * 2,
+      scan[i] + width * 3,
+    };
+
+    __m128d   coefs_d_upper;
+    __m128d   coefs_d_lower;
+    __m128d q_coefs_d_upper;
+    __m128d q_coefs_d_lower;
+
+    __m128i   coefs_upper;
+    __m128i   coefs_lower;
+    __m128i q_coefs_upper;
+    __m128i q_coefs_lower;
+
+    __m128i   coefs_rearr1_upper;
+    __m128i   coefs_rearr1_lower;
+    __m128i q_coefs_rearr1_upper;
+    __m128i q_coefs_rearr1_lower;
+
+    __m128i   coefs_rearr2_upper;
+    __m128i   coefs_rearr2_lower;
+    __m128i q_coefs_rearr2_upper;
+    __m128i q_coefs_rearr2_lower;
+
+    coefs_d_upper   = _mm_loadl_pd(coefs_d_upper,   (double *)(coef   + row_offsets[0]));
+    coefs_d_upper   = _mm_loadh_pd(coefs_d_upper,   (double *)(coef   + row_offsets[1]));
+    q_coefs_d_upper = _mm_loadl_pd(q_coefs_d_upper, (double *)(q_coef + row_offsets[0]));
+    q_coefs_d_upper = _mm_loadh_pd(q_coefs_d_upper, (double *)(q_coef + row_offsets[1]));
+
+    coefs_d_lower   = _mm_loadl_pd(coefs_d_lower,   (double *)(coef   + row_offsets[2]));
+    coefs_d_lower   = _mm_loadh_pd(coefs_d_lower,   (double *)(coef   + row_offsets[3]));
+    q_coefs_d_lower = _mm_loadl_pd(q_coefs_d_lower, (double *)(q_coef + row_offsets[2]));
+    q_coefs_d_lower = _mm_loadh_pd(q_coefs_d_lower, (double *)(q_coef + row_offsets[3]));
+
+    coefs_upper     = _mm_castpd_si128(coefs_d_upper);
+    coefs_lower     = _mm_castpd_si128(coefs_d_lower);
+    q_coefs_upper   = _mm_castpd_si128(q_coefs_d_upper);
+    q_coefs_lower   = _mm_castpd_si128(q_coefs_d_lower);
+
+    coefs_lower     = _mm_shuffle_epi8(coefs_lower,   low128_shuffle_masks[scan_idx]);
+    q_coefs_lower   = _mm_shuffle_epi8(q_coefs_lower, low128_shuffle_masks[scan_idx]);
+
+    coefs_rearr1_upper   = _mm_blendv_epi8(coefs_upper,   coefs_lower,   blend_masks[scan_idx]);
+    coefs_rearr1_lower   = _mm_blendv_epi8(coefs_lower,   coefs_upper,   blend_masks[scan_idx]);
+    q_coefs_rearr1_upper = _mm_blendv_epi8(q_coefs_upper, q_coefs_lower, blend_masks[scan_idx]);
+    q_coefs_rearr1_lower = _mm_blendv_epi8(q_coefs_lower, q_coefs_upper, blend_masks[scan_idx]);
+
+    coefs_rearr2_upper   = _mm_shuffle_epi8(coefs_rearr1_upper,   invec_rearr_masks_upper[scan_idx]);
+    coefs_rearr2_lower   = _mm_shuffle_epi8(coefs_rearr1_lower,   invec_rearr_masks_lower[scan_idx]);
+    q_coefs_rearr2_upper = _mm_shuffle_epi8(q_coefs_rearr1_upper, invec_rearr_masks_upper[scan_idx]);
+    q_coefs_rearr2_lower = _mm_shuffle_epi8(q_coefs_rearr1_lower, invec_rearr_masks_lower[scan_idx]);
+
+    _mm_store_si128((__m128i *)(coef_reord   + i    ),   coefs_rearr2_upper);
+    _mm_store_si128((__m128i *)(coef_reord   + i + 8),   coefs_rearr2_lower);
+    _mm_store_si128((__m128i *)(q_coef_reord + i    ), q_coefs_rearr2_upper);
+    _mm_store_si128((__m128i *)(q_coef_reord + i + 8), q_coefs_rearr2_lower);
   }
 
   assert(VEC_WIDTH == SCAN_SET_SIZE);

From 6bbd3e5a4470a60bce46449307e415a8648e3357 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 29 Nov 2018 15:22:34 +0200
Subject: [PATCH 06/11] Optimize rearrange_512 function

---
 src/strategies/avx2/quant-avx2.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index b98f648b..a6ebf373 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -88,13 +88,10 @@ static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t
 // (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp)
 static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
 {
-  __m256i tmphi, tmplo;
+  const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
 
-  tmphi = _mm256_shuffle_epi32(*hi, _MM_SHUFFLE(3, 1, 2, 0));
-  tmplo = _mm256_shuffle_epi32(*lo, _MM_SHUFFLE(3, 1, 2, 0));
-
-  tmphi = _mm256_permute4x64_epi64(tmphi, _MM_SHUFFLE(3, 1, 2, 0));
-  tmplo = _mm256_permute4x64_epi64(tmplo, _MM_SHUFFLE(3, 1, 2, 0));
+  __m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask);
+  __m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask);
 
   *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
   *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
@@ -116,6 +113,7 @@ static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
   // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
   // to have the same data layout as in costs. Zero extend to 32b width, shift
   // changes 16 bits to the left, and store them into the same vectors.
+  // TODO: unpack instead of this
   tmp1 = _mm256_cvtepu16_epi32(nslo);
   tmp2 = _mm256_cvtepu16_epi32(chlo);
   tmp2 = _mm256_bslli_epi128(tmp2, 2);

From 7fe454c51ffb455e147350a5ec77f912e44d6aea Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Fri, 30 Nov 2018 00:40:40 +0200
Subject: [PATCH 07/11] Optimize get_cheapest_alternative()

---
 src/strategies/avx2/quant-avx2.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index a6ebf373..c2a0b5d5 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -101,28 +101,14 @@ static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
     __m256i ns, __m256i changes,
     int16_t *final_change, int32_t *min_pos)
 {
-  __m128i nslo, nshi, chlo, chhi;
-  __m256i pllo, plhi; // Payload
-  __m256i tmp1, tmp2;
-
-  nshi = _mm256_extracti128_si256(ns, 1);
-  nslo = _mm256_extracti128_si256(ns, 0);
-  chhi = _mm256_extracti128_si256(changes, 1);
-  chlo = _mm256_extracti128_si256(changes, 0);
-
   // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
   // to have the same data layout as in costs. Zero extend to 32b width, shift
   // changes 16 bits to the left, and store them into the same vectors.
-  // TODO: unpack instead of this
-  tmp1 = _mm256_cvtepu16_epi32(nslo);
-  tmp2 = _mm256_cvtepu16_epi32(chlo);
-  tmp2 = _mm256_bslli_epi128(tmp2, 2);
-  pllo = _mm256_or_si256(tmp1, tmp2);
+  __m256i tmphi = _mm256_unpackhi_epi16(ns, changes);
+  __m256i tmplo = _mm256_unpacklo_epi16(ns, changes);
 
-  tmp1 = _mm256_cvtepu16_epi32(nshi);
-  tmp2 = _mm256_cvtepu16_epi32(chhi);
-  tmp2 = _mm256_bslli_epi128(tmp2, 2);
-  plhi = _mm256_or_si256(tmp1, tmp2);
+  __m256i plhi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
+  __m256i pllo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
 
   // Reorder to afford result stability (if multiple atoms tie for cheapest,
   // rightmost ie. the highest is the wanted one)

From d9591f1b49554f0a4aa2c08f5636d4ee80276089 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Fri, 30 Nov 2018 16:15:34 +0200
Subject: [PATCH 08/11] Eliminate midway buffering of reordered coefs

TODO: For some mysterious reason seems slightly slower than the
buffered one
---
 src/strategies/avx2/quant-avx2.c | 109 ++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 47 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index c2a0b5d5..14b07b22 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -98,70 +98,85 @@ static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
 }
 
 static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
-    __m256i ns, __m256i changes,
-    int16_t *final_change, int32_t *min_pos)
+    __m256i ns, __m256i changes, __m256i q_coefs,
+    int16_t *final_change, int32_t *min_pos, int16_t *cheapest_q)
 {
+  const __m256i zero = _mm256_set1_epi8(0);
+
   // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
   // to have the same data layout as in costs. Zero extend to 32b width, shift
   // changes 16 bits to the left, and store them into the same vectors.
-  __m256i tmphi = _mm256_unpackhi_epi16(ns, changes);
-  __m256i tmplo = _mm256_unpacklo_epi16(ns, changes);
+  __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes);
+  __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes);
 
-  __m256i plhi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
-  __m256i pllo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
+  __m256i tmp2hi = _mm256_unpackhi_epi16(q_coefs, zero);
+  __m256i tmp2lo = _mm256_unpacklo_epi16(q_coefs, zero);
+
+  __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31);
+  __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20);
+
+  __m256i pl2hi = _mm256_permute2x128_si256(tmp2lo, tmp2hi, 0x31);
+  __m256i pl2lo = _mm256_permute2x128_si256(tmp2lo, tmp2hi, 0x20);
 
   // Reorder to afford result stability (if multiple atoms tie for cheapest,
   // rightmost ie. the highest is the wanted one)
   rearrange_512(&costs_hi, &costs_lo);
-  rearrange_512(&plhi, &pllo);
+  rearrange_512(&pl1hi, &pl1lo);
+  rearrange_512(&pl2hi, &pl2lo);
 
   // 0: pick hi, 1: pick lo (equality evaluates as 0)
   __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo);
   __m256i cost1    = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1);
-  __m256i pl1      = _mm256_blendv_epi8(plhi, pllo, cmpmask1);
+  __m256i pl1_1    = _mm256_blendv_epi8(pl1hi,    pl1lo,    cmpmask1);
+  __m256i pl2_1    = _mm256_blendv_epi8(pl2hi,    pl2lo,    cmpmask1);
 
   __m256i cost2    = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256i pl2      = _mm256_shuffle_epi32(pl1,   _MM_SHUFFLE(2, 3, 0, 1));
+  __m256i pl1_2    = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256i pl2_2    = _mm256_shuffle_epi32(pl2_1, _MM_SHUFFLE(2, 3, 0, 1));
 
   __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1);
   __m256i cost3    = _mm256_blendv_epi8(cost2, cost1, cmpmask2);
-  __m256i pl3      = _mm256_blendv_epi8(pl2,   pl1,   cmpmask2);
+  __m256i pl1_3    = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2);
+  __m256i pl2_3    = _mm256_blendv_epi8(pl2_2, pl2_1, cmpmask2);
 
   __m256i cost4    = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i pl4      = _mm256_shuffle_epi32(pl3,   _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl1_4    = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl2_4    = _mm256_shuffle_epi32(pl2_3, _MM_SHUFFLE(1, 0, 3, 2));
 
   __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3);
   __m256i cost5    = _mm256_blendv_epi8(cost4, cost3, cmpmask3);
-  __m256i pl5      = _mm256_blendv_epi8(pl4,   pl3,   cmpmask3);
+  __m256i pl1_5    = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3);
+  __m256i pl2_5    = _mm256_blendv_epi8(pl2_4, pl2_3, cmpmask3);
 
   __m256i cost6    = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i pl6      = _mm256_permute4x64_epi64(pl5,   _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl1_6    = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl2_6    = _mm256_permute4x64_epi64(pl2_5, _MM_SHUFFLE(1, 0, 3, 2));
 
   __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5);
-  __m256i pl7      = _mm256_blendv_epi8(pl6,   pl5,   cmpmask4);
+  __m256i pl1_7    = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4);
+  __m256i pl2_7    = _mm256_blendv_epi8(pl2_6, pl2_5, cmpmask4);
 
-  __m128i res128 = _mm256_castsi256_si128(pl7);
-  uint32_t tmp = (uint32_t)_mm_extract_epi32(res128, 0);
-  uint16_t n = (uint16_t)(tmp & 0xffff);
-  uint16_t chng = (uint16_t)(tmp >> 16);
+  __m128i res1_128 = _mm256_castsi256_si128(pl1_7);
+  __m128i res2_128 = _mm256_castsi256_si128(pl2_7);
+  uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0);
+  uint32_t tmp2 = (uint32_t)_mm_extract_epi32(res2_128, 0);
+  uint16_t n = (uint16_t)(tmp1 & 0xffff);
+  uint16_t chng = (uint16_t)(tmp1 >> 16);
+  uint16_t chpst = (uint16_t)(tmp2 & 0xffff);
 
   *final_change = (int16_t)chng;
   *min_pos = (int32_t)n;
+  *cheapest_q = (int16_t)chpst;
 }
 
 #define VEC_WIDTH 16
 #define SCAN_SET_SIZE 16
 #define LOG2_SCAN_SET_SIZE 4
 
-static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict q_coef_reord, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg)
+static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg)
 {
-  // Ensure that the block is 256 bit (32 byte) aligned
-  assert(subpos % (32 / sizeof(coeff_t)) == 0);
-  assert(((size_t)q_coef_reord) % 32 == 0);
   assert(SCAN_SET_SIZE == 16);
 
-  __m256i q_coefs = _mm256_load_si256((__m256i *)(q_coef_reord + subpos));
-
   int32_t first_nz_pos_in_cg, last_nz_pos_in_cg;
   int32_t abssum = 0;
 
@@ -181,8 +196,9 @@ static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict
     int32_t signbit = (q_coef_signbits >> (2 * first_nz_pos_in_cg + 1)) & 0x1;
 
     if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity
-      int32_t min_pos = -1;
-      int16_t final_change = 0;
+      int32_t min_pos;
+      int16_t final_change;
+      int16_t cheapest_q;
 
       const int32_t mask_max = (last_cg == 1) ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1;
 
@@ -271,9 +287,8 @@ static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict
       __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l));
       __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h));
 
-      get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos);
+      get_cheapest_alternative(costs_h, costs_l, ns, changes, q_coefs, &final_change, &min_pos, &cheapest_q);
 
-      coeff_t cheapest_q = q_coef_reord[min_pos + subpos];
       if (cheapest_q == 32767 || cheapest_q == -32768)
         final_change = -1;
 
@@ -281,9 +296,11 @@ static INLINE int32_t hide_block_sign(__m256i coefs, const coeff_t * __restrict
       uint32_t cheapest_coef_sign_mask = (uint32_t)(1 << (2 * min_pos));
 
       if (!(coef_signs & cheapest_coef_sign_mask))
-        q_coef[scan[min_pos + subpos]] += final_change;
+        cheapest_q += final_change;
       else
-        q_coef[scan[min_pos + subpos]] -= final_change;
+        cheapest_q -= final_change;
+
+      q_coef[scan[min_pos + subpos]] = cheapest_q;
     } // Hide
   }
   if (last_cg == 1)
@@ -384,9 +401,6 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
   if (!encoder->cfg.signhide_enable || ac_sum < 2)
     return;
 
-  ALIGNED(64) coeff_t coef_reord[LCU_WIDTH * LCU_WIDTH >> 2];
-  ALIGNED(64) coeff_t q_coef_reord[LCU_WIDTH * LCU_WIDTH >> 2];
-
   /*
    * Reorder coef and q_coef for sequential access
    * Fun fact: Once upon a time, this loop looked like this:
@@ -395,12 +409,13 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
    *   q_coef_reord[n] = q_coef[scan[n]];
    * }
    */
-  for (int32_t i = 0; i < width * height; i += SCAN_SET_SIZE) {
+  assert(VEC_WIDTH == SCAN_SET_SIZE);
+  for (int32_t subpos = (width * height - 1) & (~(VEC_WIDTH - 1)); subpos >= 0; subpos -= VEC_WIDTH) {
     const size_t row_offsets[4] = {
-      scan[i] + width * 0,
-      scan[i] + width * 1,
-      scan[i] + width * 2,
-      scan[i] + width * 3,
+      scan[subpos] + width * 0,
+      scan[subpos] + width * 1,
+      scan[subpos] + width * 2,
+      scan[subpos] + width * 3,
     };
 
     __m128d   coefs_d_upper;
@@ -451,16 +466,16 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
     q_coefs_rearr2_upper = _mm_shuffle_epi8(q_coefs_rearr1_upper, invec_rearr_masks_upper[scan_idx]);
     q_coefs_rearr2_lower = _mm_shuffle_epi8(q_coefs_rearr1_lower, invec_rearr_masks_lower[scan_idx]);
 
-    _mm_store_si128((__m128i *)(coef_reord   + i    ),   coefs_rearr2_upper);
-    _mm_store_si128((__m128i *)(coef_reord   + i + 8),   coefs_rearr2_lower);
-    _mm_store_si128((__m128i *)(q_coef_reord + i    ), q_coefs_rearr2_upper);
-    _mm_store_si128((__m128i *)(q_coef_reord + i + 8), q_coefs_rearr2_lower);
-  }
+    // Why, oh why, is there no _mm256_setr_m128i intrinsic in the header that
+    // would do the exact same operation in the exact same way? :(
+    __m256i v_coef = _mm256_insertf128_si256(_mm256_castsi128_si256(coefs_rearr2_upper),
+                                             coefs_rearr2_lower,
+                                             1);
 
-  assert(VEC_WIDTH == SCAN_SET_SIZE);
-  for (int32_t subpos = (width * height - 1) & (~(VEC_WIDTH - 1)); subpos >= 0; subpos -= VEC_WIDTH) {
+    __m256i q_coefs = _mm256_insertf128_si256(_mm256_castsi128_si256(q_coefs_rearr2_upper),
+                                              q_coefs_rearr2_lower, 
+                                              1);
 
-    __m256i v_coef = _mm256_load_si256((__m256i *)(coef_reord + subpos));
     __m256i v_level = _mm256_abs_epi16(v_coef);
     __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
     __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));
@@ -493,7 +508,7 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
     __m256i deltas_h = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x31);
     __m256i deltas_l = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x20);
 
-    last_cg = hide_block_sign(v_coef, q_coef_reord, deltas_h, deltas_l, q_coef, scan, subpos, last_cg);
+    last_cg = hide_block_sign(v_coef, q_coefs, deltas_h, deltas_l, q_coef, scan, subpos, last_cg);
   }
 
 #undef VEC_WIDTH

From f78bf2ebcb5c6c3817a4a7d1873c52bc00034c49 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Fri, 30 Nov 2018 17:15:26 +0200
Subject: [PATCH 09/11] Optimize q_coefs usage for indexed fetch

---
 src/strategies/avx2/quant-avx2.c | 32 ++++++++------------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 14b07b22..d3b44672 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -98,75 +98,55 @@ static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
 }
 
 static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
-    __m256i ns, __m256i changes, __m256i q_coefs,
-    int16_t *final_change, int32_t *min_pos, int16_t *cheapest_q)
+    __m256i ns, __m256i changes,
+    int16_t *final_change, int32_t *min_pos)
 {
-  const __m256i zero = _mm256_set1_epi8(0);
-
   // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
   // to have the same data layout as in costs. Zero extend to 32b width, shift
   // changes 16 bits to the left, and store them into the same vectors.
   __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes);
   __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes);
 
-  __m256i tmp2hi = _mm256_unpackhi_epi16(q_coefs, zero);
-  __m256i tmp2lo = _mm256_unpacklo_epi16(q_coefs, zero);
-
   __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31);
   __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20);
 
-  __m256i pl2hi = _mm256_permute2x128_si256(tmp2lo, tmp2hi, 0x31);
-  __m256i pl2lo = _mm256_permute2x128_si256(tmp2lo, tmp2hi, 0x20);
-
   // Reorder to afford result stability (if multiple atoms tie for cheapest,
   // rightmost ie. the highest is the wanted one)
   rearrange_512(&costs_hi, &costs_lo);
   rearrange_512(&pl1hi, &pl1lo);
-  rearrange_512(&pl2hi, &pl2lo);
 
   // 0: pick hi, 1: pick lo (equality evaluates as 0)
   __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo);
   __m256i cost1    = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1);
   __m256i pl1_1    = _mm256_blendv_epi8(pl1hi,    pl1lo,    cmpmask1);
-  __m256i pl2_1    = _mm256_blendv_epi8(pl2hi,    pl2lo,    cmpmask1);
 
   __m256i cost2    = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1));
   __m256i pl1_2    = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256i pl2_2    = _mm256_shuffle_epi32(pl2_1, _MM_SHUFFLE(2, 3, 0, 1));
 
   __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1);
   __m256i cost3    = _mm256_blendv_epi8(cost2, cost1, cmpmask2);
   __m256i pl1_3    = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2);
-  __m256i pl2_3    = _mm256_blendv_epi8(pl2_2, pl2_1, cmpmask2);
 
   __m256i cost4    = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i pl1_4    = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i pl2_4    = _mm256_shuffle_epi32(pl2_3, _MM_SHUFFLE(1, 0, 3, 2));
 
   __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3);
   __m256i cost5    = _mm256_blendv_epi8(cost4, cost3, cmpmask3);
   __m256i pl1_5    = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3);
-  __m256i pl2_5    = _mm256_blendv_epi8(pl2_4, pl2_3, cmpmask3);
 
   __m256i cost6    = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i pl1_6    = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i pl2_6    = _mm256_permute4x64_epi64(pl2_5, _MM_SHUFFLE(1, 0, 3, 2));
 
   __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5);
   __m256i pl1_7    = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4);
-  __m256i pl2_7    = _mm256_blendv_epi8(pl2_6, pl2_5, cmpmask4);
 
   __m128i res1_128 = _mm256_castsi256_si128(pl1_7);
-  __m128i res2_128 = _mm256_castsi256_si128(pl2_7);
   uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0);
-  uint32_t tmp2 = (uint32_t)_mm_extract_epi32(res2_128, 0);
   uint16_t n = (uint16_t)(tmp1 & 0xffff);
   uint16_t chng = (uint16_t)(tmp1 >> 16);
-  uint16_t chpst = (uint16_t)(tmp2 & 0xffff);
 
   *final_change = (int16_t)chng;
   *min_pos = (int32_t)n;
-  *cheapest_q = (int16_t)chpst;
 }
 
 #define VEC_WIDTH 16
@@ -287,8 +267,10 @@ static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i de
       __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l));
       __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h));
 
-      get_cheapest_alternative(costs_h, costs_l, ns, changes, q_coefs, &final_change, &min_pos, &cheapest_q);
+      get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos);
+      const int32_t best_id = scan[min_pos + subpos];
 
+      cheapest_q = q_coef[best_id];
       if (cheapest_q == 32767 || cheapest_q == -32768)
         final_change = -1;
 
@@ -300,7 +282,7 @@ static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i de
       else
         cheapest_q -= final_change;
 
-      q_coef[scan[min_pos + subpos]] = cheapest_q;
+      q_coef[best_id] = cheapest_q;
     } // Hide
   }
   if (last_cg == 1)
@@ -476,6 +458,8 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
                                               q_coefs_rearr2_lower, 
                                               1);
 
+    // Reordering done
+
     __m256i v_level = _mm256_abs_epi16(v_coef);
     __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
     __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));

From c4655780484c3c2cf9a41b782dbecd5f44c97fb1 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Mon, 3 Dec 2018 14:41:41 +0200
Subject: [PATCH 10/11] Add a descriptive comment to coefficient reordering

---
 src/strategies/avx2/quant-avx2.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index d3b44672..473cd610 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -400,6 +400,10 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
       scan[subpos] + width * 3,
     };
 
+    // NOTE: Upper means "higher in pixel order inside block", which implies
+    // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
+    // so upper 128b vector actually becomes the lower part of a 256-bit coeff
+    // vector and lower vector the higher part!
     __m128d   coefs_d_upper;
     __m128d   coefs_d_lower;
     __m128d q_coefs_d_upper;

From b6b89672fd85f5f4c52b3fc6df46a91800b7838a Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Thu, 8 Nov 2018 12:58:25 +0200
Subject: [PATCH 11/11] Include Vim swap files in .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a47e0b61..70f5bd1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,6 +42,7 @@ Makefile.in
 *.lo
 *.o
 *.trs
+.*.swp
 
 *.log
 .kdev4