Merge branch 'sign-hiding-avx2' into 'master'

Sign hiding avx2 See merge request TIE/ultravideo/kvazaar!2
2024-11-24 02:24:07 +00:00 · 2018-12-10 14:24:40 +02:00 · 2018-12-10 14:24:40 +02:00 · c2906de114
parent a5a10a33c3 b6b89672fd
commit c2906de114
4 changed files with 370 additions and 96 deletions
--- a/.gitignore
+++ b/.gitignore
@ -42,6 +42,7 @@ Makefile.in
 *.lo
 *.o
 *.trs
+.*.swp

 *.log
 .kdev4
--- a/configure.ac
+++ b/configure.ac
@ -49,9 +49,11 @@ AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"])
 AX_CHECK_COMPILE_FLAG([-mavx2],   [flag_avx2="true"])
 AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"])
 AX_CHECK_COMPILE_FLAG([-msse2],   [flag_sse2="true"])
+AX_CHECK_COMPILE_FLAG([-mbmi],   [flag_bmi="true"])
+AX_CHECK_COMPILE_FLAG([-mabm],   [flag_abm="true"])

 AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"])
-AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"])
+AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true"])
 AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"])
 AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"])

--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -207,7 +207,7 @@ endif #HAVE_PPC
 if HAVE_X86

 if HAVE_AVX2
-libavx2_la_CFLAGS = -mavx2
+libavx2_la_CFLAGS = -mavx2 -mbmi -mabm
 endif
 if HAVE_SSE4_1
 libsse41_la_CFLAGS = -msse4.1
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@ -40,17 +40,267 @@
 #include "tables.h"
 #include "transform.h"

+static INLINE int32_t hsum32_8x32i(__m256i src)
+{
+  __m128i a = _mm256_extracti128_si256(src, 0);
+  __m128i b = _mm256_extracti128_si256(src, 1);
+
+  a = _mm_add_epi32(a, b);
+  b = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+
+  a = _mm_add_epi32(a, b);
+  b = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+
+  a = _mm_add_epi32(a, b);
+  return _mm_cvtsi128_si32(a);
+}
+
+static INLINE int32_t hsum32_16x16i(__m256i src)
+{
+  __m128i a = _mm256_extracti128_si256(src, 0);
+  __m128i b = _mm256_extracti128_si256(src, 1);
+  __m256i c = _mm256_cvtepi16_epi32(a);
+  __m256i d = _mm256_cvtepi16_epi32(b);
+
+  c = _mm256_add_epi32(c, d);
+  return hsum32_8x32i(c);
+}
+
+// If ints is completely zero, returns 16 in *first and -1 in *last
+static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
+{
+  // Note that nonzero_bytes will always have both bytes set for a set word
+  // even if said word only had one of its bytes set, because we're doing 16
+  // bit wide comparisons. No big deal, just shift results to the right by one
+  // bit to have the results represent indexes of first set words, not bytes.
+  // Another note, it has to use right shift instead of division to preserve
+  // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
+  const __m256i zero = _mm256_setzero_si256();
+
+  __m256i zeros = _mm256_cmpeq_epi16(ints, zero);
+  uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
+  *first = (    (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
+  *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
+}
+
+// Rearranges a 16x32b double vector into a format suitable for a stable SIMD
+// max algorithm:
+// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp)
+static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
+{
+  const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
+
+  __m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask);
+  __m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask);
+
+  *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31);
+  *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20);
+}
+
+static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
+    __m256i ns, __m256i changes,
+    int16_t *final_change, int32_t *min_pos)
+{
+  // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
+  // to have the same data layout as in costs. Zero extend to 32b width, shift
+  // changes 16 bits to the left, and store them into the same vectors.
+  __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes);
+  __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes);
+
+  __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31);
+  __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20);
+
+  // Reorder to afford result stability (if multiple atoms tie for cheapest,
+  // rightmost ie. the highest is the wanted one)
+  rearrange_512(&costs_hi, &costs_lo);
+  rearrange_512(&pl1hi, &pl1lo);
+
+  // 0: pick hi, 1: pick lo (equality evaluates as 0)
+  __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo);
+  __m256i cost1    = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1);
+  __m256i pl1_1    = _mm256_blendv_epi8(pl1hi,    pl1lo,    cmpmask1);
+
+  __m256i cost2    = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256i pl1_2    = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1));
+
+  __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1);
+  __m256i cost3    = _mm256_blendv_epi8(cost2, cost1, cmpmask2);
+  __m256i pl1_3    = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2);
+
+  __m256i cost4    = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl1_4    = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2));
+
+  __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3);
+  __m256i cost5    = _mm256_blendv_epi8(cost4, cost3, cmpmask3);
+  __m256i pl1_5    = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3);
+
+  __m256i cost6    = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2));
+  __m256i pl1_6    = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2));
+
+  __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5);
+  __m256i pl1_7    = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4);
+
+  __m128i res1_128 = _mm256_castsi256_si128(pl1_7);
+  uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0);
+  uint16_t n = (uint16_t)(tmp1 & 0xffff);
+  uint16_t chng = (uint16_t)(tmp1 >> 16);
+
+  *final_change = (int16_t)chng;
+  *min_pos = (int32_t)n;
+}
+
+#define VEC_WIDTH 16
+#define SCAN_SET_SIZE 16
+#define LOG2_SCAN_SET_SIZE 4
+
+static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg)
+{
+  assert(SCAN_SET_SIZE == 16);
+
+  int32_t first_nz_pos_in_cg, last_nz_pos_in_cg;
+  int32_t abssum = 0;
+
+  // Find first and last nonzero coeffs
+  get_first_last_nz_int16(q_coefs, &first_nz_pos_in_cg, &last_nz_pos_in_cg);
+
+  // Sum all kvz_quant coeffs between first and last
+  abssum = hsum32_16x16i(q_coefs);
+
+  if (last_nz_pos_in_cg >= 0 && last_cg == -1) {
+    last_cg = 1;
+  }
+
+  if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) {
+
+    uint32_t q_coef_signbits = _mm256_movemask_epi8(q_coefs);
+    int32_t signbit = (q_coef_signbits >> (2 * first_nz_pos_in_cg + 1)) & 0x1;
+
+    if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity
+      int32_t min_pos;
+      int16_t final_change;
+      int16_t cheapest_q;
+
+      const int32_t mask_max = (last_cg == 1) ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1;
+
+      const __m256i zero = _mm256_setzero_si256();
+      const __m256i ones = _mm256_set1_epi16(1);
+      const __m256i maxiters = _mm256_set1_epi16(mask_max);
+      const __m256i ff = _mm256_set1_epi8(0xff);
+
+      const __m256i fnpics = _mm256_set1_epi16((int16_t)first_nz_pos_in_cg);
+      const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+      __m256i block_signbit = _mm256_set1_epi16(((int16_t)signbit) * -1);
+      __m256i coef_signbits = _mm256_cmpgt_epi16(zero, coefs);
+      __m256i signbits_equal_block = _mm256_cmpeq_epi16(coef_signbits, block_signbit);
+
+      __m256i q_coefs_zero = _mm256_cmpeq_epi16(q_coefs, zero);
+
+      __m256i dus_packed = _mm256_packs_epi32(deltas_l, deltas_h);
+      __m256i dus_ordered = _mm256_permute4x64_epi64(dus_packed, _MM_SHUFFLE(3, 1, 2, 0));
+      __m256i dus_positive = _mm256_cmpgt_epi16(dus_ordered, zero);
+
+      __m256i q_coef_abss = _mm256_abs_epi16(q_coefs);
+      __m256i q_coefs_plusminus_one = _mm256_cmpeq_epi16(q_coef_abss, ones);
+
+      __m256i eq_fnpics = _mm256_cmpeq_epi16(fnpics, ns);
+      __m256i lt_fnpics = _mm256_cmpgt_epi16(fnpics, ns);
+
+      __m256i maxcost_subcond1s = _mm256_and_si256(eq_fnpics, q_coefs_plusminus_one);
+      __m256i maxcost_subcond2s = _mm256_andnot_si256(signbits_equal_block, lt_fnpics);
+      __m256i elsecond1s_inv = _mm256_or_si256(dus_positive, maxcost_subcond1s);
+      __m256i elsecond1s = _mm256_andnot_si256(elsecond1s_inv, ff);
+
+      __m256i outside_maxiters = _mm256_cmpgt_epi16(ns, maxiters);
+
+      __m256i negdelta_cond1s = _mm256_andnot_si256(q_coefs_zero, dus_positive);
+      __m256i negdelta_cond2s = _mm256_andnot_si256(maxcost_subcond2s, q_coefs_zero);
+      __m256i negdelta_mask16s_part1 = _mm256_or_si256(negdelta_cond1s, negdelta_cond2s);
+      __m256i negdelta_mask16s = _mm256_andnot_si256(outside_maxiters, negdelta_mask16s_part1);
+
+      __m256i posdelta_mask16s_part1 = _mm256_andnot_si256(q_coefs_zero, elsecond1s);
+      __m256i posdelta_mask16s = _mm256_andnot_si256(outside_maxiters, posdelta_mask16s_part1);
+
+      __m256i maxcost_cond1_parts = _mm256_andnot_si256(dus_positive, maxcost_subcond1s);
+      __m256i maxcost_cond1s = _mm256_andnot_si256(q_coefs_zero, maxcost_cond1_parts);
+      __m256i maxcost_cond2s = _mm256_and_si256(q_coefs_zero, maxcost_subcond2s);
+      __m256i maxcost_mask16s_parts = _mm256_or_si256(maxcost_cond1s, maxcost_cond2s);
+      __m256i maxcost_mask16s = _mm256_or_si256(maxcost_mask16s_parts, outside_maxiters);
+
+      __m128i tmp_l, tmp_h;
+      tmp_l = _mm256_extracti128_si256(negdelta_mask16s, 0);
+      tmp_h = _mm256_extracti128_si256(negdelta_mask16s, 1);
+      __m256i negdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l);
+      __m256i negdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h);
+
+      tmp_l = _mm256_extracti128_si256(posdelta_mask16s, 0);
+      tmp_h = _mm256_extracti128_si256(posdelta_mask16s, 1);
+      __m256i posdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l);
+      __m256i posdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h);
+
+      tmp_l = _mm256_extracti128_si256(maxcost_mask16s, 0);
+      tmp_h = _mm256_extracti128_si256(maxcost_mask16s, 1);
+      __m256i maxcost_mask32s_l = _mm256_cvtepi16_epi32(tmp_l);
+      __m256i maxcost_mask32s_h = _mm256_cvtepi16_epi32(tmp_h);
+
+      // Output value generation
+      // cur_change_max: zero
+      // cur_change_negdelta: ff
+      // cur_change_posdelta: ones
+      __m256i costs_negdelta_h = _mm256_sub_epi32(zero, deltas_h);
+      __m256i costs_negdelta_l = _mm256_sub_epi32(zero, deltas_l);
+      // costs_posdelta_l and _h: deltas_l and _h
+      __m256i costs_max_lh = _mm256_set1_epi32(0x7fffffff);
+
+      __m256i change_neg = _mm256_and_si256(negdelta_mask16s, ones);
+      __m256i change_pos = _mm256_and_si256(posdelta_mask16s, ff);
+      __m256i change_max = _mm256_and_si256(maxcost_mask16s, zero);
+
+      __m256i cost_neg_l = _mm256_and_si256(negdelta_mask32s_l, costs_negdelta_l);
+      __m256i cost_neg_h = _mm256_and_si256(negdelta_mask32s_h, costs_negdelta_h);
+      __m256i cost_pos_l = _mm256_and_si256(posdelta_mask32s_l, deltas_l);
+      __m256i cost_pos_h = _mm256_and_si256(posdelta_mask32s_h, deltas_h);
+      __m256i cost_max_l = _mm256_and_si256(maxcost_mask32s_l, costs_max_lh);
+      __m256i cost_max_h = _mm256_and_si256(maxcost_mask32s_h, costs_max_lh);
+
+      __m256i changes = _mm256_or_si256(change_neg, _mm256_or_si256(change_pos, change_max));
+      __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l));
+      __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h));
+
+      get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos);
+      const int32_t best_id = scan[min_pos + subpos];
+
+      cheapest_q = q_coef[best_id];
+      if (cheapest_q == 32767 || cheapest_q == -32768)
+        final_change = -1;
+
+      uint32_t coef_signs = _mm256_movemask_epi8(coef_signbits);
+      uint32_t cheapest_coef_sign_mask = (uint32_t)(1 << (2 * min_pos));
+
+      if (!(coef_signs & cheapest_coef_sign_mask))
+        cheapest_q += final_change;
+      else
+        cheapest_q -= final_change;
+
+      q_coef[best_id] = cheapest_q;
+    } // Hide
+  }
+  if (last_cg == 1)
+    last_cg = 0;
+
+  return last_cg;
+}

 /**
 * \brief quantize transformed coefficents
 *
 */
-void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
+void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restrict coef, coeff_t * __restrict q_coef, int32_t width,
  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type)
 {
  const encoder_control_t * const encoder = state->encoder_control;
  const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
-  const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
+  const uint32_t * const  __restrict scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];

  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
  const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
@ -61,14 +311,40 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
  const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
  const int32_t q_bits8 = q_bits - 8;

+  // For vectorized reordering of coef and q_coef
+  const __m128i low128_shuffle_masks[3] = {
+    _mm_setr_epi8(10,11,  4, 5, 12,13,  0, 1,  6, 7, 14,15,  8, 9,  2, 3),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5,  6, 7,  0, 1,  2, 3, 12,13, 14,15,  8, 9, 10,11),
+  };
+
+  const __m128i blend_masks[3] = {
+    _mm_setr_epi16( 0,  0,  0, -1,  0,  0, -1, -1),
+    _mm_setr_epi16( 0,  0,  0,  0,  0,  0,  0,  0),
+    _mm_setr_epi16( 0,  0, -1, -1,  0,  0, -1, -1),
+  };
+
+  const __m128i invec_rearr_masks_upper[3] = {
+    _mm_setr_epi8( 0, 1,  8, 9,  2, 3,  6, 7, 10,11,  4, 5, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1,  8, 9,  4, 5, 12,13,  2, 3, 10,11,  6, 7, 14,15),
+  };
+
+  const __m128i invec_rearr_masks_lower[3] = {
+    _mm_setr_epi8(12,13,  6, 7,  0, 1,  2, 3, 14,15,  4, 5,  8, 9, 10,11),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5, 12,13,  0, 1,  8, 9,  6, 7, 14,15,  2, 3, 10,11),
+  };
+
  assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t

  uint32_t ac_sum = 0;
+  int32_t last_cg = -1;

  __m256i v_ac_sum = _mm256_setzero_si256();
  __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]);

-  for (int32_t n = 0; n < width * height; n += 16) {
+  for (int32_t n = 0; n < width * height; n += VEC_WIDTH) {

    __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
    __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level);
@ -104,15 +380,91 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1)));
  ac_sum += _mm_cvtsi128_si32(temp);

-  if (!encoder->cfg.signhide_enable || ac_sum < 2) return;
+  if (!encoder->cfg.signhide_enable || ac_sum < 2)
+    return;

-  int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2];
+  /*
+   * Reorder coef and q_coef for sequential access
+   * Fun fact: Once upon a time, this loop looked like this:
+   * for (int32_t n = 0; n < width * height; n++) {
+   *   coef_reord[n] = coef[scan[n]];
+   *   q_coef_reord[n] = q_coef[scan[n]];
+   * }
+   */
+  assert(VEC_WIDTH == SCAN_SET_SIZE);
+  for (int32_t subpos = (width * height - 1) & (~(VEC_WIDTH - 1)); subpos >= 0; subpos -= VEC_WIDTH) {
+    const size_t row_offsets[4] = {
+      scan[subpos] + width * 0,
+      scan[subpos] + width * 1,
+      scan[subpos] + width * 2,
+      scan[subpos] + width * 3,
+    };

-  for (int32_t n = 0; n < width * height; n += 16) {
+    // NOTE: Upper means "higher in pixel order inside block", which implies
+    // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
+    // so upper 128b vector actually becomes the lower part of a 256-bit coeff
+    // vector and lower vector the higher part!
+    __m128d   coefs_d_upper;
+    __m128d   coefs_d_lower;
+    __m128d q_coefs_d_upper;
+    __m128d q_coefs_d_lower;

-    __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
+    __m128i   coefs_upper;
+    __m128i   coefs_lower;
+    __m128i q_coefs_upper;
+    __m128i q_coefs_lower;

-    v_level = _mm256_abs_epi16(v_level);
+    __m128i   coefs_rearr1_upper;
+    __m128i   coefs_rearr1_lower;
+    __m128i q_coefs_rearr1_upper;
+    __m128i q_coefs_rearr1_lower;
+
+    __m128i   coefs_rearr2_upper;
+    __m128i   coefs_rearr2_lower;
+    __m128i q_coefs_rearr2_upper;
+    __m128i q_coefs_rearr2_lower;
+
+    coefs_d_upper   = _mm_loadl_pd(coefs_d_upper,   (double *)(coef   + row_offsets[0]));
+    coefs_d_upper   = _mm_loadh_pd(coefs_d_upper,   (double *)(coef   + row_offsets[1]));
+    q_coefs_d_upper = _mm_loadl_pd(q_coefs_d_upper, (double *)(q_coef + row_offsets[0]));
+    q_coefs_d_upper = _mm_loadh_pd(q_coefs_d_upper, (double *)(q_coef + row_offsets[1]));
+
+    coefs_d_lower   = _mm_loadl_pd(coefs_d_lower,   (double *)(coef   + row_offsets[2]));
+    coefs_d_lower   = _mm_loadh_pd(coefs_d_lower,   (double *)(coef   + row_offsets[3]));
+    q_coefs_d_lower = _mm_loadl_pd(q_coefs_d_lower, (double *)(q_coef + row_offsets[2]));
+    q_coefs_d_lower = _mm_loadh_pd(q_coefs_d_lower, (double *)(q_coef + row_offsets[3]));
+
+    coefs_upper     = _mm_castpd_si128(coefs_d_upper);
+    coefs_lower     = _mm_castpd_si128(coefs_d_lower);
+    q_coefs_upper   = _mm_castpd_si128(q_coefs_d_upper);
+    q_coefs_lower   = _mm_castpd_si128(q_coefs_d_lower);
+
+    coefs_lower     = _mm_shuffle_epi8(coefs_lower,   low128_shuffle_masks[scan_idx]);
+    q_coefs_lower   = _mm_shuffle_epi8(q_coefs_lower, low128_shuffle_masks[scan_idx]);
+
+    coefs_rearr1_upper   = _mm_blendv_epi8(coefs_upper,   coefs_lower,   blend_masks[scan_idx]);
+    coefs_rearr1_lower   = _mm_blendv_epi8(coefs_lower,   coefs_upper,   blend_masks[scan_idx]);
+    q_coefs_rearr1_upper = _mm_blendv_epi8(q_coefs_upper, q_coefs_lower, blend_masks[scan_idx]);
+    q_coefs_rearr1_lower = _mm_blendv_epi8(q_coefs_lower, q_coefs_upper, blend_masks[scan_idx]);
+
+    coefs_rearr2_upper   = _mm_shuffle_epi8(coefs_rearr1_upper,   invec_rearr_masks_upper[scan_idx]);
+    coefs_rearr2_lower   = _mm_shuffle_epi8(coefs_rearr1_lower,   invec_rearr_masks_lower[scan_idx]);
+    q_coefs_rearr2_upper = _mm_shuffle_epi8(q_coefs_rearr1_upper, invec_rearr_masks_upper[scan_idx]);
+    q_coefs_rearr2_lower = _mm_shuffle_epi8(q_coefs_rearr1_lower, invec_rearr_masks_lower[scan_idx]);
+
+    // Why, oh why, is there no _mm256_setr_m128i intrinsic in the header that
+    // would do the exact same operation in the exact same way? :(
+    __m256i v_coef = _mm256_insertf128_si256(_mm256_castsi128_si256(coefs_rearr2_upper),
+                                             coefs_rearr2_lower,
+                                             1);
+
+    __m256i q_coefs = _mm256_insertf128_si256(_mm256_castsi128_si256(q_coefs_rearr2_upper),
+                                              q_coefs_rearr2_lower, 
+                                              1);
+
+    // Reordering done
+
+    __m256i v_level = _mm256_abs_epi16(v_coef);
    __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
    __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));

@ -130,7 +482,6 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe

    v_level = _mm256_packs_epi32(v_level32_a, v_level32_b);

-    __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n]));
    __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
    __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
    __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
@ -142,95 +493,15 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
    v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8);
    v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8);
    
-    _mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a));
-    _mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1));
-    _mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b));
-    _mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1));
+    __m256i deltas_h = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x31);
+    __m256i deltas_l = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x20);
+
+    last_cg = hide_block_sign(v_coef, q_coefs, deltas_h, deltas_l, q_coef, scan, subpos, last_cg);
  }

-  if (ac_sum >= 2) {
-#define SCAN_SET_SIZE 16
-#define LOG2_SCAN_SET_SIZE 4
-    int32_t n, last_cg = -1, abssum = 0, subset, subpos;
-    for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) {
-      int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1;
-      subpos = subset << LOG2_SCAN_SET_SIZE;
-      abssum = 0;
-
-      // Find last coeff pos
-      for (n = SCAN_SET_SIZE - 1; n >= 0; n--)  {
-        if (q_coef[scan[n + subpos]])  {
-          last_nz_pos_in_cg = n;
-          break;
-        }
-      }
-
-      // First coeff pos
-      for (n = 0; n <SCAN_SET_SIZE; n++) {
-        if (q_coef[scan[n + subpos]]) {
-          first_nz_pos_in_cg = n;
-          break;
-        }
-      }
-
-      // Sum all kvz_quant coeffs between first and last
-      for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) {
-        abssum += q_coef[scan[n + subpos]];
-      }
-
-      if (last_nz_pos_in_cg >= 0 && last_cg == -1) {
-        last_cg = 1;
-      }
-
-      if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) {
-        int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1);
-        if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity
-          int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff;
-          int16_t final_change = 0, cur_change = 0;
-          for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) {
-            uint32_t blkPos = scan[n + subpos];
-            if (q_coef[blkPos] != 0) {
-              if (delta_u[blkPos] > 0) {
-                cur_cost = -delta_u[blkPos];
-                cur_change = 1;
-              }
-              else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) {
-                cur_cost = 0x7fffffff;
-              }
-              else {
-                cur_cost = delta_u[blkPos];
-                cur_change = -1;
-              }
-            }
-            else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 0 : 1) != signbit) {
-              cur_cost = 0x7fffffff;
-            }
-            else {
-              cur_cost = -delta_u[blkPos];
-              cur_change = 1;
-            }
-
-            if (cur_cost < min_cost_inc) {
-              min_cost_inc = cur_cost;
-              final_change = cur_change;
-              min_pos = blkPos;
-            }
-          } // CG loop
-
-          if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) {
-            final_change = -1;
-          }
-
-          if (coef[min_pos] >= 0) q_coef[min_pos] += final_change;
-          else q_coef[min_pos] -= final_change;
-        } // Hide
-      }
-      if (last_cg == 1) last_cg = 0;
-    }
-
+#undef VEC_WIDTH
 #undef SCAN_SET_SIZE
 #undef LOG2_SCAN_SET_SIZE
-  }
 }

 static INLINE __m128i get_residual_4x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){