From e8bff993293e45651611dd816c7a358005a59dcf Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen <pauli.oikkonen@tut.fi>
Date: Mon, 29 Jul 2019 21:54:53 +0300
Subject: [PATCH] Redo the SAO_TYPE_BAND subsection of AVX2 SAO color
 reconstruction

Vectorize it all, hope this helps with perf
---
 src/strategies/avx2/sao-avx2.c | 176 +++++++++++++++++++++++----------
 1 file changed, 125 insertions(+), 51 deletions(-)

diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c
index 6a4504c5..1afecd32 100644
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@@ -402,6 +402,17 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
   }
 }
 
+/*
+static void print_256(__m256i v)
+{
+  uint16_t buf[16];
+  _mm256_storeu_si256((__m256i *)buf, v);
+  for (int i = 0; i < 16; i++)
+    printf("%.4x%c", buf[i], (i == 15) ? '\n' : (i == 7) ? '-' : ' ');
+}
+*/
+
+
 /*
  * Calculate an array of intensity correlations for each intensity value.
  * Return array as 16 YMM vectors, each containing 2x16 unsigned bytes
@@ -477,6 +488,119 @@ static void calc_sao_offset_array_avx2(const encoder_control_t *encoder,
   }
 }
 
+static INLINE __m256i lookup_color_band_ymm(__m256i curr_row, const __m256i *offsets)
+{
+  const __m256i select_nibble = _mm256_set1_epi8   (0x0f);
+  const __m256i lo_nibbles    = _mm256_and_si256   (select_nibble, curr_row);
+  const __m256i hi_nibbles    = _mm256_andnot_si256(select_nibble, curr_row);
+
+  // Loop through the offset vectors, the 0xi'th one always holding
+  // offsets 0xi0...0xif. Use shuffle to do a lookup on the current
+  // offset vector, then check which pixels actually should be looked
+  // up from this vector (ie. whether their values are 0xi0...0xif) and
+  // mask out any but correct ones.
+  __m256i result_row = _mm256_setzero_si256();
+  for (uint8_t i = 0; i < 16; i += 4) {
+
+    __m256i curr_hinib0   = _mm256_set1_epi8   ((i + 0) << 4);
+    __m256i curr_hinib1   = _mm256_set1_epi8   ((i + 1) << 4);
+    __m256i curr_hinib2   = _mm256_set1_epi8   ((i + 2) << 4);
+    __m256i curr_hinib3   = _mm256_set1_epi8   ((i + 3) << 4);
+
+    __m256i hinib_select0 = _mm256_cmpeq_epi8  (curr_hinib0,    hi_nibbles);
+    __m256i hinib_select1 = _mm256_cmpeq_epi8  (curr_hinib1,    hi_nibbles);
+    __m256i hinib_select2 = _mm256_cmpeq_epi8  (curr_hinib2,    hi_nibbles);
+    __m256i hinib_select3 = _mm256_cmpeq_epi8  (curr_hinib3,    hi_nibbles);
+
+    __m256i lonib_lookup0 = _mm256_shuffle_epi8(offsets[i + 0], lo_nibbles);
+    __m256i lonib_lookup1 = _mm256_shuffle_epi8(offsets[i + 1], lo_nibbles);
+    __m256i lonib_lookup2 = _mm256_shuffle_epi8(offsets[i + 2], lo_nibbles);
+    __m256i lonib_lookup3 = _mm256_shuffle_epi8(offsets[i + 3], lo_nibbles);
+
+    __m256i lookup_mskd0  = _mm256_and_si256   (hinib_select0,  lonib_lookup0);
+    __m256i lookup_mskd1  = _mm256_and_si256   (hinib_select1,  lonib_lookup1);
+    __m256i lookup_mskd2  = _mm256_and_si256   (hinib_select2,  lonib_lookup2);
+    __m256i lookup_mskd3  = _mm256_and_si256   (hinib_select3,  lonib_lookup3);
+
+    __m256i lookup_mskd01 = _mm256_or_si256    (lookup_mskd0,   lookup_mskd1);
+    __m256i lookup_mskd23 = _mm256_or_si256    (lookup_mskd2,   lookup_mskd3);
+    __m256i lookup_res    = _mm256_or_si256    (lookup_mskd01,  lookup_mskd23);
+
+            result_row    = _mm256_or_si256    (result_row,     lookup_res);
+  }
+  return result_row;
+}
+
+static INLINE void reconstruct_color_band(const encoder_control_t *encoder,
+                                          const kvz_pixel         *rec_data,
+                                                kvz_pixel         *new_rec_data,
+                                          const sao_info_t        *sao,
+                                                int32_t            stride,
+                                                int32_t            new_stride,
+                                                int32_t            block_width,
+                                                int32_t            block_height,
+                                                color_t            color_i)
+{
+  const uint32_t width_db32 = block_width & ~31;
+  const uint32_t width_db4  = block_width &  ~3;
+  const uint32_t width_rest = block_width &   3;
+
+  // Form the load&store mask
+  const __m256i wdb4_256      = _mm256_set1_epi32 (width_db4 & 31);
+  const __m256i indexes       = _mm256_setr_epi32 (3, 7, 11, 15, 19, 23, 27, 31);
+  const __m256i db4_mask      = _mm256_cmpgt_epi32(wdb4_256, indexes);
+
+  // Each of the 256 offsets is a byte, but only 16 are held in one YMM since
+  // lanes must be duplicated to use shuffle.
+  __m256i offsets[16];
+  calc_sao_offset_array_avx2(encoder, sao, offsets, color_i);
+
+  for (uint32_t y = 0; y < block_height; y++) {
+    uint32_t x = 0;
+    for (; x < width_db32; x += 32) {
+      const uint32_t curr_srcpos = y *     stride + x;
+      const uint32_t curr_dstpos = y * new_stride + x;
+
+      __m256i curr_row = _mm256_loadu_si256((const __m256i *)(rec_data + curr_srcpos));
+      __m256i result   = lookup_color_band_ymm(curr_row, offsets);
+      _mm256_storeu_si256((__m256i *)(new_rec_data + curr_dstpos), result);
+    }
+    if (block_width > width_db32) {
+      const uint32_t curr_srcpos = y *     stride + x;
+      const uint32_t curr_dstpos = y * new_stride + x;
+      const uint32_t rest_srcpos = y *     stride + width_db4;
+      const uint32_t rest_dstpos = y * new_stride + width_db4;
+
+      // Read the very last pixels byte by byte and pack them into one dword.
+      // Piggyback said dword as the highest dword of the row vector variable,
+      // that particular place can never be loaded into by the maskmove
+      // (otherwise that vector would go through the divisible-by-32 code
+      // path).
+      uint32_t last_dword = 0;
+      for (uint32_t i = 0; i < width_rest; i++) {
+        uint8_t  currb = rec_data[rest_srcpos + i];
+        uint32_t currd = ((uint32_t)currb) << (i * 8);
+        last_dword |= currd;
+      }
+      const int32_t *src_ptr = (const int32_t *)(    rec_data + curr_srcpos);
+            int32_t *dst_ptr = (      int32_t *)(new_rec_data + curr_dstpos);
+
+      __m256i curr_row = _mm256_maskload_epi32(src_ptr,  db4_mask);
+              curr_row = _mm256_insert_epi32  (curr_row, last_dword, 7);
+      __m256i result   = lookup_color_band_ymm(curr_row, offsets);
+
+      _mm256_maskstore_epi32(dst_ptr, db4_mask, result);
+      uint32_t last_dword_dst = _mm256_extract_epi32(result, 7);
+
+      for (uint32_t i = 0; i < width_rest; i++) {
+        uint8_t currb = last_dword_dst & 0xff;
+        new_rec_data[rest_dstpos + i] = currb;
+        last_dword_dst >>= 8;
+      }
+    }
+  }
+}
+
 static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
  const kvz_pixel *rec_data,
  kvz_pixel *new_rec_data,
@@ -492,57 +616,7 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
   int offset_v = color_i == COLOR_V ? 5 : 0;
 
   if (sao->type == SAO_TYPE_BAND) {
-    int offsets[1 << KVZ_BIT_DEPTH];
-    kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
-    unsigned char*temp;
-
-    for (int y = 0; y < block_height; ++y) {
-      for (int x = 0; x < block_width; x+=32) {
-
-        //new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
-
-
-        bool atleast_32_elements = (block_width - x) > 31;
-        bool atleast_16_elements = (block_width - x) > 15;
-
-        int choose = atleast_32_elements + atleast_16_elements;
-
-        switch (choose) {
-
-        case 2:;
-
-          __m256i rec_data_256_epi8 = _mm256_loadu_si256((__m256i*)&rec_data[y * stride + x]);
-          temp = (unsigned char*)&rec_data_256_epi8;
-
-          __m256i offsets_256_epi8 = _mm256_set_epi8(offsets[temp[31]], offsets[temp[30]], offsets[temp[29]], offsets[temp[28]], offsets[temp[27]], offsets[temp[26]], offsets[temp[25]],
-           offsets[temp[24]], offsets[temp[23]], offsets[temp[22]], offsets[temp[21]], offsets[temp[20]], offsets[temp[19]], offsets[temp[18]], offsets[temp[17]], offsets[temp[16]],
-           offsets[temp[15]], offsets[temp[14]], offsets[temp[13]], offsets[temp[12]], offsets[temp[11]], offsets[temp[10]], offsets[temp[9]],
-           offsets[temp[8]], offsets[temp[7]], offsets[temp[6]], offsets[temp[5]], offsets[temp[4]], offsets[temp[3]], offsets[temp[2]], offsets[temp[1]], offsets[temp[0]]);
-          _mm256_storeu_si256((__m256i*)& new_rec_data[y * new_stride + x], offsets_256_epi8);
-          break;
-
-        case 1:;
-
-          __m128i rec_data_128_epi8 = _mm_loadu_si128((__m128i*)&rec_data[y * stride + x]);
-          temp = (unsigned char*)&rec_data_128_epi8;
-          __m128i offsets_128_epi8 = _mm_set_epi8(offsets[temp[15]], offsets[temp[14]], offsets[temp[13]], offsets[temp[12]], offsets[temp[11]], offsets[temp[10]], offsets[temp[9]],
-           offsets[temp[8]], offsets[temp[7]], offsets[temp[6]], offsets[temp[5]], offsets[temp[4]], offsets[temp[3]], offsets[temp[2]], offsets[temp[1]], offsets[temp[0]]);
-          _mm_storeu_si128((__m128i*)& new_rec_data[y * new_stride + x], offsets_128_epi8);
-
-          for (int i = x; i < block_width; i++) {
-            new_rec_data[y * new_stride + i] = offsets[rec_data[y * stride + i]];
-          }
-          break;
-
-        default:;
-
-          for (int i = x; i < block_width; i++) {
-            new_rec_data[y * new_stride + i] = offsets[rec_data[y * stride + i]];
-          }
-          break;
-        }
-      }
-    }
+    reconstruct_color_band(encoder, rec_data, new_rec_data, sao, stride, new_stride, block_width, block_height, color_i);
   }
   else {