Redo sao_band_ddistortion_avx2

Avoid branching and do the entire thing on 32 pixels at once in YMMs. Also make the sao_bands function parameter const.
2024-11-24 02:24:07 +00:00 · 2019-07-24 15:30:00 +03:00 · 2019-07-24 15:30:00 +03:00 · c18adc5ee0
parent 2827c3e3ab
commit c18adc5ee0
4 changed files with 195 additions and 93 deletions
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@ -24,6 +24,7 @@
 #include <immintrin.h>
 #include <nmmintrin.h>
 #include "strategies/generic/sao_band_ddistortion.h"
 #include "cu.h"
 #include "encoder.h"
 #include "encoderstate.h"
@ -436,8 +437,7 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
        bool use_8_elements = (block_width - x) >= 8;
-        switch (use_8_elements) {
+        if (use_8_elements) {
        case true:;
          const kvz_pixel *c_data = &rec_data[y * stride + x];
          __m128i vector_a_epi8 = _mm_loadl_epi64((__m128i*)&c_data[a_ofs.y * stride + a_ofs.x]);
@ -465,9 +465,8 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
          // Store 64-bits from vector to memory
          _mm_storel_epi64((__m128i*)&(new_rec_data[y * new_stride + x]), _mm256_castsi256_si128(temp_epi8));
          break;
-        default:;
+        } else {
          for (int i = x; i < (block_width); ++i) {
            const kvz_pixel *c_data = &rec_data[y * stride + i];
@ -481,94 +480,177 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
            int eo_cat = sao_calc_eo_cat(a, b, c);
            new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]);
          }
          break;
        }
      }
    }
  }
 }
-static int sao_band_ddistortion_avx2(const encoder_state_t * const state,
+static INLINE __m256i srli_epi8(__m256i v, const uint32_t shift)
 const kvz_pixel *orig_data,
 const kvz_pixel *rec_data,
 int block_width,
 int block_height,
 int band_pos,
 int sao_bands[4])
 {
-  int y, x;
+  const uint8_t hibit_mask     = 0xff >> shift;
-  int shift = state->encoder_control->bitdepth - 5;
+  const __m256i hibit_mask_256 = _mm256_set1_epi8(hibit_mask);
  int sum = 0;
-  __m256i sum_epi32 = { 0 };
+  __m256i v_shifted = _mm256_srli_epi32(v,         shift);
  __m256i v_masked  = _mm256_and_si256 (v_shifted, hibit_mask_256);
-  __m256i band_pos_epi32 = _mm256_set1_epi32(band_pos);
+  return v_masked;
  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; x += 8) {
      bool use_8_elements =  (block_width - x) >= 8;
      switch (use_8_elements) {
      case true:;
        //int band = (rec_data[y * block_width + x] >> shift) - band_pos;
        __m256i band_epi32 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x])));
        band_epi32 = _mm256_srli_epi32(band_epi32, shift);
        band_epi32 = _mm256_sub_epi32(band_epi32, band_pos_epi32);
        __m256i vector_mask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_set1_epi32(~3), band_epi32), _mm256_setzero_si256());
        __m256i offset_epi32 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)sao_bands)), band_epi32);
        offset_epi32 = _mm256_and_si256(vector_mask, offset_epi32);
        __m256i orig_data_epi32 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x])));
        __m256i rec_data_epi32 = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x])));
        __m256i diff_epi32 = _mm256_sub_epi32(orig_data_epi32, rec_data_epi32);
        __m256i diff_minus_offset_epi32 = _mm256_sub_epi32(diff_epi32, offset_epi32);
        __m256i temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(diff_minus_offset_epi32, diff_minus_offset_epi32), _mm256_mullo_epi32(diff_epi32, diff_epi32));
        sum_epi32 = _mm256_add_epi32(sum_epi32, temp_sum);
        break;
      default:;
        for (x; x < block_width; ++x) {
          int band = (rec_data[y * block_width + x] >> shift) - band_pos;
          int offset = 0;
          if (band >= 0 && band < 4) {
            offset = sao_bands[band];
          }
          if (offset != 0) {
            int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x];
            // Offset is applied to reconstruction, so it is subtracted from diff.
            sum += (diff - offset) * (diff - offset) - diff * diff;
          }
        }
 }
-
+static INLINE void cvt_epu8_epi16(const __m256i v, __m256i *res_lo, __m256i *res_hi)
-    }
+{
  const __m256i zero  = _mm256_setzero_si256();
             *res_lo  = _mm256_unpacklo_epi8(v, zero);
             *res_hi  = _mm256_unpackhi_epi8(v, zero);
 }
-  //Full horizontal sum
+static INLINE void cvt_epi8_epi16(const __m256i v, __m256i *res_lo, __m256i *res_hi)
-  sum_epi32 = _mm256_add_epi32(sum_epi32, _mm256_castsi128_si256(_mm256_extracti128_si256(sum_epi32, 1)));
+{
-  sum_epi32 = _mm256_add_epi32(sum_epi32, _mm256_shuffle_epi32(sum_epi32, _MM_SHUFFLE(1, 0, 3, 2)));
+  const __m256i zero  = _mm256_setzero_si256();
-  sum_epi32 = _mm256_add_epi32(sum_epi32, _mm256_shuffle_epi32(sum_epi32, _MM_SHUFFLE(0, 1, 0, 1)));
+        __m256i signs = _mm256_cmpgt_epi8   (zero, v);
-  sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_epi32));
+             *res_lo  = _mm256_unpacklo_epi8(v,    signs);
             *res_hi  = _mm256_unpackhi_epi8(v,    signs);
 }
-  return sum;
+static int32_t sao_band_ddistortion_avx2(const encoder_state_t *state,
                                         const uint8_t         *orig_data,
                                         const uint8_t         *rec_data,
                                               int32_t          block_width,
                                               int32_t          block_height,
                                               int32_t          band_pos,
                                         const int32_t          sao_bands[4])
 {
  const uint32_t bitdepth = 8;
  const uint32_t shift    = bitdepth - 5;
  // Clamp band_pos to 32 from above. It'll be subtracted from the shifted
  // rec_data values, which in 8-bit depth will always be clamped to [0, 31],
  // so if it ever exceeds 32, all the band values will be negative and
  // ignored. Ditto for less than -4.
  __m128i bp_128   = _mm_cvtsi32_si128    (band_pos);
  __m128i hilimit  = _mm_cvtsi32_si128    (32);
  __m128i lolimit  = _mm_cvtsi32_si128    (-4);
          bp_128   = _mm_min_epi8         (bp_128, hilimit);
          bp_128   = _mm_max_epi8         (bp_128, lolimit);
  __m256i bp_256  = _mm256_broadcastb_epi8(bp_128);
  // LSBs of each SAO band dword, the band values must fit in 8 bits anyway
  // (this will be checked later)
  const __m128i sb_shufmask = _mm_set1_epi32(0x0c080400);
  __m128i sbs_32   = _mm_loadu_si128((const __m128i *)sao_bands);
  __m128i sbs_8    = _mm_shuffle_epi8            (sbs_32, sb_shufmask);
  __m256i sb_256   = _mm256_broadcastsi128_si256 (sbs_8);
  // Compare most significant 25 bits of SAO bands to the sign bit to assert
  // that the band is between -128 and 127 (only comparing 24 would fail to
  // detect values of 128...255)
  __m128i sb_ms25b = _mm_srai_epi32              (sbs_32,  7);
  __m128i sb_signs = _mm_srai_epi32              (sbs_32, 31);
  __m128i sbs_ok_v = _mm_cmpeq_epi32             (sb_ms25b, sb_signs);
  uint16_t sbs_ok  = _mm_movemask_epi8           (sbs_ok_v);
  // These should trigger like, never, at least the later condition of block
  // not being a multiple of 32 wide. Rather safe than sorry though, huge SAO
  // bands are more tricky of these two because the algorithm needs a complete
  // reimplementation to work on 16-bit values.
  if (sbs_ok != 0xffff)
    goto use_generic;
  // If VVC or something will start using SAO on blocks with width a multiple
  // of 16, feel free to implement a XMM variant of this algorithm
  if ((block_width & 31) != 0)
    goto use_generic;
  const __m256i zero          = _mm256_setzero_si256();
  const __m256i threes        = _mm256_set1_epi8 (3);
  const __m256i negate_hiword = _mm256_set1_epi32(0xffff0001);
  __m256i sum = _mm256_setzero_si256();
  for (uint32_t y = 0; y < block_height; y++) {
    for (uint32_t x = 0; x < block_width; x += 32) {
      const int32_t curr_pos = y * block_width + x;
      __m256i   rd = _mm256_loadu_si256((const __m256i *)( rec_data + curr_pos));
      __m256i orig = _mm256_loadu_si256((const __m256i *)(orig_data + curr_pos));
      __m256i orig_lo, orig_hi, rd_lo, rd_hi;
      cvt_epu8_epi16(orig, &orig_lo, &orig_hi);
      cvt_epu8_epi16(rd,   &rd_lo,   &rd_hi);
      // The shift will clamp band to 0...31; band_pos on the other
      // hand is always between 0...32, so band will be -1...31. Anything
      // below zero is ignored, so we can clamp band_pos to 32.
      __m256i rd_divd      = srli_epi8           (rd,            shift);
      __m256i band         = _mm256_sub_epi8     (rd_divd,       bp_256);
      // Force all <0 or >3 bands to 0xff, which will zero the shuffle result
      __m256i band_lt_0    = _mm256_cmpgt_epi8   (zero,          band);
      __m256i band_gt_3    = _mm256_cmpgt_epi8   (band,          threes);
      __m256i band_inv     = _mm256_or_si256     (band_lt_0,     band_gt_3);
              band         = _mm256_or_si256     (band,          band_inv);
      __m256i offsets      = _mm256_shuffle_epi8 (sb_256,        band);
      __m256i offsets_lo, offsets_hi;
      cvt_epi8_epi16(offsets, &offsets_lo, &offsets_hi);
      __m256i offsets_0_lo = _mm256_cmpeq_epi16   (offsets_lo,   zero);
      __m256i offsets_0_hi = _mm256_cmpeq_epi16   (offsets_hi,   zero);
      __m256i diff_lo      = _mm256_sub_epi16     (orig_lo,      rd_lo);
      __m256i diff_hi      = _mm256_sub_epi16     (orig_hi,      rd_hi);
      __m256i delta_lo     = _mm256_sub_epi16     (diff_lo,      offsets_lo);
      __m256i delta_hi     = _mm256_sub_epi16     (diff_hi,      offsets_hi);
              diff_lo      = _mm256_andnot_si256  (offsets_0_lo, diff_lo);
              diff_hi      = _mm256_andnot_si256  (offsets_0_hi, diff_hi);
              delta_lo     = _mm256_andnot_si256  (offsets_0_lo, delta_lo);
              delta_hi     = _mm256_andnot_si256  (offsets_0_hi, delta_hi);
      __m256i dd0_lo       = _mm256_unpacklo_epi16(delta_lo,     diff_lo);
      __m256i dd0_hi       = _mm256_unpackhi_epi16(delta_lo,     diff_lo);
      __m256i dd1_lo       = _mm256_unpacklo_epi16(delta_hi,     diff_hi);
      __m256i dd1_hi       = _mm256_unpackhi_epi16(delta_hi,     diff_hi);
      __m256i dd0_lo_n     = _mm256_sign_epi16    (dd0_lo,       negate_hiword);
      __m256i dd0_hi_n     = _mm256_sign_epi16    (dd0_hi,       negate_hiword);
      __m256i dd1_lo_n     = _mm256_sign_epi16    (dd1_lo,       negate_hiword);
      __m256i dd1_hi_n     = _mm256_sign_epi16    (dd1_hi,       negate_hiword);
      __m256i sum0_lo      = _mm256_madd_epi16    (dd0_lo,       dd0_lo_n);
      __m256i sum0_hi      = _mm256_madd_epi16    (dd0_hi,       dd0_hi_n);
      __m256i sum1_lo      = _mm256_madd_epi16    (dd1_lo,       dd1_lo_n);
      __m256i sum1_hi      = _mm256_madd_epi16    (dd1_hi,       dd1_hi_n);
      __m256i sum0         = _mm256_add_epi32     (sum0_lo,      sum0_hi);
      __m256i sum1         = _mm256_add_epi32     (sum1_lo,      sum1_hi);
      __m256i curr_sum     = _mm256_add_epi32     (sum0,         sum1);
              sum          = _mm256_add_epi32     (sum,          curr_sum);
    }
  }
  // Horizontal sum of 8x32 YMM, nothing special here
  __m256i sum2 = _mm256_permute4x64_epi64(sum,  _MM_SHUFFLE(1, 0, 3, 2));
  __m256i sum3 = _mm256_add_epi32        (sum,  sum2);
  __m256i sum4 = _mm256_shuffle_epi32    (sum3, _MM_SHUFFLE(1, 0, 3, 2));
  __m256i sum5 = _mm256_add_epi32        (sum3, sum4);
  __m256i sum6 = _mm256_shuffle_epi32    (sum5, _MM_SHUFFLE(2, 3, 0, 1));
  __m256i sum7 = _mm256_add_epi32        (sum5, sum6);
  __m128i sum8 = _mm256_castsi256_si128  (sum7);
  int32_t sum9 = _mm_cvtsi128_si32       (sum8);
  return  sum9;
 use_generic:
  return sao_band_ddistortion_generic(state, orig_data, rec_data, block_width,
      block_height, band_pos, sao_bands);
 }
 #endif //COMPILE_INTEL_AVX2
--- a/src/strategies/generic/sao-generic.c
+++ b/src/strategies/generic/sao-generic.c
@ -19,6 +19,7 @@
 ****************************************************************************/
 #include "strategies/generic/sao-generic.h"
 #include "strategies/generic/sao_band_ddistortion.h"
 #include "cu.h"
 #include "encoder.h"
@ -156,35 +157,6 @@ static void sao_reconstruct_color_generic(const encoder_control_t * const encode
 }
 static int sao_band_ddistortion_generic(const encoder_state_t * const state,
                                        const kvz_pixel *orig_data,
                                        const kvz_pixel *rec_data,
                                        int block_width,
                                        int block_height,
                                        int band_pos,
                                        int sao_bands[4])
 {
  int y, x;
  int shift = state->encoder_control->bitdepth-5;
  int sum = 0;
  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
      int band = (rec_data[y * block_width + x] >> shift) - band_pos;
      int offset = 0;
      if (band >= 0 && band < 4) {
        offset = sao_bands[band];
      }
      if (offset != 0) {
        int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x];
        // Offset is applied to reconstruction, so it is subtracted from diff.
        sum += (diff - offset) * (diff - offset) - diff * diff;
      }
    }
  }
  return sum;
 }
 int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth)
 {
--- a/src/strategies/generic/sao_band_ddistortion.h
+++ b/src/strategies/generic/sao_band_ddistortion.h
@ -0,0 +1,48 @@
 #ifndef SAO_BAND_DDISTORTION_H_
 #define SAO_BAND_DDISTORTION_H_
 // #include "encoder.h"
 #include "encoderstate.h"
 #include "kvazaar.h"
 #include "sao.h"
 static int sao_band_ddistortion_generic(const encoder_state_t * const state,
                                        const kvz_pixel *orig_data,
                                        const kvz_pixel *rec_data,
                                        int block_width,
                                        int block_height,
                                        int band_pos,
                                        const int sao_bands[4])
 {
  int y, x;
  int shift = state->encoder_control->bitdepth-5;
  int sum = 0;
  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
      const int32_t curr_pos = y * block_width + x;
      kvz_pixel rec  =  rec_data[curr_pos];
      kvz_pixel orig = orig_data[curr_pos];
      int32_t band = (rec >> shift) - band_pos;
      int32_t offset = 0;
      if (band >= 0 && band <= 3) {
        offset = sao_bands[band];
      }
      // Offset is applied to reconstruction, so it is subtracted from diff.
      int32_t diff  = orig - rec;
      int32_t delta = diff - offset;
      int32_t dmask = (offset == 0) ? -1 : 0;
      diff  &= ~dmask;
      delta &= ~dmask;
      sum += delta * delta - diff * diff;
    }
  }
  return sum;
 }
 #endif
--- a/src/strategies/strategies-sao.h
+++ b/src/strategies/strategies-sao.h
@ -51,7 +51,7 @@ typedef void (sao_reconstruct_color_func)(const encoder_control_t * const encode
 typedef int (sao_band_ddistortion_func)(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data,
  int block_width, int block_height,
-  int band_pos, int sao_bands[4]);
+  int band_pos, const int sao_bands[4]);
 // Declare function pointers.
 extern sao_edge_ddistortion_func * kvz_sao_edge_ddistortion;