Modify AVX2 SAD to mask data at byte granularity in AVX registers

Avoids using any SAD calculations narrower than 256 bits and simplifies the code. Also improves execution speed.
2024-11-24 02:24:07 +00:00 · 2019-01-07 16:48:32 +02:00 · 2019-01-07 16:48:32 +02:00 · 887d7700a8
parent 7585f79a71
commit 887d7700a8
1 changed files with 12 additions and 23 deletions
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@ -53,18 +53,18 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
                          const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
  int32_t y, x;
  uint32_t sad = 0;
  __m256i avx_inc = _mm256_setzero_si256();
-  // 256-bit blocks, bytes after them, 32-bit blocks after the large blocks
+  // Bytes in block in 256-bit blocks per each scanline, and remainder
-  const int largeblock_bytes = width         & ~31;
+  const int largeblock_bytes = width & ~31;
-  const int any_residuals    = width         &  31;
+  const int residual_bytes   = width &  31;
  const int residual_128bs   = any_residuals >> 4;
  const int residual_dwords  = any_residuals >> 2;
-  const __m256i ns     = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
+  const __m256i rds    = _mm256_set1_epi8(residual_bytes);
-  const __m256i rds    = _mm256_set1_epi32 (residual_dwords);
+  const __m256i ns     = _mm256_setr_epi8(0,  1,  2,  3,  4,  5,  6,  7,
-  const __m256i rdmask = _mm256_cmpgt_epi32(rds, ns);
+                                          8,  9,  10, 11, 12, 13, 14, 15,
                                          16, 17, 18, 19, 20, 21, 22, 23,
                                          24, 25, 26, 27, 28, 29, 30, 31);
  const __m256i rdmask = _mm256_cmpgt_epi8(rds, ns);
  __m256i avx_inc      = _mm256_setzero_si256();
  for (y = 0; y < height; ++y) {
@ -74,22 +74,13 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
      __m256i curr_sads = _mm256_sad_epu8(a, b);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
    }
-
+    if (residual_bytes) {
    /*
     * If there are no residual values, it does not matter what bogus values
     * we use here since it will be masked away anyway
     */
    if (any_residuals) {
      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
      __m256i b_masked  = _mm256_blendv_epi8(a, b, rdmask);
      __m256i curr_sads = _mm256_sad_epu8   (a, b_masked);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
      x = width & ~(uint32_t)3;
      for (; x < width; x++)
        sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  __m256i avx_inc_2 = _mm256_permute4x64_epi64(avx_inc,   _MM_SHUFFLE(1, 0, 3, 2));
@ -100,9 +91,7 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
  // 32 bits should always be enough for even the largest blocks with a SAD of
  // 255 in each pixel, even though the SAD results themselves are 64 bits
  __m128i avx_inc_128 = _mm256_castsi256_si128(avx_inc_5);
-  sad += _mm_cvtsi128_si32(avx_inc_128);
+  return _mm_cvtsi128_si32(avx_inc_128);
  return sad;
 }
 /**