Create strategy for ver_sad

Easy to vectorize
Pauli Oikkonen 2019-01-22 15:57:16 +02:00
parent ca94ae9529
commit f781dc31f0
7 changed files with 304 additions and 33 deletions
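The patch turns the ver_sad helper, previously a static function in image.c, into a Kvazaar "strategy": a typedef and a global function pointer (kvz_ver_sad) are added to the picture strategy headers, the scalar code moves into the generic strategy file, and vectorized SSE4.1 and AVX2 versions are registered alongside it. Below is a minimal, self-contained model of that dispatch pattern; register_ver_sad, ver_sad_ptr and ver_sad_scalar are illustrative stand-ins, not the real kvz_strategyselector API, which also checks CPU features before accepting a registration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;

/* Same shape as the ver_sad_func typedef this commit adds to strategies-picture.h. */
typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                                int32_t block_width, int32_t block_height,
                                uint32_t pic_stride);

/* Stand-ins for the global kvz_ver_sad pointer and the registration call. */
static ver_sad_func *ver_sad_ptr = NULL;
static int best_priority = -1;

static void register_ver_sad(const char *name, int priority, ver_sad_func *fn)
{
  /* Keep the highest-priority implementation registered so far. */
  if (priority > best_priority) {
    best_priority = priority;
    ver_sad_ptr = fn;
    printf("selected ver_sad: %s (priority %d)\n", name, priority);
  }
}

/* Scalar version, equivalent to ver_sad_generic in this commit: every picture
 * line is compared against the same single reference row. */
static uint32_t ver_sad_scalar(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                               int32_t block_width, int32_t block_height,
                               uint32_t pic_stride)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < block_height; y++)
    for (int32_t x = 0; x < block_width; x++)
      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
  return sad;
}

int main(void)
{
  register_ver_sad("generic", 0, &ver_sad_scalar);
  /* A SIMD build would also register "sse41" (20) and "avx2" (40) here,
   * overriding the generic version. */

  const kvz_pixel pic[8] = { 10, 20, 30, 40, 12, 22, 28, 44 };
  const kvz_pixel ref[4] = { 11, 19, 30, 41 };
  /* Two 4-pixel lines against one reference row: (1+1+0+1) + (1+3+2+3) = 12. */
  printf("SAD = %u\n", ver_sad_ptr(pic, ref, 4, 2, 4));
  return 0;
}

In the patch itself the generic implementation is registered with priority 0, the SSE4.1 one with 20 and the AVX2 one with 40, so the highest-priority variant usable on the running CPU ends up behind kvz_ver_sad.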

@@ -260,32 +260,6 @@ static unsigned cor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
return sad;
}
/**
* \brief Vertically interpolate SAD outside the frame.
*
* \param data1 Starting point of the first picture.
* \param data2 Starting point of the second picture.
* \param width Width of the region for which SAD is calculated.
* \param height Height of the region for which SAD is calculated.
* \param width Width of the pixel array.
*
* \returns Sum of Absolute Differences
*/
static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int block_width, int block_height, unsigned pic_stride)
{
int x, y;
unsigned sad = 0;
for (y = 0; y < block_height; ++y) {
for (x = 0; x < block_width; ++x) {
sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
}
}
return sad;
}
/**
* \brief Horizontally interpolate SAD outside the frame.
*
@@ -370,7 +344,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
result += cor_sad(pic_data,
&ref_data[top * ref->stride + left],
left, top, pic->stride);
result += ver_sad(&pic_data[left],
result += kvz_ver_sad(&pic_data[left],
&ref_data[top * ref->stride + left],
block_width - left, top, pic->stride);
result += hor_sad(&pic_data[top * pic->stride],
@@ -380,7 +354,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
&ref_data[top * ref->stride + left],
block_width - left, block_height - top, pic->stride, ref->stride);
} else if (top && right) {
result += ver_sad(pic_data,
result += kvz_ver_sad(pic_data,
&ref_data[top * ref->stride],
block_width - right, top, pic->stride);
result += cor_sad(&pic_data[block_width - right],
@@ -402,7 +376,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
&ref_data[(block_height - bottom - 1) * ref->stride + left],
left, bottom, pic->stride);
result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
&ref_data[(block_height - bottom - 1) * ref->stride + left],
block_width - left, bottom, pic->stride);
} else if (bottom && right) {
@@ -412,14 +386,14 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
result += hor_sad(&pic_data[block_width - right],
&ref_data[block_width - right - 1],
right, block_height - bottom, pic->stride, ref->stride);
result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
&ref_data[(block_height - bottom - 1) * ref->stride],
block_width - right, bottom, pic->stride);
result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
&ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
right, bottom, pic->stride);
} else if (top) {
result += ver_sad(pic_data,
result += kvz_ver_sad(pic_data,
&ref_data[top * ref->stride],
block_width, top, pic->stride);
result += reg_sad_maybe_optimized(&pic_data[top * pic->stride],
@@ -431,7 +405,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
ref_data,
block_width, block_height - bottom, pic->stride, ref->stride,
optimized_sad);
result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
&ref_data[(block_height - bottom - 1) * ref->stride],
block_width, bottom, pic->stride);
} else if (left) {
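In image_interpolated_sad above, a motion vector that points partly outside the reference frame splits the SAD into corner (cor_sad), vertical (kvz_ver_sad), horizontal (hor_sad) and fully in-frame (reg_sad) regions. For the vertical regions, the reference pixels above or below the frame are the frame's edge row repeated on every line, which is why ver_sad takes only a single reference row and no reference stride. The following standalone check (not Kvazaar code; plain scalar loops assuming 8-bit pixels) illustrates that equivalence.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;

/* Scalar SAD between two blocks with independent strides (reg_sad-like). */
static uint32_t sad_2d(const kvz_pixel *a, const kvz_pixel *b,
                       int w, int h, int a_stride, int b_stride)
{
  uint32_t sad = 0;
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++)
      sad += abs(a[y * a_stride + x] - b[y * b_stride + x]);
  return sad;
}

/* Scalar SAD against a single reference row (ver_sad-like). */
static uint32_t sad_vs_row(const kvz_pixel *a, const kvz_pixel *row,
                           int w, int h, int a_stride)
{
  uint32_t sad = 0;
  for (int y = 0; y < h; y++)
    for (int x = 0; x < w; x++)
      sad += abs(a[y * a_stride + x] - row[x]);
  return sad;
}

int main(void)
{
  enum { W = 8, TOP = 3 };
  kvz_pixel pic[TOP * W], edge_row[W], replicated[TOP * W];
  for (int i = 0; i < TOP * W; i++) pic[i] = (kvz_pixel)(rand() & 0xff);
  for (int x = 0; x < W; x++)       edge_row[x] = (kvz_pixel)(rand() & 0xff);

  /* Replicating the edge row over the out-of-frame area... */
  for (int y = 0; y < TOP; y++)
    for (int x = 0; x < W; x++)
      replicated[y * W + x] = edge_row[x];

  /* ...gives exactly the SAD that ver_sad computes from the one row alone. */
  assert(sad_2d(pic, replicated, W, TOP, W, W) ==
         sad_vs_row(pic, edge_row, W, TOP, W));
  return 0;
}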

@@ -1277,6 +1277,24 @@ static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
else
return NULL;
}
static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t width, int32_t height, uint32_t stride)
{
if (width == 0)
return 0;
if (width == 4)
return ver_sad_w4(pic_data, ref_data, height, stride);
if (width == 8)
return ver_sad_w8(pic_data, ref_data, height, stride);
if (width == 12)
return ver_sad_w12(pic_data, ref_data, height, stride);
if (width == 16)
return ver_sad_w16(pic_data, ref_data, height, stride);
else
return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
}
#endif //COMPILE_INTEL_AVX2
int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
@@ -1312,6 +1330,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2);
success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
}
#endif

@@ -593,6 +593,32 @@ static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width)
return NULL;
}
/**
* \brief Vertically interpolate SAD outside the frame.
*
* \param pic_data Starting point of the picture block.
* \param ref_data Pointer to the single reference row compared against every line.
* \param block_width Width of the region for which SAD is calculated.
* \param block_height Height of the region for which SAD is calculated.
* \param pic_stride Stride of the picture pixel array.
*
* \returns Sum of Absolute Differences
*/
static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t block_width, int32_t block_height, uint32_t pic_stride)
{
int x, y;
unsigned sad = 0;
for (y = 0; y < block_height; ++y) {
for (x = 0; x < block_width; ++x) {
sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
}
}
return sad;
}
int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
{
bool success = true;
@@ -629,6 +655,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic);
success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
return success;
}

@@ -66,6 +66,23 @@ static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width)
return NULL;
}
static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t width, int32_t height, uint32_t stride)
{
if (width == 0)
return 0;
if (width == 4)
return ver_sad_w4(pic_data, ref_data, height, stride);
if (width == 8)
return ver_sad_w8(pic_data, ref_data, height, stride);
if (width == 12)
return ver_sad_w12(pic_data, ref_data, height, stride);
if (width == 16)
return ver_sad_w16(pic_data, ref_data, height, stride);
else
return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
}
#endif //COMPILE_INTEL_SSE41
@@ -75,6 +92,7 @@ int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) {
if (bitdepth == 8){
success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41);
success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41);
success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41);
}
#endif
return success;

@@ -314,4 +314,232 @@ static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kv
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
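// Broadcast the 4-byte reference row into every dword lane so that four picture lines can be compared against it with a single _mm_sad_epu8.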
__m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
int32_t y;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride));
a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);
__m128i curr_sads = _mm_sad_epu8(a, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
if (height_residual_lines) {
// Keep the reference dword only in the low lane (upper lanes zeroed), since the leftover lines are compared one dword at a time.
ref_row = _mm_bsrli_si128(ref_row, 12);
for (; y < height; y++) {
__m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));
__m128i curr_sads = _mm_sad_epu8(a, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
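// The 8-byte reference row fills both qword lanes, so each _mm_sad_epu8 compares two picture lines at once.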
const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
uint64_t result = 0;
int32_t y;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
for (y = 0; y < height_fourline_groups; y += 4) {
__m128d a_d = _mm_setzero_pd();
__m128d c_d = _mm_setzero_pd();
a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));
c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));
__m128i a = _mm_castpd_si128(a_d);
__m128i c = _mm_castpd_si128(c_d);
__m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
__m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
}
if (height_residual_lines) {
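// Handle the leftover rows one at a time with the 64-bit MMX SAD, reusing the same 8-byte reference row.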
__m64 b = (__m64)_mm_cvtsi128_si64(ref_row);
for (; y < height; y++) {
__m64 a = *(__m64 *)(pic_data + y * stride);
__m64 sads = _mm_sad_pu8(a, b);
result += (uint64_t)sads;
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
result += _mm_cvtsi128_si32(sad);
return result;
}
static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
int32_t y;
for (y = 0; y < height; y++) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));
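// Take the 12 block pixels (words 0..5) from the picture row and the top 4 bytes from ref_row, so the out-of-block bytes cancel to zero in the SAD.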
__m128i a_masked = _mm_blend_epi16(ref_row, a, 0x3f);
__m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t height, uint32_t stride)
{
const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
__m128i sse_inc = _mm_setzero_si128();
int32_t y;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
__m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
__m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
__m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));
__m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row);
__m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row);
__m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row);
__m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
}
if (height_residual_lines) {
for (; y < height; y++) {
__m128i pic_row = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
__m128i curr_sads = _mm_sad_epu8 (pic_row, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t width, int32_t height, uint32_t stride)
{
int32_t y, x;
__m128i sse_inc = _mm_setzero_si128();
// Pixels per scanline that fit into whole 16-byte XMM chunks, and the leftover pixel count
const int32_t width_xmms = width & ~15;
const int32_t width_residual_pixels = width & 15;
const int32_t height_fourline_groups = height & ~3;
const int32_t height_residual_lines = height & 3;
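// rdmask is 0xFF in the lanes holding leftover pixels and 0x00 elsewhere; lanes outside the block are later filled from ref_row so they add nothing to the SAD.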
const __m128i rds = _mm_set1_epi8 (width_residual_pixels);
const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15);
const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
for (x = 0; x < width_xmms; x += 16) {
const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
__m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
__m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
__m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
__m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
__m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
__m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
__m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
}
if (height_residual_lines) {
for (; y < height; y++) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
__m128i curr_sads = _mm_sad_epu8(a, ref_row);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
}
if (width_residual_pixels) {
const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
for (y = 0; y < height_fourline_groups; y += 4) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
__m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
__m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
__m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
__m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
__m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask);
__m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask);
__m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask);
__m128i curr_sads_ab = _mm_sad_epu8 (ref_row, a_masked);
__m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked);
__m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked);
__m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
}
if (height_residual_lines) {
for (; y < height; y++) {
__m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
__m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
__m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
sse_inc = _mm_add_epi64(sse_inc, curr_sads);
}
}
}
__m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
__m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
return _mm_cvtsi128_si32(sad);
}
#endif
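Every kernel above finishes with the same three-step reduction: _mm_sad_epu8 leaves one 16-bit partial sum in each 64-bit half of the accumulator, so swapping the halves with _MM_SHUFFLE(1, 0, 3, 2), adding, and extracting the low 32 bits yields the total. A standalone sanity check of that pattern (not part of the patch; plain SSE2 with 8-bit pixels assumed):

#include <assert.h>
#include <emmintrin.h>   /* SSE2: _mm_sad_epu8, _mm_shuffle_epi32, ... */
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
  uint8_t a[16], b[16];
  uint32_t scalar_sad = 0;
  for (int i = 0; i < 16; i++) {
    a[i] = (uint8_t)(rand() & 0xff);
    b[i] = (uint8_t)(rand() & 0xff);
    scalar_sad += abs(a[i] - b[i]);
  }

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);

  /* Two partial sums: bytes 0..7 land in the low qword, bytes 8..15 in the high qword. */
  __m128i sse_inc = _mm_sad_epu8(va, vb);

  /* Swap the two qwords and add, so the low qword holds the full sum. */
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64(sse_inc, sse_inc_2);

  assert((uint32_t)_mm_cvtsi128_si32(sad) == scalar_sad);
  return 0;
}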

@@ -64,6 +64,7 @@ pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0;
inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0;
get_optimized_sad_func *kvz_get_optimized_sad = 0;
ver_sad_func *kvz_ver_sad = 0;
int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {

@@ -113,7 +113,9 @@ typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_p
typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
int32_t block_width, int32_t block_height,
uint32_t pic_stride);
typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
const int hi_prec_luma_rec1,
@@ -167,6 +169,7 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd;
extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend;
extern get_optimized_sad_func *kvz_get_optimized_sad;
extern ver_sad_func *kvz_ver_sad;
int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
@@ -201,6 +204,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
{"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \
{"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \
{"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
{"ver_sad", (void**) &kvz_ver_sad}, \