mirror of https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00

Add intrinsic version of SATD for 8x8 and larger blocks

This commit is contained in:
parent d68fc4c41e
commit 55da2a9958
@@ -136,6 +136,106 @@ static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * bu
  return m256i_horizontal_sum(sum0);
}
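
// Horizontal part of the 8x8 Hadamard transform: three rounds of hadd/hsub
// butterflies transform two rows at once. The results come out with the
// coefficients of the two rows interleaved pairwise, which the vertical
// pass below takes advantage of.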
static void hor_add_sub_avx2(__m128i *row0, __m128i *row1){

  __m128i a = _mm_hadd_epi16(*row0, *row1);
  __m128i b = _mm_hsub_epi16(*row0, *row1);

  __m128i c = _mm_hadd_epi16(a, b);
  __m128i d = _mm_hsub_epi16(a, b);

  *row0 = _mm_hadd_epi16(c, d);
  *row1 = _mm_hsub_epi16(c, d);
}
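
// Vertical part of the transform. The horizontal pass leaves the
// coefficients of each row pair interleaved inside the vectors, so the
// hadd/hsub pairs of the first stage both de-interleave them and perform
// the first vertical butterfly; the remaining two stages are plain
// element-wise adds and subtracts.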
static INLINE void ver_add_sub_avx2(__m128i temp_hor[8], __m128i temp_ver[8]){

  // First stage
  for (int i = 0; i < 8; i += 2){
    temp_ver[i+0] = _mm_hadd_epi16(temp_hor[i + 0], temp_hor[i + 1]);
    temp_ver[i+1] = _mm_hsub_epi16(temp_hor[i + 0], temp_hor[i + 1]);
  }

  // Second stage
  for (int i = 0; i < 8; i += 4){
    temp_hor[i + 0] = _mm_add_epi16(temp_ver[i + 0], temp_ver[i + 2]);
    temp_hor[i + 1] = _mm_add_epi16(temp_ver[i + 1], temp_ver[i + 3]);
    temp_hor[i + 2] = _mm_sub_epi16(temp_ver[i + 0], temp_ver[i + 2]);
    temp_hor[i + 3] = _mm_sub_epi16(temp_ver[i + 1], temp_ver[i + 3]);
  }

  // Third stage
  for (int i = 0; i < 4; ++i){
    temp_ver[i + 0] = _mm_add_epi16(temp_hor[0 + i], temp_hor[4 + i]);
    temp_ver[i + 4] = _mm_sub_epi16(temp_hor[0 + i], temp_hor[4 + i]);
  }
}
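
// SATD of a single 8x8 block: widen the 8-bit rows to 16 bits, take the
// difference of the two blocks, apply the horizontal and vertical Hadamard
// passes and sum the absolute values of the resulting coefficients.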
static unsigned kvz_satd_8bit_8x8_general_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2)
{
  __m128i temp_hor[8];
  __m128i temp_ver[8];

  for (int row = 0; row < 8; row += 2){
    for (int i = 0; i < 2; ++i){
      __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(&buf1[(row + i) * stride1])));
      __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(&buf2[(row + i) * stride2])));
      temp_hor[row + i] = _mm_sub_epi16(buf1_row, buf2_row);
    }
    hor_add_sub_avx2(&temp_hor[row], &temp_hor[row + 1]);
  }

  ver_add_sub_avx2(temp_hor, temp_ver);
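
  // Sum the absolute values of the transform coefficients. _mm_madd_epi16
  // against a vector of ones widens the 16-bit magnitudes to 32-bit lanes
  // before accumulating, so the running sum cannot overflow.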
  __m128i sad = _mm_setzero_si128();
  for (int row = 0; row < 8; ++row){
    __m128i abs_value = _mm_abs_epi16(temp_ver[row]);
    sad = _mm_add_epi32(sad, _mm_madd_epi16(abs_value, _mm_set1_epi16(1)));
  }

  sad = _mm_hadd_epi32(sad, sad);
  sad = _mm_hadd_epi32(sad, sad);

  // Divide by four with rounding, the usual scaling of an 8x8 Hadamard SATD.
  unsigned result = (_mm_cvtsi128_si32(sad) + 2) >> 2;
  return result;
}

// Macro for defining SATD functions for fixed-size blocks. The generated
// functions compute the SATD of a block whose width and height are integer
// multiples of 8 by summing the 8x8 Hadamard function over the block.
#define SATD_NXN_AVX2(n) \
static unsigned satd_8bit_ ## n ## x ## n ## _avx2( \
  const kvz_pixel * const block1, const kvz_pixel * const block2) \
{ \
  unsigned x, y; \
  unsigned sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], (n), &block2[row + x], (n)); \
    } \
  } \
  return sum >> (KVZ_BIT_DEPTH - 8); \
}
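
// The 8x8 case, written out by hand; it is what SATD_NXN_AVX2(8) below
// would expand to.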
static unsigned satd_8bit_8x8_avx2(
  const kvz_pixel * const block1, const kvz_pixel * const block2)
{
  unsigned x, y;
  unsigned sum = 0;
  for (y = 0; y < (8); y += 8) {
    unsigned row = y * (8);
    for (x = 0; x < (8); x += 8) {
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], (8), &block2[row + x], (8));
    }
  }
  return sum >> (KVZ_BIT_DEPTH - 8);
}

//SATD_NXN_AVX2(8)
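// Instantiate the macro for the larger block sizes; each line defines a
// satd_8bit_NxN_avx2 function.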
SATD_NXN_AVX2(16)
SATD_NXN_AVX2(32)
SATD_NXN_AVX2(64)

#endif //COMPILE_INTEL_AVX2
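
The second hunk registers the new SATD kernels with the strategy selector, alongside the existing SAD registrations, so the AVX2 versions can be picked up at run time: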

@@ -153,6 +253,11 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
    success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2);
    success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2);
    success &= kvz_strategyselector_register(opaque, "sad_64x64", "avx2", 40, &sad_8bit_64x64_avx2);

    success &= kvz_strategyselector_register(opaque, "satd_8x8", "avx2", 40, &satd_8bit_8x8_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_16x16", "avx2", 40, &satd_8bit_16x16_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_32x32", "avx2", 40, &satd_8bit_32x32_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_64x64", "avx2", 40, &satd_8bit_64x64_avx2);
  }
#endif
  return success;
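
For reference, the computation that kvz_satd_8bit_8x8_general_avx2 vectorizes can be sketched in plain C roughly as follows. This is an illustration only, not part of this commit: the function name satd_8x8_scalar_sketch is made up, and kvz_pixel is assumed to be the 8-bit pixel type used above.

#include <stdlib.h>  // abs()

// Scalar 8x8 SATD sketch: Hadamard-transform the residual along rows and
// columns with three butterfly stages each, then sum the absolute values
// of the coefficients and apply the same (sum + 2) >> 2 scaling as the
// AVX2 kernel. The coefficient order differs from the vectorized version,
// but the sum of absolute values is identical.
static unsigned satd_8x8_scalar_sketch(const kvz_pixel *buf1, unsigned stride1,
                                       const kvz_pixel *buf2, unsigned stride2)
{
  int d[64];
  for (int y = 0; y < 8; ++y) {
    for (int x = 0; x < 8; ++x) {
      d[y * 8 + x] = (int)buf1[y * stride1 + x] - (int)buf2[y * stride2 + x];
    }
  }

  // dir 0 transforms each row (element step 1, rows 8 apart);
  // dir 1 transforms each column (element step 8, columns 1 apart).
  for (int dir = 0; dir < 2; ++dir) {
    const int step = dir == 0 ? 1 : 8;
    const int line = dir == 0 ? 8 : 1;
    for (int i = 0; i < 8; ++i) {
      int *v = &d[i * line];
      for (int span = 1; span < 8; span <<= 1) {  // three butterfly stages
        for (int j = 0; j < 8; j += 2 * span) {
          for (int k = 0; k < span; ++k) {
            const int a = v[(j + k) * step];
            const int b = v[(j + k + span) * step];
            v[(j + k) * step]        = a + b;
            v[(j + k + span) * step] = a - b;
          }
        }
      }
    }
  }

  unsigned sum = 0;
  for (int i = 0; i < 64; ++i) {
    sum += (unsigned)abs(d[i]);
  }
  return (sum + 2) >> 2;
}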