Create a header file for shared AVX2 code

2024-11-30 20:54:07 +00:00 · 2018-12-18 18:13:50 +02:00 · 2018-12-18 18:13:50 +02:00 · d02207306d
parent 361bf0c7db
commit d02207306d
2 changed files with 98 additions and 91 deletions
--- a/src/strategies/avx2/avx2_common_functions.h
+++ b/src/strategies/avx2/avx2_common_functions.h
@ -0,0 +1,97 @@
+#ifndef AVX2_COMMON_FUNCTIONS_H
+#define AVX2_COMMON_FUNCTIONS_H
+
+#include <immintrin.h>
+
+static INLINE __m256i scanord_read_vector(const int16_t *__restrict coeff, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width)
+{
+  // For vectorized reordering of coef and q_coef
+  const __m128i low128_shuffle_masks[3] = {
+    _mm_setr_epi8(10,11,  4, 5, 12,13,  0, 1,  6, 7, 14,15,  8, 9,  2, 3),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5,  6, 7,  0, 1,  2, 3, 12,13, 14,15,  8, 9, 10,11),
+  };
+
+  const __m128i blend_masks[3] = {
+    _mm_setr_epi16( 0,  0,  0, -1,  0,  0, -1, -1),
+    _mm_setr_epi16( 0,  0,  0,  0,  0,  0,  0,  0),
+    _mm_setr_epi16( 0,  0, -1, -1,  0,  0, -1, -1),
+  };
+
+  const __m128i invec_rearr_masks_upper[3] = {
+    _mm_setr_epi8( 0, 1,  8, 9,  2, 3,  6, 7, 10,11,  4, 5, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 0, 1,  8, 9,  4, 5, 12,13,  2, 3, 10,11,  6, 7, 14,15),
+  };
+
+  const __m128i invec_rearr_masks_lower[3] = {
+    _mm_setr_epi8(12,13,  6, 7,  0, 1,  2, 3, 14,15,  4, 5,  8, 9, 10,11),
+    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
+    _mm_setr_epi8( 4, 5, 12,13,  0, 1,  8, 9,  6, 7, 14,15,  2, 3, 10,11),
+  };
+
+  const size_t row_offsets[4] = {
+    scan[subpos] + width * 0,
+    scan[subpos] + width * 1,
+    scan[subpos] + width * 2,
+    scan[subpos] + width * 3,
+  };
+
+  // NOTE: Upper means "higher in pixel order inside block", which implies
+  // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
+  // so upper 128b vector actually becomes the lower part of a 256-bit coeff
+  // vector and lower vector the higher part!
+  __m128d   coeffs_d_upper = _mm_castsi128_pd(_mm_set1_epi8(0));
+  __m128d   coeffs_d_lower = _mm_castsi128_pd(_mm_set1_epi8(0));
+
+  __m128i   coeffs_upper;
+  __m128i   coeffs_lower;
+
+  __m128i   coeffs_rearr1_upper;
+  __m128i   coeffs_rearr1_lower;
+
+  __m128i   coeffs_rearr2_upper;
+  __m128i   coeffs_rearr2_lower;
+
+  coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0]));
+  coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1]));
+
+  coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2]));
+  coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3]));
+
+  coeffs_upper   = _mm_castpd_si128(coeffs_d_upper);
+  coeffs_lower   = _mm_castpd_si128(coeffs_d_lower);
+
+  coeffs_lower   = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);
+
+  coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);
+  coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);
+
+  coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);
+  coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);
+
+  // Why, oh why, is there no _mm256_setr_m128i intrinsic in the header that
+  // would do the exact same operation in the exact same way? :(
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),
+                                 coeffs_rearr2_lower,
+                                 1);
+}
+
+// If ints is completely zero, returns 16 in *first and -1 in *last
+static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
+{
+  // Note that nonzero_bytes will always have both bytes set for a set word
+  // even if said word only had one of its bytes set, because we're doing 16
+  // bit wide comparisons. No big deal, just shift results to the right by one
+  // bit to have the results represent indexes of first set words, not bytes.
+  // Another note, it has to use right shift instead of division to preserve
+  // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
+  const __m256i zero = _mm256_setzero_si256();
+
+  __m256i zeros = _mm256_cmpeq_epi16(ints, zero);
+  uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
+  *first = (    (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
+  *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
+}
+
+#endif
--- a/src/strategies/avx2/encode_coding_tree-avx2.c
+++ b/src/strategies/avx2/encode_coding_tree-avx2.c
@ -20,6 +20,7 @@

 #include "strategyselector.h"

+#include "avx2_common_functions.h"
 #include "cabac.h"
 #include "context.h"
 #include "encode_coding_tree-avx2.h"
@ -203,97 +204,6 @@ static INLINE __m256i kvz_context_get_sig_ctx_inc_16x16b(int32_t pattern_sig_ctx
  return rv;
 }

-static INLINE __m256i scanord_read_vector(const int16_t *coeff, const uint32_t *scan, int8_t scan_mode, int32_t subpos, int32_t width)
-{
-  // For vectorized reordering of coef and q_coef
-  const __m128i low128_shuffle_masks[3] = {
-    _mm_setr_epi8(10,11,  4, 5, 12,13,  0, 1,  6, 7, 14,15,  8, 9,  2, 3),
-    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
-    _mm_setr_epi8( 4, 5,  6, 7,  0, 1,  2, 3, 12,13, 14,15,  8, 9, 10,11),
-  };
-
-  const __m128i blend_masks[3] = {
-    _mm_setr_epi16( 0,  0,  0, -1,  0,  0, -1, -1),
-    _mm_setr_epi16( 0,  0,  0,  0,  0,  0,  0,  0),
-    _mm_setr_epi16( 0,  0, -1, -1,  0,  0, -1, -1),
-  };
-
-  const __m128i invec_rearr_masks_upper[3] = {
-    _mm_setr_epi8( 0, 1,  8, 9,  2, 3,  6, 7, 10,11,  4, 5, 12,13, 14,15),
-    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
-    _mm_setr_epi8( 0, 1,  8, 9,  4, 5, 12,13,  2, 3, 10,11,  6, 7, 14,15),
-  };
-
-  const __m128i invec_rearr_masks_lower[3] = {
-    _mm_setr_epi8(12,13,  6, 7,  0, 1,  2, 3, 14,15,  4, 5,  8, 9, 10,11),
-    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
-    _mm_setr_epi8( 4, 5, 12,13,  0, 1,  8, 9,  6, 7, 14,15,  2, 3, 10,11),
-  };
-
-  const size_t row_offsets[4] = {
-    scan[subpos] + width * 0,
-    scan[subpos] + width * 1,
-    scan[subpos] + width * 2,
-    scan[subpos] + width * 3,
-  };
-
-  // NOTE: Upper means "higher in pixel order inside block", which implies
-  // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
-  // so upper 128b vector actually becomes the lower part of a 256-bit coeff
-  // vector and lower vector the higher part!
-  __m128d   coeffs_d_upper = _mm_castsi128_pd(_mm_set1_epi8(0));
-  __m128d   coeffs_d_lower = _mm_castsi128_pd(_mm_set1_epi8(0));
-
-  __m128i   coeffs_upper;
-  __m128i   coeffs_lower;
-
-  __m128i   coeffs_rearr1_upper;
-  __m128i   coeffs_rearr1_lower;
-
-  __m128i   coeffs_rearr2_upper;
-  __m128i   coeffs_rearr2_lower;
-
-  coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0]));
-  coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1]));
-
-  coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2]));
-  coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3]));
-
-  coeffs_upper   = _mm_castpd_si128(coeffs_d_upper);
-  coeffs_lower   = _mm_castpd_si128(coeffs_d_lower);
-
-  coeffs_lower   = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);
-
-  coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);
-  coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);
-
-  coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);
-  coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);
-
-  // Why, oh why, is there no _mm256_setr_m128i intrinsic in the header that
-  // would do the exact same operation in the exact same way? :(
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),
-                                 coeffs_rearr2_lower,
-                                 1);
-}
-
-// If ints is completely zero, returns 16 in *first and -1 in *last
-static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
-{
-  // Note that nonzero_bytes will always have both bytes set for a set word
-  // even if said word only had one of its bytes set, because we're doing 16
-  // bit wide comparisons. No big deal, just shift results to the right by one
-  // bit to have the results represent indexes of first set words, not bytes.
-  // Another note, it has to use right shift instead of division to preserve
-  // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)
-  const __m256i zero = _mm256_setzero_si256();
-
-  __m256i zeros = _mm256_cmpeq_epi16(ints, zero);
-  uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));
-  *first = (    (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;
-  *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;
-}
-
 /**
 * \brief Encode (X,Y) position of the last significant coefficient
 *