uvg266/src/strategies/avx2/avx2_common_functions.h

#ifndef AVX2_COMMON_FUNCTIONS_H
#define AVX2_COMMON_FUNCTIONS_H

#include <immintrin.h>

// The calling convention used by MSVC on 32-bit builds will essentially
// disallow functions to have more than 3 XMM/YMM parameters, because it
// will not provide more than 8-byte param alignment, and only the first
// three vector params will be carried in SIMD registers. Now the
// vectorcall convention could probably be problematic in globally visible
// functions, but likely not in static ones.
//
// FIX_W32 expands to __vectorcall on 32-bit MSVC builds only, and to
// nothing everywhere else.
#if defined _MSC_VER && defined _WIN32 && !defined _WIN64
  #define FIX_W32 __vectorcall
#else
  #define FIX_W32
#endif

// Non-inline functions defined in this header are likely to trigger a
// warning for each module including this header that does NOT use them,
// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW).
// Tell 'em we actually want to do that, it's not an accident.
//
// FIX_UNUSED expands to __attribute__((unused)) on those compilers, and to
// nothing everywhere else.
#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__
  #define FIX_UNUSED __attribute__((unused))
#else
  #define FIX_UNUSED
#endif

// Decoration for static, non-inline functions defined in this header. It is
// simply both fixes above combined: the 32-bit MSVC calling convention and
// the unused-function warning suppression. Note that despite its name, the
// macro itself contains nothing that prevents the compiler from inlining.
#define FIX_NOINLINE FIX_W32 FIX_UNUSED

/*
 * Reorder coefficients from raster to scan order, one 4x4 group at a time.
 *
 * For each of the n_bufs input buffers, four rows of four int16 coefficients
 * are read, starting at element offset scan[subpos] and advancing by width
 * elements per row. The 16 values are permuted according to scan_mode and
 * stored as one 256-bit vector in result_vecs.
 *
 * Fun fact: Once upon a time, doing this in a loop looked like this:
 * for (int32_t n = 0; n < width * height; n++) {
 *   coef_reord[n] = coef[scan[n]];
 *   q_coef_reord[n] = q_coef[scan[n]];
 * }
 *
 * coeffs      Array of n_bufs pointers to raster-order coefficient buffers.
 * scan        Scan table; scan[subpos] is used as the raster offset of the
 *             first coefficient of the group to read.
 * scan_mode   Index into the 3-entry mask tables below, so it must be 0..2.
 *             Mode 1 uses identity masks, i.e. raster order is kept as is.
 *             NOTE(review): presumably 0 = diagonal, 1 = horizontal,
 *             2 = vertical; confirm against the project's scan mode enum.
 * subpos      Index into scan.
 * width       Row stride of the coefficient buffers, in int16 elements.
 * result_vecs Output array receiving n_bufs vectors of 16 coefficients.
 * n_bufs      Number of buffers to process.
 */
static INLINE void scanord_read_vector(const int16_t **__restrict coeffs, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs)
{
  // For vectorized reordering of coef and q_coef. Every table has one entry
  // per scan mode.

  // Byte shuffle applied to the lower two rows before blending
  const __m128i low128_shuffle_masks[3] = {
    _mm_setr_epi8(10,11,  4, 5, 12,13,  0, 1,  6, 7, 14,15,  8, 9,  2, 3),
    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
    _mm_setr_epi8( 4, 5,  6, 7,  0, 1,  2, 3, 12,13, 14,15,  8, 9, 10,11),
  };

  // Words set to -1 are exchanged between the upper and lower halves
  const __m128i blend_masks[3] = {
    _mm_setr_epi16( 0,  0,  0, -1,  0,  0, -1, -1),
    _mm_setr_epi16( 0,  0,  0,  0,  0,  0,  0,  0),
    _mm_setr_epi16( 0,  0, -1, -1,  0,  0, -1, -1),
  };

  // Final in-vector byte shuffle of the upper half
  const __m128i invec_rearr_masks_upper[3] = {
    _mm_setr_epi8( 0, 1,  8, 9,  2, 3,  6, 7, 10,11,  4, 5, 12,13, 14,15),
    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
    _mm_setr_epi8( 0, 1,  8, 9,  4, 5, 12,13,  2, 3, 10,11,  6, 7, 14,15),
  };

  // Final in-vector byte shuffle of the lower half
  const __m128i invec_rearr_masks_lower[3] = {
    _mm_setr_epi8(12,13,  6, 7,  0, 1,  2, 3, 14,15,  4, 5,  8, 9, 10,11),
    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
    _mm_setr_epi8( 4, 5, 12,13,  0, 1,  8, 9,  6, 7, 14,15,  2, 3, 10,11),
  };

  // Element offsets of the four rows of the group, shared by all buffers
  const size_t row_offsets[4] = {
    scan[subpos] + width * 0,
    scan[subpos] + width * 1,
    scan[subpos] + width * 2,
    scan[subpos] + width * 3,
  };

  for (int i = 0; i < n_bufs; i++) {
    const int16_t *__restrict coeff = coeffs[i];

    // NOTE: Upper means "higher in pixel order inside block", which implies
    // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
    // so upper 128b vector actually becomes the lower part of a 256-bit coeff
    // vector and lower vector the higher part!

    // Zeroing these is actually unnecessary, but the compiler will whine
    // about uninitialized values otherwise
    __m128d coeffs_d_upper = _mm_setzero_pd();
    __m128d coeffs_d_lower = _mm_setzero_pd();

    // Each 64-bit load fetches one row of four int16 coefficients. The data
    // is only read, so the casts keep it const-qualified.
    coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (const double *)(coeff + row_offsets[0]));
    coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (const double *)(coeff + row_offsets[1]));

    coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (const double *)(coeff + row_offsets[2]));
    coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (const double *)(coeff + row_offsets[3]));

    // Reinterpret the loaded bits as integers, no conversion takes place
    const __m128i coeffs_upper = _mm_castpd_si128(coeffs_d_upper);
    __m128i       coeffs_lower = _mm_castpd_si128(coeffs_d_lower);

    coeffs_lower = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);

    // Exchange the words selected by the blend mask between the two halves
    const __m128i coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);
    const __m128i coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);

    // Bring the words of each half into their final order
    const __m128i coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);
    const __m128i coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);

    // The Intel Intrinsics Guide talks about _mm256_setr_m128i but my headers
    // lack such an instruction. What it does is essentially this anyway.
    result_vecs[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),
                                             coeffs_rearr2_lower,
                                             1);
  }
}

// Find the indexes of the first and last nonzero 16-bit words of ints and
// store them in *first and *last, respectively.
// If ints is completely zero, returns 16 in *first and -1 in *last
static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
{
  // The comparison is 16 bits wide, so every word contributes two identical
  // bits to the byte mask, no matter which of its bytes were nonzero.
  const __m256i  is_zero_word   = _mm256_cmpeq_epi16(ints, _mm256_setzero_si256());
  const uint32_t zero_byte_mask = (uint32_t)_mm256_movemask_epi8(is_zero_word);
  const uint32_t nz_byte_mask   = ~zero_byte_mask;

  // Byte positions of the lowest and highest set bits. An empty mask gives
  // 32 from both counts, i.e. byte positions 32 and -1.
  const int32_t first_nz_byte =      (int32_t)_tzcnt_u32(nz_byte_mask);
  const int32_t last_nz_byte  = 31 - (int32_t)_lzcnt_u32(nz_byte_mask);

  // Halve byte positions to get word indexes. This has to be a right shift
  // rather than a division to preserve the result for an all-zero vector
  // (-1 / 2 == 0, but -1 >> 1 == -1).
  *first = first_nz_byte >> 1;
  *last  = last_nz_byte  >> 1;
}

// Horizontal sum: add together all eight 32-bit elements of v and return the
// result. The additions are done with 32-bit vector adds.
static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v)
{
  // Swap the two 128-bit halves and add, so that both halves hold the sums
  // of corresponding elements
  const __m256i halves_swapped = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2));
  __m256i acc = _mm256_add_epi32(v, halves_swapped);

  // Within each 128-bit half, add the upper 64 bits onto the lower 64 bits
  acc = _mm256_add_epi32(acc, _mm256_shuffle_epi32(acc, _MM_SHUFFLE(1, 0, 3, 2)));

  // Finally add neighboring 32-bit elements; element 0 now holds the total
  acc = _mm256_add_epi32(acc, _mm256_shuffle_epi32(acc, _MM_SHUFFLE(2, 3, 0, 1)));

  return _mm_cvtsi128_si32(_mm256_castsi256_si128(acc));
}

#endif
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00			`#ifndef AVX2_COMMON_FUNCTIONS_H`
			`#define AVX2_COMMON_FUNCTIONS_H`

			`#include <immintrin.h>`

Move hsum_8x32b to avx2_common_functions 2020-02-27 15:52:17 +00:00			`// The calling convention used by MSVC on 32-bit builds will essentially`
			`// disallow functions to have more than 3 XMM/YMM parameters, because it`
			`// will not provide more than 8-byte param alignment, and only the first`
			`// three vector params will be carried in SIMD registers. Now the`
			`// vectorcall convention could probably be problematic in globally visible`
			`// funcitons, but likely not in static ones.`
			`#if defined _MSC_VER && defined _WIN32 && !defined _WIN64`
			`#define FIX_W32 __vectorcall`
			`#else`
			`#define FIX_W32`
			`#endif`

Fix warning about potentially unused hsum_8x32b There's a lot of alternative options available, such as making it globally visible with a kvz_ prefix, force inlining it, or anything. This could be good too, hope it won't be compiled at all to translation units where it's not used. 2020-04-02 13:27:39 +00:00			`// Non-inline functions defined in this header are likely to trigger a`
			`// warning for each module including this header that does NOT use them,`
			`// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW).`
			`// Tell 'em we actually want to do that, it's not an accident.`
			`#if defined __GNUC__ \|\| defined __clang__ \|\| defined __MINGW32__ \|\| defined __MINGW64__`
			`#define FIX_UNUSED __attribute__((unused))`
Fix define for use with Visual Studio. 2020-04-03 13:11:01 +00:00			`#else`
			`#define FIX_UNUSED`
Fix warning about potentially unused hsum_8x32b There's a lot of alternative options available, such as making it globally visible with a kvz_ prefix, force inlining it, or anything. This could be good too, hope it won't be compiled at all to translation units where it's not used. 2020-04-02 13:27:39 +00:00			`#endif`

			`#define FIX_NOINLINE FIX_W32 FIX_UNUSED`

Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`/*`
			`* Reorder coefficients from raster to scan order`
			`* Fun fact: Once upon a time, doing this in a loop looked like this:`
			`* for (int32_t n = 0; n < width * height; n++) {`
			`* coef_reord[n] = coef[scan[n]];`
			`* q_coef_reord[n] = q_coef[scan[n]];`
			`* }`
			`*/`
			`static INLINE void scanord_read_vector(const int16_t *__restrict coeffs, const uint32_t __restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs)`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00			`{`
			`// For vectorized reordering of coef and q_coef`
			`const __m128i low128_shuffle_masks[3] = {`
			`_mm_setr_epi8(10,11, 4, 5, 12,13, 0, 1, 6, 7, 14,15, 8, 9, 2, 3),`
			`_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),`
			`_mm_setr_epi8( 4, 5, 6, 7, 0, 1, 2, 3, 12,13, 14,15, 8, 9, 10,11),`
			`};`

			`const __m128i blend_masks[3] = {`
			`_mm_setr_epi16( 0, 0, 0, -1, 0, 0, -1, -1),`
			`_mm_setr_epi16( 0, 0, 0, 0, 0, 0, 0, 0),`
			`_mm_setr_epi16( 0, 0, -1, -1, 0, 0, -1, -1),`
			`};`

			`const __m128i invec_rearr_masks_upper[3] = {`
			`_mm_setr_epi8( 0, 1, 8, 9, 2, 3, 6, 7, 10,11, 4, 5, 12,13, 14,15),`
			`_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),`
			`_mm_setr_epi8( 0, 1, 8, 9, 4, 5, 12,13, 2, 3, 10,11, 6, 7, 14,15),`
			`};`

			`const __m128i invec_rearr_masks_lower[3] = {`
			`_mm_setr_epi8(12,13, 6, 7, 0, 1, 2, 3, 14,15, 4, 5, 8, 9, 10,11),`
			`_mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15),`
			`_mm_setr_epi8( 4, 5, 12,13, 0, 1, 8, 9, 6, 7, 14,15, 2, 3, 10,11),`
			`};`

			`const size_t row_offsets[4] = {`
			`scan[subpos] + width * 0,`
			`scan[subpos] + width * 1,`
			`scan[subpos] + width * 2,`
			`scan[subpos] + width * 3,`
			`};`

Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`for (int i = 0; i < n_bufs; i++) {`
			`const int16_t *__restrict coeff = coeffs[i];`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`// NOTE: Upper means "higher in pixel order inside block", which implies`
			`// lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),`
			`// so upper 128b vector actually becomes the lower part of a 256-bit coeff`
			`// vector and lower vector the higher part!`
			`__m128d coeffs_d_upper;`
			`__m128d coeffs_d_lower;`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`__m128i coeffs_upper;`
			`__m128i coeffs_lower;`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`__m128i coeffs_rearr1_upper;`
			`__m128i coeffs_rearr1_lower;`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`__m128i coeffs_rearr2_upper;`
			`__m128i coeffs_rearr2_lower;`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`// Zeroing these is actually unnecessary, but the compiler will whine`
			`// about uninitialized values otherwise`
			`coeffs_d_upper = _mm_setzero_pd();`
			`coeffs_d_lower = _mm_setzero_pd();`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0]));`
			`coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1]));`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2]));`
			`coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3]));`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`coeffs_upper = _mm_castpd_si128(coeffs_d_upper);`
			`coeffs_lower = _mm_castpd_si128(coeffs_d_lower);`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`coeffs_lower = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00
Move CG reordering code from quant-avx2 to shared header 2018-12-18 17:34:35 +00:00			`coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);`
			`coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);`

			`coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);`
			`coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);`

			`// The Intel Intrinsics Guide talks about _mm256_setr_m128i but my headers`
			`// lack such an instruction. What it does is essentially this anyway.`
			`result_vecs[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),`
			`coeffs_rearr2_lower,`
			`1);`
			`}`
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00			`}`

			`// If ints is completely zero, returns 16 in first and -1 in last`
			`static INLINE void get_first_last_nz_int16(__m256i ints, int32_t first, int32_t last)`
			`{`
			`// Note that nonzero_bytes will always have both bytes set for a set word`
			`// even if said word only had one of its bytes set, because we're doing 16`
			`// bit wide comparisons. No big deal, just shift results to the right by one`
			`// bit to have the results represent indexes of first set words, not bytes.`
			`// Another note, it has to use right shift instead of division to preserve`
			`// behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1)`
			`const __m256i zero = _mm256_setzero_si256();`

			`__m256i zeros = _mm256_cmpeq_epi16(ints, zero);`
			`uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros));`
			`*first = ( (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1;`
			`*last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1;`
			`}`

Fix warning about potentially unused hsum_8x32b There's a lot of alternative options available, such as making it globally visible with a kvz_ prefix, force inlining it, or anything. This could be good too, hope it won't be compiled at all to translation units where it's not used. 2020-04-02 13:27:39 +00:00			`static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v)`
Move hsum_8x32b to avx2_common_functions 2020-02-27 15:52:17 +00:00			`{`
			`__m256i sum1 = v;`
			`__m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2));`
			`__m256i sum3 = _mm256_add_epi32 (sum1, sum2);`
			`__m256i sum4 = _mm256_shuffle_epi32 (sum3, _MM_SHUFFLE(1, 0, 3, 2));`
			`__m256i sum5 = _mm256_add_epi32 (sum3, sum4);`
			`__m256i sum6 = _mm256_shuffle_epi32 (sum5, _MM_SHUFFLE(2, 3, 0, 1));`
			`__m256i sum7 = _mm256_add_epi32 (sum5, sum6);`

			`__m128i sum8 = _mm256_castsi256_si128 (sum7);`
			`int32_t sum9 = _mm_cvtsi128_si32 (sum8);`
			`return sum9;`
			`}`
Fix warning about potentially unused hsum_8x32b There's a lot of alternative options available, such as making it globally visible with a kvz_ prefix, force inlining it, or anything. This could be good too, hope it won't be compiled at all to translation units where it's not used. 2020-04-02 13:27:39 +00:00
Create a header file for shared AVX2 code 2018-12-18 16:13:50 +00:00			`#endif`