use the efficient method of reordering raster->scan

2024-12-03 13:44:05 +00:00 · 2018-12-10 22:51:53 +02:00 · 2018-12-10 22:51:53 +02:00 · a01362e638
parent 50a888e789
commit a01362e638
1 changed files with 78 additions and 6 deletions
--- a/src/strategies/avx2/encode_coding_tree-avx2.c
+++ b/src/strategies/avx2/encode_coding_tree-avx2.c
@ -26,6 +26,80 @@
 #include "kvz_math.h"
 #include <immintrin.h>
 static INLINE __m256i scanord_read_vector(const int16_t *coeff, const uint32_t *scan, int8_t scan_mode, int32_t subpos, int32_t width)
 {
  // For vectorized reordering of coef and q_coef
  const __m128i low128_shuffle_masks[3] = {
    _mm_setr_epi8(10,11,  4, 5, 12,13,  0, 1,  6, 7, 14,15,  8, 9,  2, 3),
    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
    _mm_setr_epi8( 4, 5,  6, 7,  0, 1,  2, 3, 12,13, 14,15,  8, 9, 10,11),
  };
  const __m128i blend_masks[3] = {
    _mm_setr_epi16( 0,  0,  0, -1,  0,  0, -1, -1),
    _mm_setr_epi16( 0,  0,  0,  0,  0,  0,  0,  0),
    _mm_setr_epi16( 0,  0, -1, -1,  0,  0, -1, -1),
  };
  const __m128i invec_rearr_masks_upper[3] = {
    _mm_setr_epi8( 0, 1,  8, 9,  2, 3,  6, 7, 10,11,  4, 5, 12,13, 14,15),
    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
    _mm_setr_epi8( 0, 1,  8, 9,  4, 5, 12,13,  2, 3, 10,11,  6, 7, 14,15),
  };
  const __m128i invec_rearr_masks_lower[3] = {
    _mm_setr_epi8(12,13,  6, 7,  0, 1,  2, 3, 14,15,  4, 5,  8, 9, 10,11),
    _mm_setr_epi8( 0, 1,  2, 3,  4, 5,  6, 7,  8, 9, 10,11, 12,13, 14,15),
    _mm_setr_epi8( 4, 5, 12,13,  0, 1,  8, 9,  6, 7, 14,15,  2, 3, 10,11),
  };
  const size_t row_offsets[4] = {
    scan[subpos] + width * 0,
    scan[subpos] + width * 1,
    scan[subpos] + width * 2,
    scan[subpos] + width * 3,
  };
  // NOTE: Upper means "higher in pixel order inside block", which implies
  // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER),
  // so upper 128b vector actually becomes the lower part of a 256-bit coeff
  // vector and lower vector the higher part!
  __m128d   coeffs_d_upper = _mm_castsi128_pd(_mm_set1_epi8(0));
  __m128d   coeffs_d_lower = _mm_castsi128_pd(_mm_set1_epi8(0));
  __m128i   coeffs_upper;
  __m128i   coeffs_lower;
  __m128i   coeffs_rearr1_upper;
  __m128i   coeffs_rearr1_lower;
  __m128i   coeffs_rearr2_upper;
  __m128i   coeffs_rearr2_lower;
  coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0]));
  coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1]));
  coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2]));
  coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3]));
  coeffs_upper   = _mm_castpd_si128(coeffs_d_upper);
  coeffs_lower   = _mm_castpd_si128(coeffs_d_lower);
  coeffs_lower   = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]);
  coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]);
  coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]);
  coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]);
  coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]);
  // Why, oh why, is there no _mm256_setr_m128i intrinsic in the header that
  // would do the exact same operation in the exact same way? :(
  return _mm256_insertf128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper),
                                 coeffs_rearr2_lower,
                                 1);
 }
 // If ints is completely zero, returns 16 in *first and -1 in *last
 static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last)
 {
@ -198,15 +272,13 @@ void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state,
  // Rest of the code assumes at least one non-zero coeff.
  assert(scan_cg_last_not_found == 0);
  // TODO: reorder coeffs in the fast if painstaking way?
  ALIGNED(64) int16_t coeff_reord[LCU_WIDTH * LCU_WIDTH];
-
+  for (int32_t i = scan_cg_last; i >= 0; i--) {
-  for (int i = 0; i < width * width; i++) {
+    int32_t subpos = i * 16;
-    coeff_reord[i] = coeff[scan[i]];
+    __m256i coeffs_r = scanord_read_vector(coeff, scan, scan_mode, subpos, width);
    _mm256_store_si256((__m256i *)(coeff_reord + subpos), coeffs_r);
  }
  const uint32_t SCAN_WIDTH = 8;
  // Find the last coeff by going backwards in scan order.
  uint32_t scan_pos_last;
  uint32_t baseaddr = scan_cg_last * 16;