Vectorize the coeff group loop to some extent

Pauli Oikkonen 2018-12-10 23:47:07 +02:00
parent a01362e638
commit efd097f5a5


@@ -202,7 +202,7 @@ void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state,
   int8_t be_valid = encoder->cfg.signhide_enable;
   int32_t scan_pos_sig;
   uint32_t go_rice_param = 0;
-  uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
+  uint32_t ctx_sig;
   // CONSTANTS
   const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE;
@@ -340,38 +340,67 @@ void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state,
       int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
                                                                  cg_pos_x, cg_pos_y, width);
-      // TODO: reorder coeff and vectorize?
+      const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+      ALIGNED(64) int16_t abs_coeff_buf[16];
+      ALIGNED(32) int16_t pos_ys_buf[16];
+      ALIGNED(32) int16_t pos_xs_buf[16];
+      const __m256i coeff_pos_zero = _mm256_castsi128_si256(_mm_cvtsi32_si128(0xffff));
+      const __m128i log2_block_size_128 = _mm_cvtsi32_si128(log2_block_size);
       __m256i coeffs = _mm256_load_si256((__m256i *)(coeff_reord + sub_pos));
       __m256i sigs_inv = _mm256_cmpeq_epi16(coeffs, zero);
+      __m256i is = _mm256_set1_epi16(i);
+      __m256i is_zero = _mm256_cmpeq_epi16(is, zero);
+      // TODO!
+      __m256i coeffs_subzero = _mm256_cmpgt_epi16(zero, coeffs);
+      __m256i masked_coeffs = _mm256_andnot_si256(sigs_inv, coeffs);
+      __m256i abs_coeffs = _mm256_abs_epi16(masked_coeffs);
+      // TODO: obtain 16-bit block positions, maybe? :P
+      __m256i blk_poses_hi = _mm256_loadu_si256((__m256i *)(scan + sub_pos + 8));
+      __m256i blk_poses_lo = _mm256_loadu_si256((__m256i *)(scan + sub_pos + 0));
+      __m256i blk_poses_tmp = _mm256_packs_epi32(blk_poses_lo, blk_poses_hi);
+      __m256i blk_poses = _mm256_permute4x64_epi64(blk_poses_tmp, _MM_SHUFFLE(3, 1, 2, 0));
+      __m256i pos_ys = _mm256_srl_epi16(blk_poses, log2_block_size_128);
+      __m256i pos_xs = _mm256_sub_epi16(blk_poses, _mm256_sll_epi16(pos_ys, log2_block_size_128));
+      _mm256_store_si256((__m256i *)pos_ys_buf, pos_ys);
+      _mm256_store_si256((__m256i *)pos_xs_buf, pos_xs);
+      __m256i encode_sig_coeff_flags_inv = _mm256_andnot_si256(is_zero, coeff_pos_zero);
+      get_first_last_nz_int16(masked_coeffs, &first_nz_pos_in_cg, &last_nz_pos_in_cg);
+      _mm256_store_si256((__m256i *)abs_coeff_buf, abs_coeffs);
+      uint32_t esc_flags = ~(_mm256_movemask_epi8(encode_sig_coeff_flags_inv));
+      uint32_t sigs = ~(_mm256_movemask_epi8(sigs_inv));
+      uint32_t coeff_sign_buf = _mm256_movemask_epi8(coeffs_subzero);
+      // TODO: get first and last nz
       for (; scan_pos_sig >= sub_pos; scan_pos_sig--) {
-        blk_pos = scan[scan_pos_sig];
-        pos_y = blk_pos >> log2_block_size;
-        pos_x = blk_pos - (pos_y << log2_block_size);
-        sig = (coeff_reord[scan_pos_sig] != 0) ? 1 : 0;
+        uint32_t id = scan_pos_sig - sub_pos;
+        uint32_t shamt = (id << 1) + 1;
-        if (scan_pos_sig > sub_pos || i == 0 || num_non_zero != 0) {
-          ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
+        uint32_t curr_sig = (sigs >> shamt) & 1;
+        uint32_t curr_esc_flag = (esc_flags >> shamt) & 1;
+        uint32_t curr_coeff_sign = (coeff_sign_buf >> shamt) & 1;
+        uint32_t curr_pos_x = pos_xs_buf[id];
+        uint32_t curr_pos_y = pos_ys_buf[id];
+        if (curr_esc_flag | num_non_zero) {
+          ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, curr_pos_x, curr_pos_y,
                                                 log2_block_size, type);
           cabac->cur_ctx = &baseCtx[ctx_sig];
-          CABAC_BIN(cabac, sig, "sig_coeff_flag");
+          CABAC_BIN(cabac, curr_sig, "sig_coeff_flag");
         }
-        if (sig) {
-          abs_coeff[num_non_zero] = abs(coeff_reord[scan_pos_sig]);
-          coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0);
+        if (curr_sig) {
+          abs_coeff[num_non_zero] = abs_coeff_buf[id];
+          coeff_signs = 2 * coeff_signs + curr_coeff_sign;
           num_non_zero++;
         }
       }
-      __m256i masked_coeffs = _mm256_andnot_si256(sigs_inv, coeffs);
-      get_first_last_nz_int16(masked_coeffs, &first_nz_pos_in_cg, &last_nz_pos_in_cg);
     } else {
       scan_pos_sig = sub_pos - 1;
     }
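
Where the flag words come from: comparing 16-bit coefficient lanes against zero produces all-ones or all-zero lanes, and _mm256_movemask_epi8 then packs one bit per byte, so every coefficient owns two identical bits of the 32-bit result. That is why the scalar loop extracts bit (id << 1) + 1. A minimal standalone sketch of the idiom, not the committed code (the coefficient values are made up; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  /* 16 coefficients, as in one 4x4 coefficient group. */
  __m256i coeffs = _mm256_setr_epi16(0, 3, 0, -2, 1, 0, 0, 7,
                                     0, 0, -1, 0, 4, 0, 0, 0);
  __m256i zero = _mm256_setzero_si256();

  /* All-ones lanes where coeff == 0, resp. coeff < 0. */
  __m256i sigs_inv       = _mm256_cmpeq_epi16(coeffs, zero);
  __m256i coeffs_subzero = _mm256_cmpgt_epi16(zero, coeffs);

  /* movemask packs one bit per byte: each 16-bit lane yields 2 bits. */
  uint32_t sigs  = ~(uint32_t)_mm256_movemask_epi8(sigs_inv);
  uint32_t signs = (uint32_t)_mm256_movemask_epi8(coeffs_subzero);

  for (int id = 0; id < 16; id++) {
    uint32_t shamt = (id << 1) + 1;  /* high bit of this lane's pair */
    printf("id %2d: sig %u sign %u\n",
           id, (sigs >> shamt) & 1, (signs >> shamt) & 1);
  }
  return 0;
}

Bit 2*id would serve equally well, since both bits of a compare-result lane agree.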
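The 32-bit scan[] positions are narrowed to 16-bit lanes with a pack-and-permute pair: _mm256_packs_epi32 packs within each 128-bit lane, which leaves the quadwords in 0, 2, 1, 3 order, and _mm256_permute4x64_epi64 with _MM_SHUFFLE(3, 1, 2, 0) swaps the middle quadwords back into place. A sketch of just this step (illustrative; it assumes the positions fit in int16_t, which holds since blk_pos < 1024 for the 32x32 maximum transform size):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  /* 16 consecutive 32-bit values standing in for scan positions. */
  int32_t scan[16];
  for (int32_t i = 0; i < 16; i++) scan[i] = i * 10;

  __m256i lo = _mm256_loadu_si256((const __m256i *)(scan + 0));
  __m256i hi = _mm256_loadu_si256((const __m256i *)(scan + 8));

  /* Per-128-bit-lane pack: quadword order becomes 0-3, 8-11, 4-7, 12-15. */
  __m256i tmp = _mm256_packs_epi32(lo, hi);

  /* Swap the two middle quadwords back to sequential order. */
  __m256i seq = _mm256_permute4x64_epi64(tmp, _MM_SHUFFLE(3, 1, 2, 0));

  int16_t out[16];
  _mm256_storeu_si256((__m256i *)out, seq);
  for (int i = 0; i < 16; i++) printf("%d ", out[i]);  /* 0 10 ... 150 */
  printf("\n");
  return 0;
}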
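The per-coefficient positions and the escape flags reduce to plain scalar arithmetic. pos_y is the block-relative row, blk_pos >> log2_block_size, and pos_x the column, where subtracting pos_y << log2_block_size is the same as masking with width - 1. The esc_flags word is the old condition scan_pos_sig > sub_pos || i == 0 evaluated per position: its bit is clear only for the first scan position of a coefficient group other than the DC group (i == 0), the one place whose sig_coeff_flag may be inferred, while the num_non_zero != 0 term stays scalar as curr_esc_flag | num_non_zero. A scalar restatement with made-up example values (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  const uint32_t log2_block_size = 3;  /* 8x8 transform block */
  const uint32_t blk_pos = 21;         /* raster index inside the block */

  /* Same arithmetic the vector code does with srl/sll on 16-bit lanes. */
  uint32_t pos_y = blk_pos >> log2_block_size;            /* 21 >> 3 == 2 */
  uint32_t pos_x = blk_pos - (pos_y << log2_block_size);  /* 21 - 16 == 5 */
  assert(pos_y == 2 && pos_x == 5);
  assert(pos_x == (blk_pos & ((1u << log2_block_size) - 1)));

  /* One bit of esc_flags, for group index i and in-group position id: */
  for (uint32_t i = 0; i < 4; i++) {
    for (uint32_t id = 0; id < 16; id++) {
      uint32_t esc_flag = (id > 0) || (i == 0);
      (void)esc_flag;  /* ORed with num_non_zero in the committed loop */
    }
  }
  return 0;
}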