From f78bf2ebcb5c6c3817a4a7d1873c52bc00034c49 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen
Date: Fri, 30 Nov 2018 17:15:26 +0200
Subject: [PATCH] Optimize q_coefs usage for indexed fetch

---
 src/strategies/avx2/quant-avx2.c | 32 ++++++++------------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 14b07b22..d3b44672 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -98,75 +98,55 @@ static INLINE void rearrange_512(__m256i *hi, __m256i *lo)
 }
 
 static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo,
-                                            __m256i ns, __m256i changes, __m256i q_coefs,
-                                            int16_t *final_change, int32_t *min_pos, int16_t *cheapest_q)
+                                            __m256i ns, __m256i changes,
+                                            int16_t *final_change, int32_t *min_pos)
 {
-  const __m256i zero = _mm256_set1_epi8(0);
-
   // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs,
   // to have the same data layout as in costs. Zero extend to 32b width, shift
   // changes 16 bits to the left, and store them into the same vectors.
   __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes);
   __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes);
-  __m256i tmp2hi = _mm256_unpackhi_epi16(q_coefs, zero);
-  __m256i tmp2lo = _mm256_unpacklo_epi16(q_coefs, zero);
-
   __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31);
   __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20);
-  __m256i pl2hi = _mm256_permute2x128_si256(tmp2lo, tmp2hi, 0x31);
-  __m256i pl2lo = _mm256_permute2x128_si256(tmp2lo, tmp2hi, 0x20);
-
   // Reorder to afford result stability (if multiple atoms tie for cheapest,
   // rightmost ie. the highest is the wanted one)
   rearrange_512(&costs_hi, &costs_lo);
   rearrange_512(&pl1hi, &pl1lo);
-  rearrange_512(&pl2hi, &pl2lo);
 
   // 0: pick hi, 1: pick lo (equality evaluates as 0)
   __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo);
   __m256i cost1 = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1);
   __m256i pl1_1 = _mm256_blendv_epi8(pl1hi, pl1lo, cmpmask1);
-  __m256i pl2_1 = _mm256_blendv_epi8(pl2hi, pl2lo, cmpmask1);
 
   __m256i cost2 = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1));
   __m256i pl1_2 = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1));
-  __m256i pl2_2 = _mm256_shuffle_epi32(pl2_1, _MM_SHUFFLE(2, 3, 0, 1));
 
   __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1);
   __m256i cost3 = _mm256_blendv_epi8(cost2, cost1, cmpmask2);
   __m256i pl1_3 = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2);
-  __m256i pl2_3 = _mm256_blendv_epi8(pl2_2, pl2_1, cmpmask2);
 
   __m256i cost4 = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i pl1_4 = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i pl2_4 = _mm256_shuffle_epi32(pl2_3, _MM_SHUFFLE(1, 0, 3, 2));
 
   __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3);
   __m256i cost5 = _mm256_blendv_epi8(cost4, cost3, cmpmask3);
   __m256i pl1_5 = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3);
-  __m256i pl2_5 = _mm256_blendv_epi8(pl2_4, pl2_3, cmpmask3);
 
   __m256i cost6 = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i pl1_6 = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2));
-  __m256i pl2_6 = _mm256_permute4x64_epi64(pl2_5, _MM_SHUFFLE(1, 0, 3, 2));
 
   __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5);
   __m256i pl1_7 = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4);
-  __m256i pl2_7 = _mm256_blendv_epi8(pl2_6, pl2_5, cmpmask4);
 
   __m128i res1_128 = _mm256_castsi256_si128(pl1_7);
-  __m128i res2_128 = _mm256_castsi256_si128(pl2_7);
 
   uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0);
-  uint32_t tmp2 = (uint32_t)_mm_extract_epi32(res2_128, 0);
 
   uint16_t n = (uint16_t)(tmp1 & 0xffff);
   uint16_t chng = (uint16_t)(tmp1 >> 16);
-  uint16_t chpst = (uint16_t)(tmp2 & 0xffff);
 
   *final_change = (int16_t)chng;
   *min_pos = (int32_t)n;
-  *cheapest_q = (int16_t)chpst;
 }
 
 #define VEC_WIDTH 16
@@ -287,8 +267,10 @@ static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i de
 
       __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l));
       __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h));
 
-      get_cheapest_alternative(costs_h, costs_l, ns, changes, q_coefs, &final_change, &min_pos, &cheapest_q);
+      get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos);
+      const int32_t best_id = scan[min_pos + subpos];
+      cheapest_q = q_coef[best_id];
 
       if (cheapest_q == 32767 || cheapest_q == -32768)
         final_change = -1;
@@ -300,7 +282,7 @@ static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i de
       else
         cheapest_q -= final_change;
 
-      q_coef[scan[min_pos + subpos]] = cheapest_q;
+      q_coef[best_id] = cheapest_q;
     } // Hide
   }
   if (last_cg == 1)
@@ -476,6 +458,8 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t * __restri
                                                 q_coefs_rearr2_lower,
                                                 1);
 
+  // Reordering done
+
   __m256i v_level = _mm256_abs_epi16(v_coef);
   __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
   __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));
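
Note: below is a minimal scalar sketch (not part of the patch, not kvazaar code) of the
pattern the patch switches to: get_cheapest_alternative() now returns only the winning
position, and the quantized coefficient is fetched afterwards with one indexed load through
the scan order. The helper cheapest_pos(), the identity scan table, and all array contents
are illustrative assumptions.

/* Minimal scalar sketch of the indexed-fetch idea (illustrative only;
 * names, sizes and values are assumptions, not the kvazaar API). */
#include <stdint.h>
#include <stdio.h>

#define CG_SIZE 16  /* coefficient-group size, matching the 16-wide vectors above */

/* Reduce costs to the position of the cheapest alternative. This is what the
 * vectorized get_cheapest_alternative() computes; q_coefs no longer needs to
 * ride along through the reduction. */
static int32_t cheapest_pos(const int32_t *costs)
{
  int32_t min_pos = CG_SIZE - 1;
  for (int32_t i = CG_SIZE - 1; i >= 0; i--) {
    if (costs[i] < costs[min_pos]) {
      min_pos = i;  /* strict compare: ties keep the rightmost (highest) position */
    }
  }
  return min_pos;
}

int main(void)
{
  int32_t  costs[CG_SIZE]  = { 9, 7, 7, 8, 3, 9, 9, 4, 6, 5, 9, 9, 2, 8, 9, 9 };
  int16_t  q_coef[CG_SIZE] = { 0 };
  uint32_t scan[CG_SIZE];

  for (uint32_t i = 0; i < CG_SIZE; i++) {
    scan[i]   = i;                 /* identity scan order, just for the sketch */
    q_coef[i] = (int16_t)(i * 3);
  }

  uint32_t subpos  = 0;
  int32_t  min_pos = cheapest_pos(costs);

  /* One indexed fetch replaces the per-lane q_coefs bookkeeping. */
  const int32_t best_id    = scan[min_pos + subpos];
  int16_t       cheapest_q = q_coef[best_id];

  printf("min_pos=%d best_id=%d cheapest_q=%d\n", min_pos, best_id, cheapest_q);
  return 0;
}

Dropping q_coefs from the reduction removes one 256-bit lane (the pl2_* chain) from every
blend/shuffle step of the tournament, at the cost of a single scalar load once min_pos is known.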