From c6cc96f5eccf1b512268a0aee8eaf5b44547415f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Wed, 16 Mar 2016 19:47:51 +0200 Subject: [PATCH] Optimize sao band ddistortion --- src/strategies/avx2/sao-avx2.c | 38 ++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c index 60ba930f..84860d85 100644 --- a/src/strategies/avx2/sao-avx2.c +++ b/src/strategies/avx2/sao-avx2.c @@ -307,21 +307,37 @@ int kvz_sao_band_ddistortion_avx2(const encoder_state_t * const state, const kvz int shift = state->encoder_control->bitdepth-5; int sum = 0; + __m256i v_accum = { 0 }; + for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - int band = (rec_data[y * block_width + x] >> shift) - band_pos; - int offset = 0; - if (band >= 0 && band < 4) { - offset = sao_bands[band]; - } - if (offset != 0) { - int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x]; - // Offset is applied to reconstruction, so it is subtracted from diff. - sum += (diff - offset) * (diff - offset) - diff * diff; - } + for (x = 0; x < block_width; x+=8) { + + __m256i v_band = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x]))); + v_band = _mm256_srli_epi32(v_band, shift); + v_band = _mm256_sub_epi32(v_band, _mm256_set1_epi32(band_pos)); + + __m256i v_offset = { 0 }; + __m256i v_mask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_set1_epi32(~3), v_band), _mm256_setzero_si256()); + v_offset = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)sao_bands)), v_band); + + v_offset = _mm256_and_si256(v_offset, v_mask); + + + __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x]))); + __m256i v_rec = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x]))); + v_diff = _mm256_sub_epi32(v_diff, v_rec); + __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); + __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); + v_accum = _mm256_add_epi32(v_accum, v_temp_sum); } } + //Full horizontal sum + v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1))); + v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(2, 3, 0, 1))); + v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, KVZ_PERMUTE(1, 0, 1, 0))); + sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum)); + return sum; }