Optimize another loop

This commit is contained in:
Ari Lemmetti 2015-11-12 18:42:20 +02:00
parent 234d3fc6e0
commit b78460b02c

View file

@ -104,11 +104,44 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2];
for (int32_t n = 0; n < width * height; n++) { for (int32_t n = 0; n < width * height; n += 16) {
int32_t level;
level = coef[n]; __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits;
delta_u[n] = (int32_t)(((int64_t)abs(coef[n]) * quant_coeff[n] - (level << q_bits)) >> q_bits8); v_level = _mm256_abs_epi16(v_level);
__m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
__m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));
__m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
__m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0));
__m256i v_level32_a = _mm256_madd_epi16(low_a, low_b);
__m256i v_level32_b = _mm256_madd_epi16(high_a, high_b);
v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add));
v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add));
v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits);
v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits);
v_level = _mm256_packs_epi32(v_level32_a, v_level32_b);
__m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n]));
__m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
__m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
__m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
__m256i v_quant_coeff_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0));
v_coef_a = _mm256_madd_epi16(v_coef_a, v_quant_coeff_a);
v_coef_b = _mm256_madd_epi16(v_coef_b, v_quant_coeff_b);
v_coef_a = _mm256_sub_epi32(v_coef_a, _mm256_slli_epi32(_mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)), q_bits) );
v_coef_b = _mm256_sub_epi32(v_coef_b, _mm256_slli_epi32(_mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)), q_bits) );
v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8);
v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8);
_mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a));
_mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1));
_mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b));
_mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1));
} }
if (ac_sum >= 2) { if (ac_sum >= 2) {