[alf] Change the processing in alf_get_blk_stats_avx2() to allow utilizing the whole 256-bit register

This commit is contained in:
Marko Viitanen 2021-08-27 13:40:28 +03:00
parent fdf125f406
commit 26f18865f7

View file

@@ -249,14 +249,23 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state,
for (int b = 0; b < 4; b++)
{
alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local;
}*/
}*/
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]);
}
for (int k = 0; k < num_coeff-1; k+=2)
{
__m128i e_local_1 = _mm_loadu_si128((__m128i*) & e_local[k][0]);
__m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1);
__m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32);
__m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]);
_mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig));
__m256i orig = _mm256_loadu_si256((__m256i*) & alf_covariance[class_idx].y[k][0]);
_mm256_storeu_si256((__m256i*)alf_covariance[class_idx].y[k], _mm256_add_epi32(multiplied, orig));
}
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[num_coeff-1][0]);
__m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1);
__m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32);
__m128i orig = _mm_loadu_si128((__m128i*) & alf_covariance[class_idx].y[num_coeff - 1][0]);
_mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[num_coeff - 1], _mm_add_epi32(_mm256_castsi256_si128(multiplied), orig));
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
}
org += org_stride;