diff --git a/src/strategies/avx2/alf-avx2.c b/src/strategies/avx2/alf-avx2.c index 1d4a8e9c..e832eb48 100644 --- a/src/strategies/avx2/alf-avx2.c +++ b/src/strategies/avx2/alf-avx2.c @@ -249,14 +249,23 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state, for (int b = 0; b < 4; b++) { alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local; - }*/ + }*/ - __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]); + } + for (int k = 0; k < num_coeff-1; k+=2) + { + __m128i e_local_1 = _mm_loadu_si128((__m128i*) & e_local[k][0]); __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); - __m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]); - _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig)); + __m256i orig = _mm256_loadu_si256((__m256i*) & alf_covariance[class_idx].y[k][0]); + _mm256_storeu_si256((__m256i*)alf_covariance[class_idx].y[k], _mm256_add_epi32(multiplied, orig)); } + __m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[num_coeff-1][0]); + __m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1); + __m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32); + __m128i orig = _mm_loadu_si128((__m128i*) & alf_covariance[class_idx].y[num_coeff - 1][0]); + _mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[num_coeff - 1], _mm_add_epi32(_mm256_castsi256_si128(multiplied), orig)); + alf_covariance[class_idx].pix_acc += y_local * (double)y_local; } org += org_stride;