mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00
[alf] Change the processing in alf_get_blk_stats_avx2() to allow utilizing the whole 256-bit register
This commit is contained in:
parent
fdf125f406
commit
26f18865f7
|
@ -251,12 +251,21 @@ static void alf_get_blk_stats_avx2(encoder_state_t* const state,
|
|||
alf_covariance[class_idx].y[k][b] += e_local[k][b] * (double)y_local;
|
||||
}*/
|
||||
|
||||
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[k][0]);
|
||||
}
|
||||
for (int k = 0; k < num_coeff-1; k+=2)
|
||||
{
|
||||
__m128i e_local_1 = _mm_loadu_si128((__m128i*) & e_local[k][0]);
|
||||
__m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1);
|
||||
__m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32);
|
||||
__m128i orig = _mm_loadu_si128((__m128i*) &alf_covariance[class_idx].y[k][0]);
|
||||
_mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[k], _mm_add_epi32(_mm256_castsi256_si128(multiplied),orig));
|
||||
__m256i orig = _mm256_loadu_si256((__m256i*) & alf_covariance[class_idx].y[k][0]);
|
||||
_mm256_storeu_si256((__m256i*)alf_covariance[class_idx].y[k], _mm256_add_epi32(multiplied, orig));
|
||||
}
|
||||
__m128i e_local_1 = _mm_loadl_epi64((__m128i*) & e_local[num_coeff-1][0]);
|
||||
__m256i e_local_32 = _mm256_cvtepi16_epi32(e_local_1);
|
||||
__m256i multiplied = _mm256_mullo_epi32(y_local_32, e_local_32);
|
||||
__m128i orig = _mm_loadu_si128((__m128i*) & alf_covariance[class_idx].y[num_coeff - 1][0]);
|
||||
_mm_storeu_si128((__m128i*)alf_covariance[class_idx].y[num_coeff - 1], _mm_add_epi32(_mm256_castsi256_si128(multiplied), orig));
|
||||
|
||||
alf_covariance[class_idx].pix_acc += y_local * (double)y_local;
|
||||
}
|
||||
org += org_stride;
|
||||
|
|
Loading…
Reference in a new issue