mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-23 18:14:06 +00:00
Remove avx512 intrinsics
This commit is contained in:
parent
254826d396
commit
2caf077cff
|
@ -104,7 +104,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en
|
||||||
value = _mm_min_epi32(value, max_rice);
|
value = _mm_min_epi32(value, max_rice);
|
||||||
// In the original implementation the goRiceTab is selected beforehand, but since we need to load from
|
// In the original implementation the goRiceTab is selected beforehand, but since we need to load from
|
||||||
// potentially four different locations, we need to calculate the offsets and use gather
|
// potentially four different locations, we need to calculate the offsets and use gather
|
||||||
__m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start]));
|
__m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i *)&state->m_goRicePar[start]));
|
||||||
go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
|
go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
|
||||||
value = _mm_add_epi32(value, go_rice_tab);
|
value = _mm_add_epi32(value, go_rice_tab);
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ static void check_rd_costs_avx2(const all_depquant_states* const state, const en
|
||||||
|
|
||||||
__m128i max_rice = _mm_set1_epi32(31);
|
__m128i max_rice = _mm_set1_epi32(31);
|
||||||
value = _mm_min_epi32(value, max_rice);
|
value = _mm_min_epi32(value, max_rice);
|
||||||
__m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start]));
|
__m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
|
||||||
go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
|
go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
|
||||||
value = _mm_add_epi32(value, go_rice_tab);
|
value = _mm_add_epi32(value, go_rice_tab);
|
||||||
|
|
||||||
|
@ -727,7 +727,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
||||||
sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1));
|
sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1));
|
||||||
// Gather is not necessary here, but it would require at least five operations to do the same thing
|
// Gather is not necessary here, but it would require at least five operations to do the same thing
|
||||||
// so the performance gain in my opinion is not worth the readability loss
|
// so the performance gain in my opinion is not worth the readability loss
|
||||||
__m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8);
|
__m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long int *)cc->m_sbbFlagBits[0], sig_sbb, 8);
|
||||||
_mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits);
|
_mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits);
|
||||||
|
|
||||||
memset(&state->m_numSigSbb[state_offset], 0, 4);
|
memset(&state->m_numSigSbb[state_offset], 0, 4);
|
||||||
|
@ -868,7 +868,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
||||||
__m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0);
|
__m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0);
|
||||||
offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
|
offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
|
||||||
offsets = _mm_add_epi32(offsets, sum_abs_min);
|
offsets = _mm_add_epi32(offsets, sum_abs_min);
|
||||||
__m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8);
|
__m256i sig_frac_bits = _mm256_i32gather_epi64((long long const*)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8);
|
||||||
_mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
|
_mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
|
||||||
|
|
||||||
|
|
||||||
|
@ -959,7 +959,7 @@ static INLINE void update_states_avx2(
|
||||||
|
|
||||||
// Again gather is not necessary but it is easier to read and shouldn't have too large of a performance hit
|
// Again gather is not necessary but it is easier to read and shouldn't have too large of a performance hit
|
||||||
// Should be true for all gathers here
|
// Should be true for all gathers here
|
||||||
__m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8);
|
__m256i sbb_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sbbFracBits[0], prv_states, 8);
|
||||||
_mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
|
_mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
|
||||||
|
|
||||||
// Next three lines: state->m_remRegBins = prvState->m_remRegBins - 1;
|
// Next three lines: state->m_remRegBins = prvState->m_remRegBins - 1;
|
||||||
|
@ -1218,7 +1218,7 @@ static INLINE void update_states_avx2(
|
||||||
_mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1),
|
_mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1),
|
||||||
_mm_set1_epi32(3));
|
_mm_set1_epi32(3));
|
||||||
offsets = _mm_add_epi32(offsets, temp);
|
offsets = _mm_add_epi32(offsets, temp);
|
||||||
__m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8);
|
__m256i sig_frac_bits = _mm256_i32gather_epi64((const long long *)state->m_sigFracBitsArray[state_offset][0], offsets, 8);
|
||||||
_mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
|
_mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
|
||||||
|
|
||||||
sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
|
sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
|
||||||
|
|
Loading…
Reference in a new issue