[avx2] and last

This commit is contained in:
Joose Sainio 2023-04-14 09:55:09 +03:00
parent 12fea6f8b1
commit 963db5a407

View file

@ -1385,7 +1385,7 @@ static INLINE void update_states_avx2(
} }
} }
uint32_t level_offset = scan_pos & 15; uint32_t level_offset = scan_pos & 15;
__m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(255)); __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
uint32_t max_abs_s[4]; uint32_t max_abs_s[4];
_mm_storeu_epi32(max_abs_s, max_abs); _mm_storeu_epi32(max_abs_s, max_abs);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
@ -1527,6 +1527,7 @@ static INLINE void update_states_avx2(
} }
__m128i sum_abs = _mm_srli_epi32(tinit, 8); __m128i sum_abs = _mm_srli_epi32(tinit, 8);
sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32));
switch (numIPos) { switch (numIPos) {
case 5: case 5:
{ {
@ -1534,8 +1535,7 @@ static INLINE void update_states_avx2(
levels, levels,
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
1); 1);
t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(t, sum_abs);
sum_abs = _mm_add_epi32(sum_abs, t);
} }
case 4: case 4:
{ {
@ -1543,8 +1543,7 @@ static INLINE void update_states_avx2(
levels, levels,
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
1); 1);
t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(t, sum_abs);
sum_abs = _mm_add_epi32(sum_abs, t);
} }
case 3: case 3:
{ {
@ -1552,8 +1551,7 @@ static INLINE void update_states_avx2(
levels, levels,
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
1); 1);
t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(t, sum_abs);
sum_abs = _mm_add_epi32(sum_abs, t);
} }
case 2: case 2:
{ {
@ -1561,8 +1559,7 @@ static INLINE void update_states_avx2(
levels, levels,
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
1); 1);
t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(t, sum_abs);
sum_abs = _mm_add_epi32(sum_abs, t);
} }
case 1: case 1:
{ {
@ -1570,12 +1567,12 @@ static INLINE void update_states_avx2(
levels, levels,
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
1); 1);
t = _mm_and_epi32(t, first_byte); sum_abs = _mm_add_epi32(t, sum_abs);
sum_abs = _mm_add_epi32(sum_abs, t);
} break; } break;
default: default:
assert(0); assert(0);
} }
sum_abs = _mm_and_epi32(sum_abs, first_byte);
if (extRiceFlag) { if (extRiceFlag) {
assert(0 && "Not implemented for avx2"); assert(0 && "Not implemented for avx2");
} else { } else {
@ -1815,7 +1812,7 @@ static INLINE void updateState(
state->all_gte_four &= state->m_remRegBins[state_id] >= 4; state->all_gte_four &= state->m_remRegBins[state_id] >= 4;
state->all_lt_four &= state->m_remRegBins[state_id] < 4; state->all_lt_four &= state->m_remRegBins[state_id] < 4;
uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]);
levels[scan_pos & 15] = (uint8_t)MIN(255, decisions->absLevel[decision_id]); levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]);
if (state->m_remRegBins[state_id] >= 4) { if (state->m_remRegBins[state_id] >= 4) {
coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)];