diff --git a/src/dep_quant.c b/src/dep_quant.c index c47b6892..2656f9aa 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -325,12 +325,12 @@ static void reset_common_context(common_context* ctx, const rate_estimator_t * r memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits)); uint8_t* next_sbb_memory = ctx->sbb_memory; uint8_t* next_level_memory = ctx->level_memory; - for (int k = 0; k < 8; k++, next_sbb_memory += numSbb, next_level_memory += num_coeff) { + for (int k = 0; k < 2; k++, next_sbb_memory += numSbb * 4llu, next_level_memory += num_coeff * 4llu) { ctx->m_allSbbCtx[k].sbbFlags = next_sbb_memory; ctx->m_allSbbCtx[k].levels = next_level_memory; } ctx->m_curr_sbb_ctx_offset = 0; - ctx->m_prev_sbb_ctx_offset = 4; + ctx->m_prev_sbb_ctx_offset = 1; ctx->num_coeff = num_coeff; } @@ -570,23 +570,35 @@ static INLINE void update_common_context( const int prev_state, const int curr_state) { - const uint32_t numSbb = width_in_sbb * height_in_sbb; - const int curr_state_without_offset = curr_state & 3; - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].levels; + const uint32_t numSbb = width_in_sbb * height_in_sbb; + const int curr_state_without_offset = curr_state & 3; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); - if (prev_state != -1 && ctxs->m_allStates.m_refSbbCtxId[prev_state] >= 0) { - memcpy(sbbFlags, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize); + int8_t prev_sbb_state = ctxs->m_allStates.m_refSbbCtxId[prev_state]; + if (prev_state != -1 && prev_sbb_state >= 0) { + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb_state]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[scan_pos * 4 + i * 4 + prev_sbb_state]; + } } else { - memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); - memset(levels + scan_pos, 0, setCpSize); + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state_without_offset] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = 0; + } + } + sbbFlags[cg_pos * 4 + curr_state_without_offset] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; + for (int i = 0; i < 16; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = ctxs->m_allStates.m_absLevels[curr_state / 4][i * 4 + curr_state_without_offset]; } - sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state], 16 * sizeof(uint8_t)); - const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right] : false) || (next_sbb_below ? sbbFlags[next_sbb_below] : false) ? 1 : 0); + const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right * 4 + curr_state_without_offset] : false) + || (next_sbb_below ? sbbFlags[next_sbb_below* 4 + curr_state_without_offset] : false) ? 1 : 0); ctxs->m_allStates.m_numSigSbb[curr_state] = 0; if (prev_state != -1) { ctxs->m_allStates.m_remRegBins[curr_state] = ctxs->m_allStates.m_remRegBins[prev_state]; @@ -604,11 +616,11 @@ static INLINE void update_common_context( uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset >> 2]; const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = levels + scanBeg; + const uint8_t* absLevels = levels + scanBeg * 4; for (int id = 0; id < 16; id++, nbOut++) { if (nbOut->num) { coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0; -#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } +#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k] * 4 + curr_state_without_offset]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } UPDATE(0); if (nbOut->num > 1) { UPDATE(1); @@ -623,13 +635,15 @@ static INLINE void update_common_context( } } #undef UPDATE - templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8); + templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1 << 3)) + (uint16_t)(MIN(127, sumAbs) << 8); } else { templateCtxInit[curr_state_without_offset + id * 4] = 0; } } - memset(ctxs->m_allStates.m_absLevels[curr_state], 0, 16 * sizeof(uint8_t)); + for (int i = curr_state_without_offset; i < 64; i += 4) { + ctxs->m_allStates.m_absLevels[curr_state >> 2][i] = 0; + } } @@ -655,18 +669,25 @@ void uvg_dep_quant_update_state_eos( if (decisions->prevId[decision_id] >= 4) { prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } } else if (decisions->prevId[decision_id] >= 0) { prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i + decision_id] = + state->m_absLevels[ctxs->m_prev_state_offset / 4][i + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0; + } } - uint8_t* temp = &state->m_absLevels[curr_state_offset][scan_pos & 15]; + uint8_t* temp = &state->m_absLevels[ctxs->m_curr_state_offset / 4][(scan_pos & 15) * 4 + decision_id]; *temp = (uint8_t)MIN(51, decisions->absLevel[decision_id]); update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right, @@ -714,10 +735,12 @@ void uvg_dep_quant_update_state( ? (unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); for (int i = 0; i < 64; i += 4) { state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; } + for (int i = 0; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i]; + } } else { state->m_numSigSbb[state_id] = 1; @@ -726,21 +749,23 @@ void uvg_dep_quant_update_state( //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - ( decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); - for (int i = 0; i < 64; i += 4) { - state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = 0; + for (int i = decision_id; i < 64; i += 4) { + state->m_absLevels[ctxs->m_curr_state_offset >> 2][i] = 0; + } + for (int i = decision_id; i < 64; i += 4) { + state->m_ctxInit[ctxs->m_curr_state_offset >> 2][i] = 0; } } state->all_gte_four &= state->m_remRegBins[state_id] >= 4; state->all_lt_four &= state->m_remRegBins[state_id] < 4; - uint8_t* levels = state->m_absLevels[state_id]; - levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); + uint8_t* levels = state->m_absLevels[ctxs->m_curr_state_offset >> 2]; + levels[(scan_pos & 15) * 4 + decision_id] = (uint8_t)MIN(32, decisions->absLevel[decision_id]); if (state->m_remRegBins[state_id] >= 4) { coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; } switch (numIPos) { case 5: UPDATE(4); case 4: UPDATE(3); @@ -760,7 +785,7 @@ void uvg_dep_quant_update_state( coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); case 4: UPDATE(3); @@ -784,7 +809,7 @@ void uvg_dep_quant_update_state( } else { coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8; -#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; } +#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; } switch (numIPos) { case 5: UPDATE(4); case 4: UPDATE(3); @@ -1061,10 +1086,8 @@ int uvg_dep_quant( height, compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag()); } - for (int i = 0; i < 8; ++i) { - assert(ctxs->m_allStates.m_refSbbCtxId[i] < 5); - } - if(1){ + + if(0){ printf("%d\n", scanIdx); for (int i = 0; i < 4; i++) { printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]); diff --git a/src/dep_quant.h b/src/dep_quant.h index bd5ef363..6ef54f4d 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -119,7 +119,7 @@ typedef struct { typedef struct { const NbInfoOut* m_nbInfo; uint32_t m_sbbFlagBits[2][2]; - SbbCtx m_allSbbCtx[8]; + SbbCtx m_allSbbCtx[2]; int m_curr_sbb_ctx_offset; int m_prev_sbb_ctx_offset; uint8_t sbb_memory[8 * 1024]; @@ -149,7 +149,7 @@ typedef struct { } depquant_state; typedef struct { int64_t ALIGNED(32) m_rdCost[12]; - uint8_t ALIGNED(32) m_absLevels[12][16]; + uint8_t ALIGNED(32) m_absLevels[3][16 * 4]; uint16_t ALIGNED(32) m_ctxInit[3][16 * 4]; int8_t ALIGNED(16) m_numSigSbb[12]; int ALIGNED(32) m_remRegBins[12]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 9d40e496..1a00be56 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -497,6 +497,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i prev_state; __m128i prev_state_no_offset; __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); if (all_above_four) { prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); @@ -505,16 +506,14 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, prev_state_no_offset ); memset(&state->m_numSigSbb[state_offset], 0, 4); - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevels[state_offset + i], 0, 16 * sizeof(uint8_t)); - } + memset(state->m_absLevels[state_offset >> 2], 0, 64 * sizeof(uint8_t)); + } else if (all_between_zero_and_three) { - prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); + prev_state_no_offset = _mm_load_si128((const __m128i*)decisions->prevId); prev_state = _mm_add_epi32( prev_state_no_offset, - _mm_load_si128((const __m128i*)decisions->prevId) + _mm_set1_epi32(ctxs->m_prev_state_offset) ); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); @@ -527,10 +526,15 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); - int32_t prev_state_scalar[4]; - _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prev_state_scalar[i]], 16 * sizeof(uint8_t)); + __m128i temp_prev_state = _mm_shuffle_epi8(prev_state_no_offset, control); + __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); + __m256i temp_add = _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c, 0, 0x04040404, 0x08080808, 0x0c0c0c0c); + prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_256); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); } } else { int prev_state_s[4] = {-1, -1, -1, -1}; @@ -540,27 +544,31 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, if (decisions->prevId[decision_id] >= 4) { prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } } else if (decisions->prevId[decision_id] >= 0) { prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prev_state_s[i]], 16 * sizeof(uint8_t)); + for (int j = 0; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][j + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t)); + for (int j = i; j < 64; j += 4) { + state->m_absLevels[curr_state_offset >> 2][j] = 0; + } all_have_previous_state = false; } } prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); } uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs, 4); + // Update common context __m128i last; @@ -571,31 +579,40 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, int previous_state_array[4]; _mm_storeu_si128((__m128i*)previous_state_array, prev_state); for (int curr_state = 0; curr_state < 4; ++curr_state) { - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset ].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels; const int p_state = previous_state_array[curr_state]; if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { - const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; - memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); + const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state]; + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb]; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels[scan_pos * 4 + i * 4 + prev_sbb]; + } } else { - memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); - memset(levels + scan_pos, 0, setCpSize); + for (int i = 0; i < numSbb; ++i) { + sbbFlags[i * 4 + curr_state] = 0; + } + for (int i = 16; i < setCpSize; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state] = 0; + } + } + sbbFlags[cg_pos * 4 + curr_state] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; + for (int i = 0; i < 16; ++i) { + levels[scan_pos * 4 + i * 4 + curr_state] = ctxs->m_allStates.m_absLevels[state_offset / 4][i * 4 + curr_state]; } - sbbFlags[cg_pos] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state + state_offset], 16 * sizeof(uint8_t)); } - - __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); - __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); - __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); - __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); - - __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); - __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + + __m128i sbb_right = next_sbb_right ? + _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_right * 4])) : + _mm_set1_epi32(0); + + __m128i sbb_below = next_sbb_below ? + _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_below * 4])) : + _mm_set1_epi32(0); __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); - sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); @@ -621,7 +638,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const int scanBeg = scan_pos - 16; const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg * 4; __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); __m128i first_byte = _mm_set1_epi32(0xff); @@ -629,8 +646,6 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, __m128i fours = _mm_set1_epi32(4); __m256i all[4]; uint64_t temp[4]; - const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, - 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); for (int id = 0; id < 16; id++, nbOut++) { if (nbOut->num == 0) { @@ -646,9 +661,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, switch (nbOut->num) { case 5: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); - __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[4] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -661,9 +674,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 4: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[3] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -674,9 +685,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 3: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[2] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -687,9 +696,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 2: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[1] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -700,9 +707,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); } case 1: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[0] * 4]))); sum_abs = _mm_add_epi32(sum_abs, t); sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); __m128i min_t = _mm_min_epi32( @@ -735,7 +740,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, _mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]); for (int i = 0; i < 4; ++i) { - memset(state->m_absLevels[state_offset + i], 0, 16); + memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4); } } @@ -811,13 +816,13 @@ static INLINE void update_states_avx2( bool rem_reg_all_gte_4 = true; bool rem_reg_all_lt4 = true; + __m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); if (all_non_negative) { __m128i prv_states_o = _mm_load_si128((__m128i const*)decisions->prevId); __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); __m128i prv_states = _mm_add_epi32(prv_states_o, prev_offset); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); @@ -862,17 +867,32 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); - int32_t prv_states_scalar[4]; - _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prv_states_scalar[i]], 16 * sizeof(uint8_t)); + + __m128i temp_prev_state = _mm_shuffle_epi8(prv_states_o, control); + __m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state); + prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0); + __m256i temp_add = _mm256_setr_epi32( + 0, + 0x04040404, + 0x08080808, + 0x0c0c0c0c, + 0, + 0x04040404, + 0x08080808, + 0x0c0c0c0c); + prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add); + for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) { + __m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]); + data = _mm256_shuffle_epi8(data, prev_state_256); + _mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data); } + __m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId); __m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4,8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12,0, 0, 0, 0,0, 0, 0, 0,16, 16, 16, 16, 16, 16, 16, 16); prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask); prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0); prev_state_full = _mm256_slli_epi16(prev_state_full, 1); - __m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); + temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25); prev_state_full = _mm256_add_epi8(prev_state_full, temp_add); for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) { @@ -903,7 +923,7 @@ static INLINE void update_states_avx2( bit_mask = _mm_movemask_epi8(mask); rem_reg_all_lt4 = (bit_mask == 0xFFFF); - memset(state->m_absLevels[state_offset], 0, 16 * sizeof(uint8_t) * 4); + memset(state->m_absLevels[state_offset >> 2], 0, 16 * sizeof(uint8_t) * 4); memset(state->m_ctxInit[state_offset >> 2], 0, 16 * sizeof(uint16_t) * 4); } @@ -922,35 +942,36 @@ static INLINE void update_states_avx2( if (state->m_remRegBins[state_id] >= 4) { state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); } - memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t)); for (int k = 0; k < 16; ++k) { state->m_ctxInit[state_offset >> 2][k * 4 + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; } + for (int k = 0; k < 16; ++k) { + state->m_absLevels[state_offset >> 2][k * 4 + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]]; + } } else { state->m_numSigSbb[state_id] = 1; state->m_refSbbCtxId[state_id] = -1; int ctxBinSampleRatio = 28; //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t)); - for (int k = 0; k < 16; ++k) { - state->m_ctxInit[state_offset >> 2][k * 4 + i] = 0; + for (int k = i; k < 64; k += 4) { + state->m_ctxInit[state_offset >> 2][k] = 0; + state->m_absLevels[state_offset >> 2][k] = 0; } } rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; } } - uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51)); + max_abs = _mm_shuffle_epi8(max_abs, control); + uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0); + memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs,4); + state->all_gte_four = rem_reg_all_gte_4; state->all_lt_four = rem_reg_all_lt4; + if (rem_reg_all_gte_4) { const __m128i first_byte = _mm_set1_epi32(0xff); const __m128i ones = _mm_set1_epi32(1); @@ -961,15 +982,11 @@ static INLINE void update_states_avx2( __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; switch (numIPos) { case 5: { - __m128i t = _mm_i32gather_epi32( - (int *)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -984,11 +1001,7 @@ static INLINE void update_states_avx2( } case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1001,11 +1014,7 @@ static INLINE void update_states_avx2( } case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1018,11 +1027,7 @@ static INLINE void update_states_avx2( } case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1034,11 +1039,7 @@ static INLINE void update_states_avx2( sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); } case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - t = _mm_and_si128(t, first_byte); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); __m128i min_arg = _mm_min_epi32( _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), t @@ -1075,51 +1076,32 @@ static INLINE void update_states_avx2( switch (numIPos) { case 5: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); - // Need this to make sure we don't go beyond 255 - sum_abs = _mm_and_si128(sum_abs, first_byte); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); } case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); sum_abs = _mm_add_epi32(t, sum_abs); } break; default: assert(0); } - sum_abs = _mm_and_si128(sum_abs, first_byte); if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { @@ -1138,7 +1120,7 @@ static INLINE void update_states_avx2( else if (rem_reg_all_lt4) { const __m128i first_byte = _mm_set1_epi32(0xff); - uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset]; + uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2]; const __m128i last_byte = _mm_set1_epi32(0xff); const uint32_t tinit_offset = MIN(level_offset - 1u, 15u); const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0); @@ -1147,48 +1129,34 @@ static INLINE void update_states_avx2( __m128i sum_abs = _mm_srli_epi32(tinit, 8); sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); switch (numIPos) { - case 5: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - // Need this to make sure we don't go beyond 255 - sum_abs = _mm_and_si128(sum_abs, first_byte); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51)); - } - case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - sum_abs = _mm_add_epi32(sum_abs, t); - } break; + case 5: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 4: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 3: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 2: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } + case 1: + { + __m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4]))); + sum_abs = _mm_add_epi32(t, sum_abs); + } break; default: assert(0); } - sum_abs = _mm_and_si128(sum_abs, last_byte); if (extRiceFlag) { assert(0 && "Not implemented for avx2"); } else { @@ -1209,14 +1177,14 @@ static INLINE void update_states_avx2( else { for (int i = 0; i < 4; ++i) { const int state_id = state_offset + i; - uint8_t* levels = (uint8_t*)(state->m_absLevels[state_id]); + uint8_t* levels = (uint8_t*)(state->m_absLevels[state_offset >> 2]); if (state->m_remRegBins[state_id] >= 4) { coeff_t tinit = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]; coeff_t sumAbs1 = (tinit >> 3) & 31; coeff_t sumNum = tinit & 7; #define UPDATE(k) \ { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ sumAbs1 += MIN(4 + (t & 1), t); \ sumNum += !!t; \ } @@ -1238,7 +1206,7 @@ static INLINE void update_states_avx2( coeff_t sumAbs = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i] >> 8; #define UPDATE(k) \ { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ sumAbs += t; \ } switch (numIPos) { @@ -1260,7 +1228,7 @@ static INLINE void update_states_avx2( coeff_t sumAbs = (state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]) >> 8; #define UPDATE(k) \ { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ + coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \ sumAbs += t; \ } switch (numIPos) { @@ -1345,7 +1313,7 @@ void uvg_dep_quant_decide_and_update_avx2( // for (int k = 0; k < 16; ++k) { // printf( // "%3d ", - // ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset / 4][k * 4 + i]); + // ctxs->m_allStates.m_absLevels[ctxs->m_curr_state_offset / 4][k * 4 + i]); // } // printf("\n"); //}