mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-30 12:44:07 +00:00
[dep_quant] Change order of absLevels
This commit is contained in:
parent
f312fe4a2e
commit
9280d35d96
|
@ -325,12 +325,12 @@ static void reset_common_context(common_context* ctx, const rate_estimator_t * r
|
|||
memcpy(&ctx->m_sbbFlagBits, &rate_estimator->m_sigSbbFracBits, sizeof(rate_estimator->m_sigSbbFracBits));
|
||||
uint8_t* next_sbb_memory = ctx->sbb_memory;
|
||||
uint8_t* next_level_memory = ctx->level_memory;
|
||||
for (int k = 0; k < 8; k++, next_sbb_memory += numSbb, next_level_memory += num_coeff) {
|
||||
for (int k = 0; k < 2; k++, next_sbb_memory += numSbb * 4llu, next_level_memory += num_coeff * 4llu) {
|
||||
ctx->m_allSbbCtx[k].sbbFlags = next_sbb_memory;
|
||||
ctx->m_allSbbCtx[k].levels = next_level_memory;
|
||||
}
|
||||
ctx->m_curr_sbb_ctx_offset = 0;
|
||||
ctx->m_prev_sbb_ctx_offset = 4;
|
||||
ctx->m_prev_sbb_ctx_offset = 1;
|
||||
ctx->num_coeff = num_coeff;
|
||||
}
|
||||
|
||||
|
@ -570,23 +570,35 @@ static INLINE void update_common_context(
|
|||
const int prev_state,
|
||||
const int curr_state)
|
||||
{
|
||||
const uint32_t numSbb = width_in_sbb * height_in_sbb;
|
||||
const int curr_state_without_offset = curr_state & 3;
|
||||
uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].sbbFlags;
|
||||
uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + curr_state_without_offset].levels;
|
||||
const uint32_t numSbb = width_in_sbb * height_in_sbb;
|
||||
const int curr_state_without_offset = curr_state & 3;
|
||||
uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags;
|
||||
uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels;
|
||||
size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t);
|
||||
if (prev_state != -1 && ctxs->m_allStates.m_refSbbCtxId[prev_state] >= 0) {
|
||||
memcpy(sbbFlags, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].sbbFlags, numSbb * sizeof(uint8_t));
|
||||
memcpy(levels + scan_pos, cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[prev_state]].levels + scan_pos, setCpSize);
|
||||
int8_t prev_sbb_state = ctxs->m_allStates.m_refSbbCtxId[prev_state];
|
||||
if (prev_state != -1 && prev_sbb_state >= 0) {
|
||||
for (int i = 0; i < numSbb; ++i) {
|
||||
sbbFlags[i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb_state];
|
||||
}
|
||||
for (int i = 16; i < setCpSize; ++i) {
|
||||
levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[scan_pos * 4 + i * 4 + prev_sbb_state];
|
||||
}
|
||||
}
|
||||
else {
|
||||
memset(sbbFlags, 0, numSbb * sizeof(uint8_t));
|
||||
memset(levels + scan_pos, 0, setCpSize);
|
||||
for (int i = 0; i < numSbb; ++i) {
|
||||
sbbFlags[i * 4 + curr_state_without_offset] = 0;
|
||||
}
|
||||
for (int i = 16; i < setCpSize; ++i) {
|
||||
levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = 0;
|
||||
}
|
||||
}
|
||||
sbbFlags[cg_pos * 4 + curr_state_without_offset] = !!ctxs->m_allStates.m_numSigSbb[curr_state];
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
levels[scan_pos * 4 + i * 4 + curr_state_without_offset] = ctxs->m_allStates.m_absLevels[curr_state / 4][i * 4 + curr_state_without_offset];
|
||||
}
|
||||
sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state];
|
||||
memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state], 16 * sizeof(uint8_t));
|
||||
|
||||
const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right] : false) || (next_sbb_below ? sbbFlags[next_sbb_below] : false) ? 1 : 0);
|
||||
const int sigNSbb = ((next_sbb_right ? sbbFlags[next_sbb_right * 4 + curr_state_without_offset] : false)
|
||||
|| (next_sbb_below ? sbbFlags[next_sbb_below* 4 + curr_state_without_offset] : false) ? 1 : 0);
|
||||
ctxs->m_allStates.m_numSigSbb[curr_state] = 0;
|
||||
if (prev_state != -1) {
|
||||
ctxs->m_allStates.m_remRegBins[curr_state] = ctxs->m_allStates.m_remRegBins[prev_state];
|
||||
|
@ -604,11 +616,11 @@ static INLINE void update_common_context(
|
|||
uint16_t *templateCtxInit = ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset >> 2];
|
||||
const int scanBeg = scan_pos - 16;
|
||||
const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg;
|
||||
const uint8_t* absLevels = levels + scanBeg;
|
||||
const uint8_t* absLevels = levels + scanBeg * 4;
|
||||
for (int id = 0; id < 16; id++, nbOut++) {
|
||||
if (nbOut->num) {
|
||||
coeff_t sumAbs = 0, sumAbs1 = 0, sumNum = 0;
|
||||
#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k]]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; }
|
||||
#define UPDATE(k) {coeff_t t=absLevels[nbOut->outPos[k] * 4 + curr_state_without_offset]; sumAbs+=t; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; }
|
||||
UPDATE(0);
|
||||
if (nbOut->num > 1) {
|
||||
UPDATE(1);
|
||||
|
@ -623,13 +635,15 @@ static INLINE void update_common_context(
|
|||
}
|
||||
}
|
||||
#undef UPDATE
|
||||
templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1) << 3) + ((uint16_t)MIN(127, sumAbs) << 8);
|
||||
templateCtxInit[curr_state_without_offset + id * 4] = (uint16_t)(sumNum) + ((uint16_t)(sumAbs1 << 3)) + (uint16_t)(MIN(127, sumAbs) << 8);
|
||||
}
|
||||
else {
|
||||
templateCtxInit[curr_state_without_offset + id * 4] = 0;
|
||||
}
|
||||
}
|
||||
memset(ctxs->m_allStates.m_absLevels[curr_state], 0, 16 * sizeof(uint8_t));
|
||||
for (int i = curr_state_without_offset; i < 64; i += 4) {
|
||||
ctxs->m_allStates.m_absLevels[curr_state >> 2][i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -655,18 +669,25 @@ void uvg_dep_quant_update_state_eos(
|
|||
if (decisions->prevId[decision_id] >= 4) {
|
||||
prvState = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4);
|
||||
state->m_numSigSbb[curr_state_offset] = 0;
|
||||
memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t));
|
||||
for (int i = decision_id; i < 64; i += 4) {
|
||||
state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0;
|
||||
}
|
||||
}
|
||||
else if (decisions->prevId[decision_id] >= 0) {
|
||||
prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
|
||||
state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prvState] || !!decisions->absLevel[decision_id];
|
||||
memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prvState], 16 * sizeof(uint8_t));
|
||||
for (int i = 0; i < 64; i += 4) {
|
||||
state->m_absLevels[ctxs->m_curr_state_offset / 4][i + decision_id] =
|
||||
state->m_absLevels[ctxs->m_prev_state_offset / 4][i + decisions->prevId[decision_id]];
|
||||
}
|
||||
}
|
||||
else {
|
||||
state->m_numSigSbb[curr_state_offset] = 1;
|
||||
memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t));
|
||||
for (int i = decision_id; i < 64; i += 4) {
|
||||
state->m_absLevels[ctxs->m_curr_state_offset / 4][i] = 0;
|
||||
}
|
||||
}
|
||||
uint8_t* temp = &state->m_absLevels[curr_state_offset][scan_pos & 15];
|
||||
uint8_t* temp = &state->m_absLevels[ctxs->m_curr_state_offset / 4][(scan_pos & 15) * 4 + decision_id];
|
||||
*temp = (uint8_t)MIN(51, decisions->absLevel[decision_id]);
|
||||
|
||||
update_common_context(ctxs, state->m_commonCtx, scan_pos, cg_pos, width_in_sbb, height_in_sbb, next_sbb_right,
|
||||
|
@ -714,10 +735,12 @@ void uvg_dep_quant_update_state(
|
|||
? (unsigned)decisions->absLevel[decision_id]
|
||||
: 3);
|
||||
}
|
||||
memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t));
|
||||
for (int i = 0; i < 64; i += 4) {
|
||||
state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i];
|
||||
}
|
||||
for (int i = 0; i < 64; i += 4) {
|
||||
state->m_absLevels[ctxs->m_curr_state_offset >> 2][decision_id + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][prev_id_no_offset + i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
state->m_numSigSbb[state_id] = 1;
|
||||
|
@ -726,21 +749,23 @@ void uvg_dep_quant_update_state(
|
|||
//(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
|
||||
state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (
|
||||
decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
|
||||
memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t));
|
||||
for (int i = 0; i < 64; i += 4) {
|
||||
state->m_ctxInit[ctxs->m_curr_state_offset >> 2][decision_id + i] = 0;
|
||||
for (int i = decision_id; i < 64; i += 4) {
|
||||
state->m_absLevels[ctxs->m_curr_state_offset >> 2][i] = 0;
|
||||
}
|
||||
for (int i = decision_id; i < 64; i += 4) {
|
||||
state->m_ctxInit[ctxs->m_curr_state_offset >> 2][i] = 0;
|
||||
}
|
||||
}
|
||||
state->all_gte_four &= state->m_remRegBins[state_id] >= 4;
|
||||
state->all_lt_four &= state->m_remRegBins[state_id] < 4;
|
||||
uint8_t* levels = state->m_absLevels[state_id];
|
||||
levels[scan_pos & 15] = (uint8_t)MIN(32, decisions->absLevel[decision_id]);
|
||||
uint8_t* levels = state->m_absLevels[ctxs->m_curr_state_offset >> 2];
|
||||
levels[(scan_pos & 15) * 4 + decision_id] = (uint8_t)MIN(32, decisions->absLevel[decision_id]);
|
||||
|
||||
if (state->m_remRegBins[state_id] >= 4) {
|
||||
coeff_t tinit = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id];
|
||||
coeff_t sumAbs1 = (tinit >> 3) & 31;
|
||||
coeff_t sumNum = tinit & 7;
|
||||
#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; }
|
||||
#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs1+=MIN(4+(t&1),t); sumNum+=!!t; }
|
||||
switch (numIPos) {
|
||||
case 5: UPDATE(4);
|
||||
case 4: UPDATE(3);
|
||||
|
@ -760,7 +785,7 @@ void uvg_dep_quant_update_state(
|
|||
|
||||
|
||||
coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8;
|
||||
#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; }
|
||||
#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; }
|
||||
switch (numIPos) {
|
||||
case 5: UPDATE(4);
|
||||
case 4: UPDATE(3);
|
||||
|
@ -784,7 +809,7 @@ void uvg_dep_quant_update_state(
|
|||
}
|
||||
else {
|
||||
coeff_t sumAbs = state->m_ctxInit[ctxs->m_curr_state_offset >> 2][((scan_pos - 1) & 15) * 4 + decision_id] >> 8;
|
||||
#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k]]; sumAbs+=t; }
|
||||
#define UPDATE(k) {coeff_t t=levels[next_nb_info_ssb.inPos[k] * 4 + decision_id]; sumAbs+=t; }
|
||||
switch (numIPos) {
|
||||
case 5: UPDATE(4);
|
||||
case 4: UPDATE(3);
|
||||
|
@ -1061,10 +1086,8 @@ int uvg_dep_quant(
|
|||
height,
|
||||
compID != 0); //tu.cu->slice->getReverseLastSigCoeffFlag());
|
||||
}
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
assert(ctxs->m_allStates.m_refSbbCtxId[i] < 5);
|
||||
}
|
||||
if(1){
|
||||
|
||||
if(0){
|
||||
printf("%d\n", scanIdx);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
printf("%lld %hu %d\n", ctxs->m_trellis[scanIdx].rdCost[i], ctxs->m_trellis[scanIdx].absLevel[i], ctxs->m_trellis[scanIdx].prevId[i]);
|
||||
|
|
|
@ -119,7 +119,7 @@ typedef struct {
|
|||
typedef struct {
|
||||
const NbInfoOut* m_nbInfo;
|
||||
uint32_t m_sbbFlagBits[2][2];
|
||||
SbbCtx m_allSbbCtx[8];
|
||||
SbbCtx m_allSbbCtx[2];
|
||||
int m_curr_sbb_ctx_offset;
|
||||
int m_prev_sbb_ctx_offset;
|
||||
uint8_t sbb_memory[8 * 1024];
|
||||
|
@ -149,7 +149,7 @@ typedef struct {
|
|||
} depquant_state;
|
||||
typedef struct {
|
||||
int64_t ALIGNED(32) m_rdCost[12];
|
||||
uint8_t ALIGNED(32) m_absLevels[12][16];
|
||||
uint8_t ALIGNED(32) m_absLevels[3][16 * 4];
|
||||
uint16_t ALIGNED(32) m_ctxInit[3][16 * 4];
|
||||
int8_t ALIGNED(16) m_numSigSbb[12];
|
||||
int ALIGNED(32) m_remRegBins[12];
|
||||
|
|
|
@ -497,6 +497,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
__m128i prev_state;
|
||||
__m128i prev_state_no_offset;
|
||||
__m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel);
|
||||
__m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||
if (all_above_four) {
|
||||
prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset);
|
||||
prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4));
|
||||
|
@ -505,16 +506,14 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
prev_state_no_offset
|
||||
);
|
||||
memset(&state->m_numSigSbb[state_offset], 0, 4);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
memset(state->m_absLevels[state_offset + i], 0, 16 * sizeof(uint8_t));
|
||||
}
|
||||
memset(state->m_absLevels[state_offset >> 2], 0, 64 * sizeof(uint8_t));
|
||||
|
||||
} else if (all_between_zero_and_three) {
|
||||
prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
|
||||
prev_state_no_offset = _mm_load_si128((const __m128i*)decisions->prevId);
|
||||
prev_state = _mm_add_epi32(
|
||||
prev_state_no_offset,
|
||||
_mm_load_si128((const __m128i*)decisions->prevId)
|
||||
_mm_set1_epi32(ctxs->m_prev_state_offset)
|
||||
);
|
||||
__m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||
__m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00));
|
||||
__m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb);
|
||||
num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes);
|
||||
|
@ -527,10 +526,15 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0);
|
||||
memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4);
|
||||
|
||||
int32_t prev_state_scalar[4];
|
||||
_mm_storeu_si128((__m128i*)prev_state_scalar, prev_state);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prev_state_scalar[i]], 16 * sizeof(uint8_t));
|
||||
__m128i temp_prev_state = _mm_shuffle_epi8(prev_state_no_offset, control);
|
||||
__m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state);
|
||||
prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0);
|
||||
__m256i temp_add = _mm256_setr_epi32(0, 0x04040404, 0x08080808, 0x0c0c0c0c, 0, 0x04040404, 0x08080808, 0x0c0c0c0c);
|
||||
prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add);
|
||||
for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) {
|
||||
__m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]);
|
||||
data = _mm256_shuffle_epi8(data, prev_state_256);
|
||||
_mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data);
|
||||
}
|
||||
} else {
|
||||
int prev_state_s[4] = {-1, -1, -1, -1};
|
||||
|
@ -540,27 +544,31 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
if (decisions->prevId[decision_id] >= 4) {
|
||||
prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4);
|
||||
state->m_numSigSbb[curr_state_offset] = 0;
|
||||
memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t));
|
||||
for (int j = i; j < 64; j += 4) {
|
||||
state->m_absLevels[curr_state_offset >> 2][j] = 0;
|
||||
}
|
||||
} else if (decisions->prevId[decision_id] >= 0) {
|
||||
prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
|
||||
state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] || !!decisions->absLevel[decision_id];
|
||||
memcpy(state->m_absLevels[curr_state_offset], state->m_absLevels[prev_state_s[i]], 16 * sizeof(uint8_t));
|
||||
for (int j = 0; j < 64; j += 4) {
|
||||
state->m_absLevels[curr_state_offset >> 2][j + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][j + decisions->prevId[decision_id]];
|
||||
}
|
||||
} else {
|
||||
state->m_numSigSbb[curr_state_offset] = 1;
|
||||
memset(state->m_absLevels[curr_state_offset], 0, 16 * sizeof(uint8_t));
|
||||
for (int j = i; j < 64; j += 4) {
|
||||
state->m_absLevels[curr_state_offset >> 2][j] = 0;
|
||||
}
|
||||
all_have_previous_state = false;
|
||||
}
|
||||
}
|
||||
prev_state = _mm_loadu_si128((__m128i const*)prev_state_s);
|
||||
}
|
||||
uint32_t level_offset = scan_pos & 15;
|
||||
__m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
|
||||
uint32_t max_abs_s[4];
|
||||
_mm_storeu_si128((__m128i*)max_abs_s, max_abs);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i];
|
||||
levels[level_offset] = max_abs_s[i];
|
||||
}
|
||||
__m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51));
|
||||
max_abs = _mm_shuffle_epi8(max_abs, control);
|
||||
uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0);
|
||||
memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs, 4);
|
||||
|
||||
|
||||
// Update common context
|
||||
__m128i last;
|
||||
|
@ -571,31 +579,40 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
int previous_state_array[4];
|
||||
_mm_storeu_si128((__m128i*)previous_state_array, prev_state);
|
||||
for (int curr_state = 0; curr_state < 4; ++curr_state) {
|
||||
uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags;
|
||||
uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels;
|
||||
uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset ].sbbFlags;
|
||||
uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels;
|
||||
const int p_state = previous_state_array[curr_state];
|
||||
if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) {
|
||||
const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state];
|
||||
memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t));
|
||||
memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize);
|
||||
const int prev_sbb = ctxs->m_allStates.m_refSbbCtxId[p_state];
|
||||
for (int i = 0; i < numSbb; ++i) {
|
||||
sbbFlags[i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].sbbFlags[i * 4 + prev_sbb];
|
||||
}
|
||||
for (int i = 16; i < setCpSize; ++i) {
|
||||
levels[scan_pos * 4 + i * 4 + curr_state] = cc->m_allSbbCtx[cc->m_prev_sbb_ctx_offset].levels[scan_pos * 4 + i * 4 + prev_sbb];
|
||||
}
|
||||
} else {
|
||||
memset(sbbFlags, 0, numSbb * sizeof(uint8_t));
|
||||
memset(levels + scan_pos, 0, setCpSize);
|
||||
for (int i = 0; i < numSbb; ++i) {
|
||||
sbbFlags[i * 4 + curr_state] = 0;
|
||||
}
|
||||
for (int i = 16; i < setCpSize; ++i) {
|
||||
levels[scan_pos * 4 + i * 4 + curr_state] = 0;
|
||||
}
|
||||
}
|
||||
sbbFlags[cg_pos * 4 + curr_state] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset];
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
levels[scan_pos * 4 + i * 4 + curr_state] = ctxs->m_allStates.m_absLevels[state_offset / 4][i * 4 + curr_state];
|
||||
}
|
||||
sbbFlags[cg_pos] = ctxs->m_allStates.m_numSigSbb[curr_state + state_offset];
|
||||
memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevels[curr_state + state_offset], 16 * sizeof(uint8_t));
|
||||
}
|
||||
|
||||
__m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0);
|
||||
__m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right);
|
||||
__m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m);
|
||||
__m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0);
|
||||
|
||||
__m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below));
|
||||
__m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0);
|
||||
|
||||
__m128i sbb_right = next_sbb_right ?
|
||||
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_right * 4])) :
|
||||
_mm_set1_epi32(0);
|
||||
|
||||
__m128i sbb_below = next_sbb_below ?
|
||||
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags[next_sbb_below * 4])) :
|
||||
_mm_set1_epi32(0);
|
||||
|
||||
__m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below);
|
||||
sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff));
|
||||
sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1));
|
||||
__m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8);
|
||||
_mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits);
|
||||
|
@ -621,7 +638,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
|
||||
const int scanBeg = scan_pos - 16;
|
||||
const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg;
|
||||
const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg;
|
||||
const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg * 4;
|
||||
|
||||
__m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0);
|
||||
__m128i first_byte = _mm_set1_epi32(0xff);
|
||||
|
@ -629,8 +646,6 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
__m128i fours = _mm_set1_epi32(4);
|
||||
__m256i all[4];
|
||||
uint64_t temp[4];
|
||||
const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0,
|
||||
31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16);
|
||||
|
||||
for (int id = 0; id < 16; id++, nbOut++) {
|
||||
if (nbOut->num == 0) {
|
||||
|
@ -646,9 +661,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
switch (nbOut->num) {
|
||||
case 5:
|
||||
{
|
||||
__m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4]));
|
||||
__m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[4] * 4])));
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
|
||||
__m128i min_t = _mm_min_epi32(
|
||||
|
@ -661,9 +674,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
|
||||
}
|
||||
case 4: {
|
||||
__m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3]));
|
||||
__m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[3] * 4])));
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
|
||||
__m128i min_t = _mm_min_epi32(
|
||||
|
@ -674,9 +685,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
|
||||
}
|
||||
case 3: {
|
||||
__m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2]));
|
||||
__m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[2] * 4])));
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
|
||||
__m128i min_t = _mm_min_epi32(
|
||||
|
@ -687,9 +696,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
|
||||
}
|
||||
case 2: {
|
||||
__m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1]));
|
||||
__m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[1] * 4])));
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
|
||||
__m128i min_t = _mm_min_epi32(
|
||||
|
@ -700,9 +707,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
|
||||
}
|
||||
case 1: {
|
||||
__m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0]));
|
||||
__m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&absLevels[nbOut->outPos[0] * 4])));
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
|
||||
__m128i min_t = _mm_min_epi32(
|
||||
|
@ -735,7 +740,7 @@ static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos,
|
|||
_mm256_storeu_si256((__m256i*)(&state->m_ctxInit[state_offset >> 2][48]), all[3]);
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
memset(state->m_absLevels[state_offset + i], 0, 16);
|
||||
memset(state->m_absLevels[state_offset >> 2], 0, 16 * 4);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -811,13 +816,13 @@ static INLINE void update_states_avx2(
|
|||
|
||||
bool rem_reg_all_gte_4 = true;
|
||||
bool rem_reg_all_lt4 = true;
|
||||
__m128i control = _mm_setr_epi8(0, 4, 8, 12, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||
|
||||
__m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel);
|
||||
if (all_non_negative) {
|
||||
__m128i prv_states_o = _mm_load_si128((__m128i const*)decisions->prevId);
|
||||
__m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
|
||||
__m128i prv_states = _mm_add_epi32(prv_states_o, prev_offset);
|
||||
__m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
|
||||
__m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control);
|
||||
|
||||
__m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb);
|
||||
|
@ -862,17 +867,32 @@ static INLINE void update_states_avx2(
|
|||
bit_mask = _mm_movemask_epi8(mask);
|
||||
rem_reg_all_lt4 = (bit_mask == 0xFFFF);
|
||||
|
||||
int32_t prv_states_scalar[4];
|
||||
_mm_storeu_si128((__m128i*)prv_states_scalar, prv_states);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
memcpy(state->m_absLevels[state_offset + i], state->m_absLevels[prv_states_scalar[i]], 16 * sizeof(uint8_t));
|
||||
|
||||
__m128i temp_prev_state = _mm_shuffle_epi8(prv_states_o, control);
|
||||
__m256i prev_state_256 = _mm256_castsi128_si256(temp_prev_state);
|
||||
prev_state_256 = _mm256_permute4x64_epi64(prev_state_256, 0);
|
||||
__m256i temp_add = _mm256_setr_epi32(
|
||||
0,
|
||||
0x04040404,
|
||||
0x08080808,
|
||||
0x0c0c0c0c,
|
||||
0,
|
||||
0x04040404,
|
||||
0x08080808,
|
||||
0x0c0c0c0c);
|
||||
prev_state_256 = _mm256_add_epi8(prev_state_256, temp_add);
|
||||
for (int i = 0; i < 64; i += (256 / (8 * sizeof(uint8_t)))) {
|
||||
__m256i data = _mm256_load_si256((__m256i*)&state->m_absLevels[ctxs->m_prev_state_offset >> 2][i]);
|
||||
data = _mm256_shuffle_epi8(data, prev_state_256);
|
||||
_mm256_store_si256((__m256i*)&state->m_absLevels[ctxs->m_curr_state_offset >> 2][i], data);
|
||||
}
|
||||
|
||||
__m256i prev_state_full = _mm256_load_si256((__m256i const*)decisions->prevId);
|
||||
__m256i shuffle_mask = _mm256_setr_epi8(0, 0, 4, 4,8, 8, 12, 12, 0, 0, 4, 4, 8, 8, 12, 12,0, 0, 0, 0,0, 0, 0, 0,16, 16, 16, 16, 16, 16, 16, 16);
|
||||
prev_state_full = _mm256_shuffle_epi8(prev_state_full, shuffle_mask);
|
||||
prev_state_full = _mm256_permute4x64_epi64(prev_state_full, 0);
|
||||
prev_state_full = _mm256_slli_epi16(prev_state_full, 1);
|
||||
__m256i temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25);
|
||||
temp_add = _mm256_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9, 16, 17, 16, 17,16, 17,16, 17, 24, 25,24,25,24,25,24,25);
|
||||
prev_state_full = _mm256_add_epi8(prev_state_full, temp_add);
|
||||
|
||||
for (int i = 0; i < 64; i += (256 / 8 / sizeof(uint16_t))) {
|
||||
|
@ -903,7 +923,7 @@ static INLINE void update_states_avx2(
|
|||
bit_mask = _mm_movemask_epi8(mask);
|
||||
rem_reg_all_lt4 = (bit_mask == 0xFFFF);
|
||||
|
||||
memset(state->m_absLevels[state_offset], 0, 16 * sizeof(uint8_t) * 4);
|
||||
memset(state->m_absLevels[state_offset >> 2], 0, 16 * sizeof(uint8_t) * 4);
|
||||
memset(state->m_ctxInit[state_offset >> 2], 0, 16 * sizeof(uint16_t) * 4);
|
||||
|
||||
}
|
||||
|
@ -922,35 +942,36 @@ static INLINE void update_states_avx2(
|
|||
if (state->m_remRegBins[state_id] >= 4) {
|
||||
state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
|
||||
}
|
||||
memcpy(state->m_absLevels[state_id], state->m_absLevels[prvState], 16 * sizeof(uint8_t));
|
||||
for (int k = 0; k < 16; ++k) {
|
||||
state->m_ctxInit[state_offset >> 2][k * 4 + i] = state->m_ctxInit[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]];
|
||||
}
|
||||
for (int k = 0; k < 16; ++k) {
|
||||
state->m_absLevels[state_offset >> 2][k * 4 + i] = state->m_absLevels[ctxs->m_prev_state_offset >> 2][k * 4 + decisions->prevId[decision_id]];
|
||||
}
|
||||
} else {
|
||||
state->m_numSigSbb[state_id] = 1;
|
||||
state->m_refSbbCtxId[state_id] = -1;
|
||||
int ctxBinSampleRatio = 28;
|
||||
//(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
|
||||
state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
|
||||
memset(state->m_absLevels[state_id], 0, 16 * sizeof(uint8_t));
|
||||
for (int k = 0; k < 16; ++k) {
|
||||
state->m_ctxInit[state_offset >> 2][k * 4 + i] = 0;
|
||||
for (int k = i; k < 64; k += 4) {
|
||||
state->m_ctxInit[state_offset >> 2][k] = 0;
|
||||
state->m_absLevels[state_offset >> 2][k] = 0;
|
||||
}
|
||||
}
|
||||
rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4;
|
||||
rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4;
|
||||
}
|
||||
}
|
||||
uint32_t level_offset = scan_pos & 15;
|
||||
__m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51));
|
||||
uint32_t max_abs_s[4];
|
||||
_mm_storeu_si128((__m128i*)max_abs_s, max_abs);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset + i];
|
||||
levels[level_offset] = max_abs_s[i];
|
||||
}
|
||||
uint32_t level_offset = scan_pos & 15;
|
||||
__m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(51));
|
||||
max_abs = _mm_shuffle_epi8(max_abs, control);
|
||||
uint32_t packed_max_abs = _mm_extract_epi32(max_abs, 0);
|
||||
memcpy(&state->m_absLevels[state_offset >> 2][level_offset * 4], &packed_max_abs,4);
|
||||
|
||||
state->all_gte_four = rem_reg_all_gte_4;
|
||||
state->all_lt_four = rem_reg_all_lt4;
|
||||
|
||||
if (rem_reg_all_gte_4) {
|
||||
const __m128i first_byte = _mm_set1_epi32(0xff);
|
||||
const __m128i ones = _mm_set1_epi32(1);
|
||||
|
@ -961,15 +982,11 @@ static INLINE void update_states_avx2(
|
|||
__m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31));
|
||||
__m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7));
|
||||
|
||||
uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset];
|
||||
uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2];
|
||||
switch (numIPos) {
|
||||
case 5:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int *)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
|
||||
1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4])));
|
||||
__m128i min_arg = _mm_min_epi32(
|
||||
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
|
||||
t
|
||||
|
@ -984,11 +1001,7 @@ static INLINE void update_states_avx2(
|
|||
}
|
||||
case 4:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
|
||||
1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4])));
|
||||
__m128i min_arg = _mm_min_epi32(
|
||||
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
|
||||
t
|
||||
|
@ -1001,11 +1014,7 @@ static INLINE void update_states_avx2(
|
|||
}
|
||||
case 3:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
|
||||
1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4])));
|
||||
__m128i min_arg = _mm_min_epi32(
|
||||
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
|
||||
t
|
||||
|
@ -1018,11 +1027,7 @@ static INLINE void update_states_avx2(
|
|||
}
|
||||
case 2:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
|
||||
1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4])));
|
||||
__m128i min_arg = _mm_min_epi32(
|
||||
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
|
||||
t
|
||||
|
@ -1034,11 +1039,7 @@ static INLINE void update_states_avx2(
|
|||
sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
|
||||
}
|
||||
case 1: {
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
|
||||
1);
|
||||
t = _mm_and_si128(t, first_byte);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4])));
|
||||
__m128i min_arg = _mm_min_epi32(
|
||||
_mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
|
||||
t
|
||||
|
@ -1075,51 +1076,32 @@ static INLINE void update_states_avx2(
|
|||
switch (numIPos) {
|
||||
case 5:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
|
||||
1);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
// Need this to make sure we don't go beyond 255
|
||||
sum_abs = _mm_and_si128(sum_abs, first_byte);
|
||||
sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51));
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
|
||||
1);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
|
||||
1);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
|
||||
1);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 1:
|
||||
{
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
|
||||
1);
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
} break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
sum_abs = _mm_and_si128(sum_abs, first_byte);
|
||||
if (extRiceFlag) {
|
||||
assert(0 && "Not implemented for avx2");
|
||||
} else {
|
||||
|
@ -1138,7 +1120,7 @@ static INLINE void update_states_avx2(
|
|||
|
||||
else if (rem_reg_all_lt4) {
|
||||
const __m128i first_byte = _mm_set1_epi32(0xff);
|
||||
uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset];
|
||||
uint8_t* levels = (uint8_t*)state->m_absLevels[state_offset >> 2];
|
||||
const __m128i last_byte = _mm_set1_epi32(0xff);
|
||||
const uint32_t tinit_offset = MIN(level_offset - 1u, 15u);
|
||||
const __m128i levels_start_offsets = _mm_set_epi32(16 * 3, 16 * 2, 16 * 1, 16 * 0);
|
||||
|
@ -1147,48 +1129,34 @@ static INLINE void update_states_avx2(
|
|||
__m128i sum_abs = _mm_srli_epi32(tinit, 8);
|
||||
sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51));
|
||||
switch (numIPos) {
|
||||
case 5: {
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
|
||||
1);
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
// Need this to make sure we don't go beyond 255
|
||||
sum_abs = _mm_and_si128(sum_abs, first_byte);
|
||||
sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(51));
|
||||
}
|
||||
case 4: {
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
|
||||
1);
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
}
|
||||
case 3: {
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
|
||||
1);
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
}
|
||||
case 2: {
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
|
||||
1);
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
}
|
||||
case 1: {
|
||||
__m128i t = _mm_i32gather_epi32(
|
||||
(int*)levels,
|
||||
_mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
|
||||
1);
|
||||
sum_abs = _mm_add_epi32(sum_abs, t);
|
||||
} break;
|
||||
case 5:
|
||||
{
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[4] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[3] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[2] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[1] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
}
|
||||
case 1:
|
||||
{
|
||||
__m128i t = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(&levels[next_nb_info_ssb.inPos[0] * 4])));
|
||||
sum_abs = _mm_add_epi32(t, sum_abs);
|
||||
} break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
sum_abs = _mm_and_si128(sum_abs, last_byte);
|
||||
if (extRiceFlag) {
|
||||
assert(0 && "Not implemented for avx2");
|
||||
} else {
|
||||
|
@ -1209,14 +1177,14 @@ static INLINE void update_states_avx2(
|
|||
else {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
const int state_id = state_offset + i;
|
||||
uint8_t* levels = (uint8_t*)(state->m_absLevels[state_id]);
|
||||
uint8_t* levels = (uint8_t*)(state->m_absLevels[state_offset >> 2]);
|
||||
if (state->m_remRegBins[state_id] >= 4) {
|
||||
coeff_t tinit = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i];
|
||||
coeff_t sumAbs1 = (tinit >> 3) & 31;
|
||||
coeff_t sumNum = tinit & 7;
|
||||
#define UPDATE(k) \
|
||||
{ \
|
||||
coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
|
||||
coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \
|
||||
sumAbs1 += MIN(4 + (t & 1), t); \
|
||||
sumNum += !!t; \
|
||||
}
|
||||
|
@ -1238,7 +1206,7 @@ static INLINE void update_states_avx2(
|
|||
coeff_t sumAbs = state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i] >> 8;
|
||||
#define UPDATE(k) \
|
||||
{ \
|
||||
coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
|
||||
coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \
|
||||
sumAbs += t; \
|
||||
}
|
||||
switch (numIPos) {
|
||||
|
@ -1260,7 +1228,7 @@ static INLINE void update_states_avx2(
|
|||
coeff_t sumAbs = (state->m_ctxInit[state_offset >> 2][((scan_pos - 1) & 15) * 4 + i]) >> 8;
|
||||
#define UPDATE(k) \
|
||||
{ \
|
||||
coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
|
||||
coeff_t t = levels[next_nb_info_ssb.inPos[k] * 4 + i]; \
|
||||
sumAbs += t; \
|
||||
}
|
||||
switch (numIPos) {
|
||||
|
@ -1345,7 +1313,7 @@ void uvg_dep_quant_decide_and_update_avx2(
|
|||
// for (int k = 0; k < 16; ++k) {
|
||||
// printf(
|
||||
// "%3d ",
|
||||
// ctxs->m_allStates.m_ctxInit[ctxs->m_curr_state_offset / 4][k * 4 + i]);
|
||||
// ctxs->m_allStates.m_absLevels[ctxs->m_curr_state_offset / 4][k * 4 + i]);
|
||||
// }
|
||||
// printf("\n");
|
||||
//}
|
||||
|
|
Loading…
Reference in a new issue