From dfff9a8030f72568e3f2baf507457583a12e2013 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 17 Apr 2023 15:14:35 +0300 Subject: [PATCH] [avx2] Move dep quant stuff to strategies --- CMakeLists.txt | 1 - src/dep_quant.c | 1464 +-------------------- src/dep_quant.h | 123 ++ src/strategies/avx2/depquant-avx2.c | 1389 +++++++++++++++++++ src/strategies/avx2/depquant-avx2.h | 46 + src/strategies/generic/depquant-generic.c | 238 ++++ src/strategies/generic/depquant-generic.h | 50 + src/strategies/strategies-depquant.c | 54 + src/strategies/strategies-depquant.h | 77 ++ src/strategies/strategies-quant.c | 13 +- src/strategyselector.c | 4 + src/strategyselector.h | 2 + 12 files changed, 1997 insertions(+), 1464 deletions(-) create mode 100644 src/strategies/avx2/depquant-avx2.c create mode 100644 src/strategies/avx2/depquant-avx2.h create mode 100644 src/strategies/generic/depquant-generic.c create mode 100644 src/strategies/generic/depquant-generic.h create mode 100644 src/strategies/strategies-depquant.c create mode 100644 src/strategies/strategies-depquant.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6460743b..d8c37bbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,6 @@ target_include_directories(uvg266 PUBLIC src/extras) target_include_directories(uvg266 PUBLIC src/strategies) file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") -file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/dep_quant.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") diff --git a/src/dep_quant.c b/src/dep_quant.c index 39439c40..519e5795 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -39,10 +39,8 @@ #include "transform.h" #include "uvg_math.h" #include "generic/quant-generic.h" -#include - - +#include "strategies-depquant.h" static const int32_t 
g_goRiceBits[4][RICEMAX] = { { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, @@ -56,102 +54,6 @@ static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; -enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; - - - - -typedef struct -{ - uint8_t* sbbFlags; - uint8_t* levels; -} SbbCtx; - - -typedef struct -{ - int32_t absLevel[4]; - int64_t deltaDist[4]; -} PQData; - -typedef struct -{ - int64_t ALIGNED(32) rdCost[8]; - int32_t ALIGNED(32) absLevel[8]; - int32_t ALIGNED(32) prevId[8]; -} Decision; - - -typedef struct -{ - const NbInfoOut* m_nbInfo; - uint32_t m_sbbFlagBits[2][2]; - SbbCtx m_allSbbCtx[8]; - int m_curr_sbb_ctx_offset; - int m_prev_sbb_ctx_offset; - uint8_t sbb_memory[8 * 1024]; - uint8_t level_memory[8* TR_MAX_WIDTH * TR_MAX_WIDTH]; - int num_coeff; -} common_context; - - -typedef struct -{ - int64_t m_rdCost; - uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id - int8_t m_numSigSbb; - int m_remRegBins; - int8_t m_refSbbCtxId; - uint32_t m_sbbFracBits[2]; - uint32_t m_sigFracBits[2]; - int32_t m_coeffFracBits[6]; - int8_t m_goRicePar; - int8_t m_goRiceZero; - int8_t m_stateId; - uint32_t *m_sigFracBitsArray[12]; - int32_t *m_gtxFracBitsArray[21]; - common_context* m_commonCtx; - - unsigned effWidth; - unsigned effHeight; -} depquant_state; - -typedef struct -{ - int64_t ALIGNED(32) m_rdCost[12]; - 
uint16_t ALIGNED(32) m_absLevelsAndCtxInit[12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id - int8_t ALIGNED(16) m_numSigSbb[12]; - int ALIGNED(32) m_remRegBins[12]; - int8_t ALIGNED(16) m_refSbbCtxId[12]; - uint32_t ALIGNED(32) m_sbbFracBits[12][2]; - uint32_t ALIGNED(32) m_sigFracBits[12][2]; - int32_t ALIGNED(32) m_coeffFracBits[12][6]; - int8_t ALIGNED(16) m_goRicePar[12]; - int8_t ALIGNED(16) m_goRiceZero[12]; - int8_t ALIGNED(16) m_stateId[12]; - uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; - int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; - common_context* m_commonCtx; - - unsigned effWidth; - unsigned effHeight; - - bool all_gte_four; - bool all_lt_four; -} all_depquant_states; - -typedef struct -{ - common_context m_common_context; - all_depquant_states m_allStates; - int m_curr_state_offset; - int m_prev_state_offset; - int m_skip_state_offset; - depquant_state m_startState; - quant_block* m_quant; - Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; -} context_store; - int uvg_init_nb_info(encoder_control_t * encoder) { memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray)); @@ -556,326 +458,8 @@ static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2] state->m_sbbFracBits[1] = 0; } -static INLINE void checkRdCostSkipSbbZeroOut( - Decision* decision, - const all_depquant_states* const state, - int decision_id, - int skip_offset) { - int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; - decision->rdCost[decision_id] = rdCost; - decision->absLevel[decision_id] = 0; - decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; -} - - -static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) -{ - int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; - int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; - int64_t temp_rd_cost_z[4] = 
{0, 0, 0, 0}; - - __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); - __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); - - __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); - __m256i rd_cost_b = rd_cost_a; - __m256i rd_cost_z = rd_cost_a; - - rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); - rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); - - - if (state->all_gte_four) { - if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); - __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); - __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); - rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); - } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { - __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); - - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); - __m128i t = _mm_slli_epi32(value, 1); - offsets = _mm_sub_epi32(offsets, t); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - - __m128i max_rice = _mm_set1_epi32(31); - value = _mm_min_epi32(value, max_rice); - __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); - go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); - value = _mm_add_epi32(value, go_rice_tab); - - __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, 
_mm256_cvtepi32_epi64(temp)); - } else { - const int pqAs[4] = {0, 0, 3, 3}; - ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - const int pqA = pqAs[i]; - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - if (pqDataA->absLevel[pqA] < 4) { - rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - } - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); - } - - if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); - rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); - } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { - __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); - - __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); - __m128i t = _mm_slli_epi32(value, 1); - offsets = _mm_sub_epi32(offsets, t); - __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); - - __m128i max_rice = _mm_set1_epi32(31); - value = _mm_min_epi32(value, max_rice); - __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); - go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); - value = _mm_add_epi32(value, go_rice_tab); - - __m128i temp = 
_mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); - rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); - } else { - const int pqBs[4] = {2, 2, 1, 1}; - int64_t rd_costs[4] = {0, 0, 0, 0}; - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - const int pqB = pqBs[i]; - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - if (pqDataA->absLevel[pqB] < 4) { - rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - } - rd_cost_b = - _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); - } - - if (spt == SCAN_ISCSBB) { - __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); - rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); - } else if (spt == SCAN_SOCSBB) { - __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i m_sigFracBits_0 = 
_mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - - original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); - odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - - - rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); - rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); - rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); - - rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); - rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1); - rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); - } - else { - if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { - __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); - __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); - __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); - __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); - __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); - __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); - __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); - rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); - rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); - rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); - } - else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { - rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); - } - - else { - const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; - _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a); - _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b); - 
_mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z); - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - if (state->m_numSigSbb[state_offset]) { - temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; - temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; - temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; - } else { - temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; - } - } - rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); - rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); - rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); - } - } - } else if (state->all_lt_four) { - __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); - __m128i max_rice = _mm_set1_epi32(31); - __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); - // RD cost A - { - __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); - __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); - - __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); - - __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); - - __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - - - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); - go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); - - __m128i offsets = _mm_add_epi32(selected, go_rice_offset); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); - __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); - - rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); - } - // RD cost b - { - __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); - __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); - - __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); - - __m128i other = _mm_sub_epi32(pq_abs_b, 
_mm_set1_epi32(1)); - - __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); - - - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); - go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); - - __m128i offsets = _mm_add_epi32(selected, go_rice_offset); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); - __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); - - rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); - } - // RD cost Z - { - __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); - go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); - - go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); - __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); - rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); - } - } else { - const int pqAs[4] = {0, 0, 3, 3}; - const int pqBs[4] = {2, 2, 1, 1}; - const int decision_a[4] = {0, 2, 1, 3}; - for (int i = 0; i < 4; i++) { - const int state_offset = start + i; - const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; - const int pqA = pqAs[i]; - const int pqB = pqBs[i]; - int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; - int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; - int64_t rdCostZ = state->m_rdCost[state_offset]; - if (state->m_remRegBins[state_offset] >= 4) { - if (pqDataA->absLevel[pqA] < 4) { - rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; - } - if (pqDataA->absLevel[pqB] < 4) { - rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; - } else { - const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; - } - if (spt == SCAN_ISCSBB) { - rdCostA += state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sigFracBits[state_offset][0]; - } else if (spt == SCAN_SOCSBB) { - rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; - } else if (state->m_numSigSbb[state_offset]) { - rdCostA += state->m_sigFracBits[state_offset][1]; - rdCostB += state->m_sigFracBits[state_offset][1]; - rdCostZ += state->m_sigFracBits[state_offset][0]; - } else { - rdCostZ = decisions->rdCost[decision_a[i]]; - } - } else { - rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; - rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? 
pqDataA->absLevel[pqB] : RICEMAX - 1)]; - rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; - } - temp_rd_cost_a[i] = rdCostA; - temp_rd_cost_b[i] = rdCostB; - temp_rd_cost_z[i] = rdCostZ; - } - rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); - rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); - rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); - } - rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); - rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); - rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); - __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); - - __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); - __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); - __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); - __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); - - __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); - __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); - __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); - - __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); - __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); - __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); - - __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision); - __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision); - __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision); - - __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second); - __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, 
final_decision); - __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); - - _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); - final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); - _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); -} - - -static void checkRdCosts( +void uvg_dep_quant_check_rd_costs( const all_depquant_states * const state, const enum ScanPosType spt, const PQData * pqDataA, @@ -950,107 +534,6 @@ static void checkRdCosts( } } -static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) -{ - int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; - if (rdCost < decisions->rdCost[decision_id]) - { - decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = 0; - decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; - } -} - -static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int - decision_id) -{ - int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; - if (pqData->absLevel[decision_id] < 4) { - rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; - } - else { - const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; - rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] - + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; - } - if (rdCost < decisions->rdCost[decision_id]) { - decisions->rdCost[decision_id] = rdCost; - decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; - decisions->prevId[decision_id] = -1; - } -} - - -static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) -{ - int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; - coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); - int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; - int index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; - scaledAdd += qp->m_DistStepAdd; - index = qIdx & 3; - pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; - pqData->absLevel[index] = (++qIdx) >> 1; -} - - -static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, - .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; - - -static void xDecide( - all_depquant_states* const all_states, - depquant_state* const m_startState, - quant_block * qp, - const enum ScanPosType spt, - const coeff_t absCoeff, - const int lastOffset, - Decision* decisions, - bool zeroOut, - coeff_t quanCoeff, - const int skip_offset, - const int prev_offset) -{ - memcpy(decisions, &startDec, sizeof(Decision)); - - if (zeroOut) { - if (spt == SCAN_EOCSBB) { - 
checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); - checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); - } - return; - } - - PQData pqData; - preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); - check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); - //checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); - //checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); - //checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); - //checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); - if (spt == SCAN_EOCSBB) { - checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); - checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); - } - - checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); - checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); -} - static INLINE unsigned templateAbsCompare(coeff_t sum) { @@ -1146,354 +629,9 @@ static INLINE void update_common_context( memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t)); } -static INLINE void updateStateEOS( - context_store* ctxs, - const uint32_t scan_pos, - const uint32_t cg_pos, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const uint32_t next_sbb_right, - const uint32_t next_sbb_below, - const Decision* decisions, - int decision_id); - -static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, - const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, - const uint32_t width_in_sbb, const uint32_t height_in_sbb, - const uint32_t 
next_sbb_right, const uint32_t next_sbb_below, - const Decision* decisions) -{ - all_depquant_states* state = &ctxs->m_allStates; - bool all_above_minus_two = true; - bool all_between_zero_and_three = true; - bool all_above_four = true; - - - int state_offset = ctxs->m_curr_state_offset; - __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); - _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); - for (int i = 0; i < 4; ++i) { - all_above_minus_two &= decisions->prevId[i] > -2; - all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; - all_above_four &= decisions->prevId[i] >= 4; - } - if (all_above_minus_two) { - bool all_have_previous_state = true; - __m128i prev_state; - __m128i prev_state_no_offset; - __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); - if (all_above_four) { - prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); - prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); - prev_state = _mm_add_epi32( - prev_state, - prev_state_no_offset - ); - memset(&state->m_numSigSbb[state_offset], 0, 4); - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); - } - } else if (all_between_zero_and_three) { - prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); - prev_state = _mm_add_epi32( - prev_state_no_offset, - _mm_load_si128((const __m128i*)decisions->prevId) - ); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); - __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); - num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); - num_sig_sbb = _mm_add_epi32( - num_sig_sbb, - _mm_min_epi32(abs_level, _mm_set1_epi32(1)) - ); - - num_sig_sbb = 
_mm_shuffle_epi8(num_sig_sbb, control); - int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); - memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); - - int32_t prev_state_scalar[4]; - _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); - } - } else { - int prev_state_s[4] = {-1, -1, -1, -1}; - for (int i = 0; i < 4; ++i) { - const int decision_id = i; - const int curr_state_offset = state_offset + i; - if (decisions->prevId[decision_id] >= 4) { - prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); - state->m_numSigSbb[curr_state_offset] = 0; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); - } else if (decisions->prevId[decision_id] >= 0) { - prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; - memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); - } else { - state->m_numSigSbb[curr_state_offset] = 1; - memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); - all_have_previous_state = false; - } - } - prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); - } - uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } - - // Update common context - __m128i last; - { - const uint32_t numSbb = width_in_sbb * height_in_sbb; - common_context* cc = &ctxs->m_common_context; - size_t setCpSize = 
cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); - int previous_state_array[4]; - _mm_storeu_si128((__m128i*)previous_state_array, prev_state); - for (int curr_state = 0; curr_state < 4; ++curr_state) { - uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; - uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; - const int p_state = previous_state_array[curr_state]; - if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { - const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; - memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); - memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); - } else { - memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); - memset(levels + scan_pos, 0, setCpSize); - } - sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; - memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); - } - - __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); - __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); - __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); - __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); - - __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); - __m128i sbb_below = next_sbb_below ? 
_mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); - - __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); - sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); - sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); - __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); - _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); - - memset(&state->m_numSigSbb[state_offset], 0, 4); - memset(&state->m_goRicePar[state_offset], 0, 4); - - uint8_t states[4] = {0, 1, 2, 3}; - memcpy(&state->m_refSbbCtxId[state_offset], states, 4); - if (all_have_previous_state) { - __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); - _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); - } else { - const int temp = (state->effWidth * state->effHeight * 28) / 16; - for (int i = 0; i < 4; ++i) { - if (previous_state_array[i] != -1) { - state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; - } else { - state->m_remRegBins[i + state_offset] = temp; - } - } - } - - const int scanBeg = scan_pos - 16; - const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; - const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; - - __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); - __m128i first_byte = _mm_set1_epi32(0xff); - __m128i ones = _mm_set1_epi32(1); - __m128i fours = _mm_set1_epi32(4); - __m256i all[4]; - uint64_t temp[4]; - const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, - 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); - - for (int id = 0; id < 16; id++, nbOut++) { - if (nbOut->num == 0) { - temp[id % 4] = 0; - if (id % 4 == 3) { - all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); - all[id / 4] = 
_mm256_shuffle_epi8(all[id / 4], v_shuffle); - } - continue; - } - __m128i sum_abs = _mm_set1_epi32(0); - __m128i sum_abs_1 = _mm_set1_epi32(0); - __m128i sum_num = _mm_set1_epi32(0); - switch (nbOut->num) { - case 5: - { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); - __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones) - ) - ); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 4: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 3: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 2: { - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - case 1: 
{ - __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); - __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); - t = _mm_and_si128(t, first_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); - __m128i min_t = _mm_min_epi32( - t, - _mm_add_epi32( - fours, - _mm_and_si128(t, ones))); - sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); - } - break; - default: - assert(0); - } - sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); - sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); - __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); - template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); - __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); - __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); - temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); - if (id % 4 == 3) { - all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); - all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); - last = template_ctx_init; - } - } - - __m256i* v_src_tmp = all; - - __m256i v_tmp[4]; - v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); - v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); - v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); - v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); - - __m256i v_tmp16_lo[2]; - __m256i v_tmp16_hi[2]; - v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); - v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); - v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); - v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); - - v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], 
_MM_SHUFFLE(3, 1, 2, 0)); - v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); - - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); - _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); - - for (int i = 0; i < 4; ++i) { - memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); - } - } - - __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); - __m128i sum_abs1 = _mm_and_si128( - _mm_srli_epi32(last, 3), - _mm_set1_epi32(31)); - - __m128i sum_abs_min = _mm_min_epi32( - _mm_set1_epi32(3), - _mm_srli_epi32( - _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), - 1)); - - __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); - offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); - offsets = _mm_add_epi32(offsets, sum_abs_min); - __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); - _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); - __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); - __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); - uint32_t sum_gt1_s[4]; - _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); - } - } - else { - for (int i = 0; i < 4; i++) { - updateStateEOS( - ctxs, - scan_pos, - cg_pos, - sigCtxOffsetNext, - gtxCtxOffsetNext, - width_in_sbb, - height_in_sbb, - 
next_sbb_right, - next_sbb_below, - decisions, - i); - } - } -} - - -static INLINE void updateStateEOS( +void uvg_dep_quant_update_state_eos( context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, @@ -1542,542 +680,9 @@ static INLINE void updateStateEOS( state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); } } -static INLINE void updateState( - context_store* ctxs, - int numIPos, - const uint32_t scan_pos, - const Decision* decisions, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const NbInfoSbb next_nb_info_ssb, - const int baseLevel, - const bool extRiceFlag, - int decision_id); - -static INLINE void update_states_avx2( - context_store* ctxs, - int numIPos, - const uint32_t scan_pos, - const Decision* decisions, - const uint32_t sigCtxOffsetNext, - const uint32_t gtxCtxOffsetNext, - const NbInfoSbb next_nb_info_ssb, - const int baseLevel, - const bool extRiceFlag) -{ - all_depquant_states* state = &ctxs->m_allStates; - - bool all_non_negative = true; - bool all_above_minus_two = true; - bool all_minus_one = true; - for (int i = 0; i < 4; ++i) { - all_non_negative &= decisions->prevId[i] >= 0; - all_above_minus_two &= decisions->prevId[i] > -2; - all_minus_one &= decisions->prevId[i] == -1; - } - int state_offset = ctxs->m_curr_state_offset; - __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); - _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); - if (all_above_minus_two) { - - bool rem_reg_all_gte_4 = true; - bool rem_reg_all_lt4 = true; - - __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel); - if (all_non_negative) { - __m128i prv_states = _mm_load_si128((__m128i const*)decisions->prevId); - __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); - prv_states = _mm_add_epi32(prv_states, prev_offset); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1); - __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); - - __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb); - sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states); - __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1)); - has_coeff = _mm_shuffle_epi8(has_coeff, control); - sig_sbb = _mm_or_si128(sig_sbb, has_coeff); - int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0); - memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); - - __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId); - ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states); - int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0); - memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4); - - __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar); - go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states); - int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - - - __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); - _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits); - - __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4); - __m128i ones = _mm_set1_epi32(1); - rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones); - - __m128i reg_bins_sub = _mm_set1_epi32(0); - __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)); - __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two); - - __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); - reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); - rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub); - _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins); - - __m128i mask = 
_mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); - int bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_gte_4 = (bit_mask == 0xFFFF); - mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); - bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_lt4 = (bit_mask == 0xFFFF); - - int32_t prv_states_scalar[4]; - _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); - } - } - else if (all_minus_one) { - memset(&state->m_numSigSbb[state_offset], 1, 4); - memset(&state->m_refSbbCtxId[state_offset], -1, 4); - - const int a = (state->effWidth * state->effHeight * 28) / 16; - - __m128i rem_reg_bins = _mm_set1_epi32(a); - __m128i sub = _mm_blendv_epi8( - _mm_set1_epi32(3), - abs_level, - _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2)) - ); - rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub); - _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); - - __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); - int bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_gte_4 = (bit_mask == 0xFFFF); - mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4)); - bit_mask = _mm_movemask_epi8(mask); - rem_reg_all_lt4 = (bit_mask == 0xFFFF); - - memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4); - - } - else { - for (int i = 0; i< 4; ++i) { - const int decision_id = i; - const int state_id = state_offset + i; - if (decisions->prevId[decision_id] >= 0) { - const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; - state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id]; - state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState]; - state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0]; - state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1]; - 
state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1; - state->m_goRicePar[state_id] = state->m_goRicePar[prvState]; - if (state->m_remRegBins[state_id] >= 4) { - state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - } - memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t)); - } else { - state->m_numSigSbb[state_id] = 1; - state->m_refSbbCtxId[state_id] = -1; - int ctxBinSampleRatio = 28; - //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA; - state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3); - memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t)); - } - rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4; - rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4; - } - } - uint32_t level_offset = scan_pos & 15; - __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); - uint32_t max_abs_s[4]; - _mm_storeu_si128((__m128i*)max_abs_s, max_abs); - for (int i = 0; i < 4; ++i) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; - levels[level_offset] = max_abs_s[i]; - } - state->all_gte_four = rem_reg_all_gte_4; - state->all_lt_four = rem_reg_all_lt4; - if (rem_reg_all_gte_4) { - const __m128i first_two_bytes = _mm_set1_epi32(0xffff); - const __m128i first_byte = _mm_set1_epi32(0xff); - const __m128i ones = _mm_set1_epi32(1); - const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; - const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); - const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); - __m128i tinit = _mm_i32gather_epi32( - (int *)state->m_absLevelsAndCtxInit[state_offset], - 
_mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), - 2); - tinit = _mm_and_si128(tinit, first_two_bytes); - __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31)); - __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7)); - - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; - switch (numIPos) { - case 5: - { - __m128i t = _mm_i32gather_epi32( - (int *)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 4: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 3: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 2: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = 
_mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } - case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - t = _mm_and_si128(t, first_byte); - __m128i min_arg = _mm_min_epi32( - _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)), - t - ); - sum_abs1 = _mm_add_epi32( - sum_abs1, - min_arg - ); - sum_num = _mm_add_epi32( - sum_num, - _mm_min_epi32(_mm_and_si128(t, first_byte), ones)); - } break; - default: - assert(0); - } - __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); - __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); - offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); - __m128i temp = _mm_min_epi32( - _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1), - _mm_set1_epi32(3)); - offsets = _mm_add_epi32(offsets, temp); - __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8); - _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); - - sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); - sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext)); - uint32_t sum_gt1_s[4]; - _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1); - for (int i = 0; i < 4; ++i) { - memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0])); - } - - __m128i sum_abs = _mm_srli_epi32(tinit, 8); - sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32)); - switch (numIPos) { - case 5: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 4: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 
1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 3: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 2: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } - case 1: - { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - sum_abs = _mm_add_epi32(t, sum_abs); - } break; - default: - assert(0); - } - sum_abs = _mm_and_si128(sum_abs, first_byte); - if (extRiceFlag) { - assert(0 && "Not implemented for avx2"); - } else { - __m128i sum_all = _mm_max_epi32( - _mm_min_epi32( - _mm_set1_epi32(31), - _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))), - _mm_set1_epi32(0)); - __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i go_rice_par = _mm_shuffle_epi8(temp, control); - int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - } - } - - else if (rem_reg_all_lt4) { - uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset]; - const __m128i last_two_bytes = _mm_set1_epi32(0xffff); - const __m128i last_byte = _mm_set1_epi32(0xff); - const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; - const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); - const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); - __m128i tinit = _mm_i32gather_epi32( - (int*)state->m_absLevelsAndCtxInit[state_offset], - _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)), - 2); - tinit = _mm_and_si128(tinit, last_two_bytes); - __m128i sum_abs = _mm_srli_epi32(tinit, 8); - switch 
(numIPos) { - case 5: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 4: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 3: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 2: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } - case 1: { - __m128i t = _mm_i32gather_epi32( - (int*)levels, - _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])), - 1); - t = _mm_and_si128(t, last_byte); - sum_abs = _mm_add_epi32(sum_abs, t); - } break; - default: - assert(0); - } - if (extRiceFlag) { - assert(0 && "Not implemented for avx2"); - } else { - __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs); - __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4); - __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - __m128i go_rice_par = _mm_shuffle_epi8(temp, control); - int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0); - memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4); - - - for (int i = 0; i < 4; ++i) { - state->m_goRiceZero[state_offset + i] = (i < 2 ? 
1 : 2) << state->m_goRicePar[state_offset + i]; - - } - - } - - } - else { - for (int i = 0; i < 4; ++i) { - const int state_id = state_offset + i; - uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]); - if (state->m_remRegBins[state_id] >= 4) { - coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]; - coeff_t sumAbs1 = (tinit >> 3) & 31; - coeff_t sumNum = tinit & 7; -#define UPDATE(k) \ - { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ - sumAbs1 += MIN(4 + (t & 1), t); \ - sumNum += !!t; \ - } - switch (numIPos) { - case 5: UPDATE(4); - case 4: UPDATE(3); - case 3: UPDATE(2); - case 2: UPDATE(1); - case 1: UPDATE(0); break; - default: assert(0); - } -#undef UPDATE - coeff_t sumGt1 = sumAbs1 - sumNum; - state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0]; - state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1]; - memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? 
sumGt1 : 4)], sizeof(state->m_coeffFracBits[0])); - coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8; -#define UPDATE(k) \ - { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ - sumAbs += t; \ - } - switch (numIPos) { - case 5: UPDATE(4); - case 4: UPDATE(3); - case 3: UPDATE(2); - case 2: UPDATE(1); - case 1: UPDATE(0); break; - default: assert(0); - } -#undef UPDATE - if (extRiceFlag) { - unsigned currentShift = templateAbsCompare(sumAbs); - sumAbs = sumAbs >> currentShift; - int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; - state->m_goRicePar[state_id] += currentShift; - } else { - int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll]; - } - } else { - coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8; -#define UPDATE(k) \ - { \ - coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \ - sumAbs += t; \ - } - switch (numIPos) { - case 5: UPDATE(4); - case 4: UPDATE(3); - case 3: UPDATE(2); - case 2: UPDATE(1); - case 1: UPDATE(0); break; - default: assert(0); - } -#undef UPDATE - if (extRiceFlag) { - unsigned currentShift = templateAbsCompare(sumAbs); - sumAbs = sumAbs >> currentShift; - sumAbs = MIN(31, sumAbs); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; - state->m_goRicePar[state_id] += currentShift; - } else { - sumAbs = MIN(31, sumAbs); - state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs]; - } - state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 
1 : 2) << state->m_goRicePar[state_id]; - } - } - } - } else { - for (int i = 0; i < 4; ++i) { - state->all_gte_four = true; - state->all_lt_four = true; - updateState( - ctxs, - numIPos, - scan_pos, - decisions, - sigCtxOffsetNext, - gtxCtxOffsetNext, - next_nb_info_ssb, - baseLevel, - extRiceFlag, - i); - } - } -} - - -static INLINE void updateState( +void uvg_dep_quant_update_state( context_store * ctxs, int numIPos, const uint32_t scan_pos, @@ -2090,7 +695,7 @@ static INLINE void updateState( int decision_id) { all_depquant_states* state = &ctxs->m_allStates; int state_id = ctxs->m_curr_state_offset + decision_id; - // state->m_rdCost[state_id] = decisions->rdCost[decision_id]; + state->m_rdCost[state_id] = decisions->rdCost[decision_id]; if (decisions->prevId[decision_id] > -2) { if (decisions->prevId[decision_id] >= 0) { const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; @@ -2200,61 +805,6 @@ static INLINE void updateState( } static bool same[13]; -static void xDecideAndUpdate( - rate_estimator_t* re, - context_store* ctxs, - struct dep_quant_scan_info const* const scan_info, - const coeff_t absCoeff, - const uint32_t scan_pos, - const uint32_t width_in_sbb, - const uint32_t height_in_sbb, - const NbInfoSbb next_nb_info_ssb, - bool zeroOut, - coeff_t quantCoeff, - const uint32_t effWidth, - const uint32_t effHeight, - bool is_chroma) -{ - Decision* decisions = &ctxs->m_trellis[scan_pos]; - SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); - - enum ScanPosType spt = 0; - if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) - { - spt = SCAN_SOCSBB; - } - else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) - { - spt = SCAN_EOCSBB; - } - - xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, 
ctxs->m_prev_state_offset); - - if (scan_pos) { - if (!(scan_pos & 15)) { - SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); - update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); - //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); - memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); - memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); - memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); - } else if (!zeroOut) { - update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); - /* updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, 
false, 1); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2); - updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3);*/ - } - - if (spt == SCAN_SOCSBB) { - SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); - } - } -} - int uvg_dep_quant( const encoder_state_t* const state, @@ -2419,7 +969,7 @@ int uvg_dep_quant( if (enableScalingLists) { init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]); - xDecideAndUpdate( + uvg_dep_quant_decide_and_update( rate_estimator, ctxs, scan_info, @@ -2436,7 +986,7 @@ int uvg_dep_quant( ); //tu.cu->slice->getReverseLastSigCoeffFlag()); } else { - xDecideAndUpdate( + uvg_dep_quant_decide_and_update( rate_estimator, ctxs, scan_info, diff --git a/src/dep_quant.h b/src/dep_quant.h index ebb54d31..676d1bab 100644 --- a/src/dep_quant.h +++ b/src/dep_quant.h @@ -46,6 +46,8 @@ typedef struct encoder_control_t encoder_control_t; +enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 }; + struct dep_quant_scan_info { uint8_t sig_ctx_offset[2]; @@ -97,6 +99,91 @@ typedef struct uint16_t outPos[5]; } NbInfoOut; +typedef struct { + int32_t absLevel[4]; + int64_t deltaDist[4]; +} PQData; + +typedef struct { + int64_t ALIGNED(32) rdCost[8]; + int32_t ALIGNED(32) absLevel[8]; + int32_t ALIGNED(32) prevId[8]; +} Decision; + + +typedef struct { + uint8_t* sbbFlags; + uint8_t* levels; +} SbbCtx; + +typedef struct { + const NbInfoOut* m_nbInfo; + uint32_t m_sbbFlagBits[2][2]; + SbbCtx m_allSbbCtx[8]; + int m_curr_sbb_ctx_offset; + int m_prev_sbb_ctx_offset; + uint8_t sbb_memory[8 * 1024]; + uint8_t level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH]; + int num_coeff; +} common_context; + + +typedef struct { + int64_t m_rdCost; + uint16_t m_absLevelsAndCtxInit + [24]; // 16x8bit for abs 
levels + 16x16bit for ctx init id + int8_t m_numSigSbb; + int m_remRegBins; + int8_t m_refSbbCtxId; + uint32_t m_sbbFracBits[2]; + uint32_t m_sigFracBits[2]; + int32_t m_coeffFracBits[6]; + int8_t m_goRicePar; + int8_t m_goRiceZero; + int8_t m_stateId; + uint32_t* m_sigFracBitsArray[12]; + int32_t* m_gtxFracBitsArray[21]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; +} depquant_state; +typedef struct { + int64_t ALIGNED(32) m_rdCost[12]; + uint16_t ALIGNED(32) m_absLevelsAndCtxInit + [12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id + int8_t ALIGNED(16) m_numSigSbb[12]; + int ALIGNED(32) m_remRegBins[12]; + int8_t ALIGNED(16) m_refSbbCtxId[12]; + uint32_t ALIGNED(32) m_sbbFracBits[12][2]; + uint32_t ALIGNED(32) m_sigFracBits[12][2]; + int32_t ALIGNED(32) m_coeffFracBits[12][6]; + int8_t ALIGNED(16) m_goRicePar[12]; + int8_t ALIGNED(16) m_goRiceZero[12]; + int8_t ALIGNED(16) m_stateId[12]; + uint32_t ALIGNED(32) m_sigFracBitsArray[12][12][2]; + int32_t ALIGNED(32) m_gtxFracBitsArray[21][6]; + common_context* m_commonCtx; + + unsigned effWidth; + unsigned effHeight; + + bool all_gte_four; + bool all_lt_four; +} all_depquant_states; + +typedef struct { + common_context m_common_context; + all_depquant_states m_allStates; + int m_curr_state_offset; + int m_prev_state_offset; + int m_skip_state_offset; + depquant_state m_startState; + quant_block* m_quant; + Decision m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH]; +} context_store; + + int uvg_init_nb_info(encoder_control_t* encoder); void uvg_dealloc_nb_info(encoder_control_t* encoder); @@ -122,4 +209,40 @@ int uvg_dep_quant( enum uvg_tree_type tree_type, int* absSum, const bool enableScalingLists); + + +void uvg_dep_quant_update_state( + context_store* ctxs, + int numIPos, + const uint32_t scan_pos, + const Decision* decisions, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const NbInfoSbb next_nb_info_ssb, + const int baseLevel, + const bool extRiceFlag, + 
int decision_id); + + +void uvg_dep_quant_update_state_eos( + context_store* ctxs, + const uint32_t scan_pos, + const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, + const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const uint32_t next_sbb_right, + const uint32_t next_sbb_below, + const Decision* decisions, + int decision_id); + +void uvg_dep_quant_check_rd_costs( + const all_depquant_states* const state, + const enum ScanPosType spt, + const PQData* pqDataA, + Decision* decisions, + const int decisionA, + const int decisionB, + const int state_offset); #endif diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c new file mode 100644 index 00000000..86056de4 --- /dev/null +++ b/src/strategies/avx2/depquant-avx2.c @@ -0,0 +1,1389 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +/* +* \file +*/ + +#include "strategies/avx2/depquant-avx2.h" + +#if COMPILE_INTEL_AVX2 && defined X86_64 +#include "dep_quant.h" + +#include +#include "cu.h" +#include "encoderstate.h" +#include "intra.h" +#include "rdo.h" +#include "transform.h" +#include "generic/quant-generic.h" +#include "uvg_math.h" +static const int32_t g_goRiceBits[4][RICEMAX] = { + { 32768, 65536, 98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752}, + { 65536, 65536, 98304, 98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984}, + { 98304, 98304, 98304, 98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 
327680}, + {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376}, +}; + +static const int g_riceT[4] = { 32,128, 512, 2048 }; +static const int g_riceShift[5] = { 0, 2, 4, 6, 8 }; + +static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 }; + +static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) +{ + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + + __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); + __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); + + __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]); + __m256i rd_cost_b = rd_cost_a; + __m256i rd_cost_z = rd_cost_a; + + rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); + rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); + + + if (state->all_gte_four) { + if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); + } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) 
>> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + } + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0])); + } + + if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); + } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4); + + __m128i max_rice = _mm_set1_epi32(31); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqBs[4] = {2, 2, 1, 1}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqB = pqBs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqB] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rd_costs[i] += 
state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + } + rd_cost_b = + _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0])); + } + + if (spt == SCAN_ISCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } else if (spt == SCAN_SOCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + + original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); + odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); + rd_cost_b = 
_mm256_add_epi64(rd_cost_b, m_sigFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); + } + else { + if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } + else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]); + } + + else { + const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3}; + _mm256_storeu_si256((__m256i*)temp_rd_cost_a, rd_cost_a); /* unaligned store: temp_rd_cost_* are plain int64_t[4] locals without ALIGNED(32) */ + _mm256_storeu_si256((__m256i*)temp_rd_cost_b, rd_cost_b); + _mm256_storeu_si256((__m256i*)temp_rd_cost_z, rd_cost_z); + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + if (state->m_numSigSbb[state_offset]) { + temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; + } else { + temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]]; + } + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + } + } else if 
(state->all_lt_four) { + __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); + __m128i max_rice = _mm_set1_epi32(31); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start])); + // RD cost A + { + __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } + // RD cost b + { + __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); + __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } + // RD cost Z + { + __m128i go_rice_offset = 
_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab)); + } + } else { + const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; + const int decision_a[4] = {0, 2, 1, 3}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + const int pqA = pqAs[i]; + const int pqB = pqBs[i]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else { + rdCostZ = decisions->rdCost[decision_a[i]]; + } + } else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? 
pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + temp_rd_cost_a[i] = rdCostA; + temp_rd_cost_b[i] = rdCostB; + temp_rd_cost_z[i] = rdCostZ; + } + rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a); + rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b); + rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z); + } + rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216); + rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141); + rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216); + __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost); + + __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel); + __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId); + __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20); + __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + decision_data = _mm256_permutevar8x32_epi32(decision_data, mask); + + __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]); + __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]); + __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0); + + __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b); + __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b); + __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b); + + __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision); + __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision); + __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision); + + __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second); + __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, 
final_decision); + __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision); + + _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost); + final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data); +} + + +static INLINE void checkRdCostSkipSbbZeroOut( + Decision* decision, + const all_depquant_states* const state, + int decision_id, + int skip_offset) +{ + int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0]; + decision->rdCost[decision_id] = rdCost; + decision->absLevel[decision_id] = 0; + decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; +} + + +static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset) +{ + int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0]; + if (rdCost < decisions->rdCost[decision_id]) + { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = 0; + decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id]; + } +} + +static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int + decision_id) +{ + int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset; + if (pqData->absLevel[decision_id] < 4) { + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]]; + } + else { + const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1; + rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)] + + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? 
value : RICEMAX - 1]; + } + if (rdCost < decisions->rdCost[decision_id]) { + decisions->rdCost[decision_id] = rdCost; + decisions->absLevel[decision_id] = pqData->absLevel[decision_id]; + decisions->prevId[decision_id] = -1; + } +} + +static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff) +{ + int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff; + coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift))); + int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact; + int index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; + scaledAdd += qp->m_DistStepAdd; + index = qIdx & 3; + pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift; + pqData->absLevel[index] = (++qIdx) >> 1; +} + + +static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, + .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; + + +static void xDecide( + all_depquant_states* const all_states, + depquant_state* const m_startState, + quant_block * qp, + const enum ScanPosType spt, + const coeff_t absCoeff, + const int lastOffset, + Decision* decisions, + bool zeroOut, + coeff_t quanCoeff, + const int skip_offset, + const int prev_offset) +{ + memcpy(decisions, &startDec, sizeof(Decision)); + + if (zeroOut) { + if (spt == SCAN_EOCSBB) { + 
checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + //uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos, + const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext, + const uint32_t width_in_sbb, const uint32_t height_in_sbb, + const uint32_t next_sbb_right, const uint32_t next_sbb_below, + const Decision* decisions) +{ + all_depquant_states* state = &ctxs->m_allStates; + bool all_above_minus_two = true; + bool all_between_zero_and_three = true; + bool all_above_four = true; + + + int state_offset = ctxs->m_curr_state_offset; + __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); + _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost); + for (int i = 0; i < 4; ++i) { + all_above_minus_two &= 
decisions->prevId[i] > -2; + all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4; + all_above_four &= decisions->prevId[i] >= 4; + } + if (all_above_minus_two) { + bool all_have_previous_state = true; + __m128i prev_state; + __m128i prev_state_no_offset; + __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel); + if (all_above_four) { + prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset); + prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4)); + prev_state = _mm_add_epi32( + prev_state, + prev_state_no_offset + ); + memset(&state->m_numSigSbb[state_offset], 0, 4); + for (int i = 0; i < 4; ++i) { + memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t)); + } + } else if (all_between_zero_and_three) { + prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset); + prev_state = _mm_add_epi32( + prev_state_no_offset, + _mm_load_si128((const __m128i*)decisions->prevId) + ); + __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00)); + __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb); + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes); + num_sig_sbb = _mm_add_epi32( + num_sig_sbb, + _mm_min_epi32(abs_level, _mm_set1_epi32(1)) + ); + + num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control); + int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0); + memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4); + + int32_t prev_state_scalar[4]; + _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t)); + } + } else { + int prev_state_s[4] = {-1, -1, -1, -1}; + for (int i = 0; i < 4; ++i) { + const int 
decision_id = i; + const int curr_state_offset = state_offset + i; + if (decisions->prevId[decision_id] >= 4) { + prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4); + state->m_numSigSbb[curr_state_offset] = 0; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + } else if (decisions->prevId[decision_id] >= 0) { + prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id]; + state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id]; + memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t)); + } else { + state->m_numSigSbb[curr_state_offset] = 1; + memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t)); + all_have_previous_state = false; + } + } + prev_state = _mm_loadu_si128((__m128i const*)prev_state_s); + } + uint32_t level_offset = scan_pos & 15; + __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32)); + uint32_t max_abs_s[4]; + _mm_storeu_si128((__m128i*)max_abs_s, max_abs); + for (int i = 0; i < 4; ++i) { + uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i]; + levels[level_offset] = max_abs_s[i]; + } + + // Update common context + __m128i last; + { + const uint32_t numSbb = width_in_sbb * height_in_sbb; + common_context* cc = &ctxs->m_common_context; + size_t setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t); + int previous_state_array[4]; + _mm_storeu_si128((__m128i*)previous_state_array, prev_state); + for (int curr_state = 0; curr_state < 4; ++curr_state) { + uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags; + uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels; + const int p_state = previous_state_array[curr_state]; + if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) { + const int prev_sbb = 
cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state]; + memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t)); + memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize); + } else { + memset(sbbFlags, 0, numSbb * sizeof(uint8_t)); + memset(levels + scan_pos, 0, setCpSize); + } + sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset]; + memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t)); + } + + __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0); + __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right); + __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m); + __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0); + + __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below)); + __m128i sbb_below = next_sbb_below ? 
_mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0); + + __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below); + sig_sbb = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff)); + sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1)); + __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8); + _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits); + + memset(&state->m_numSigSbb[state_offset], 0, 4); + memset(&state->m_goRicePar[state_offset], 0, 4); + + uint8_t states[4] = {0, 1, 2, 3}; + memcpy(&state->m_refSbbCtxId[state_offset], states, 4); + if (all_have_previous_state) { + __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4); + _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins); + } else { + const int temp = (state->effWidth * state->effHeight * 28) / 16; + for (int i = 0; i < 4; ++i) { + if (previous_state_array[i] != -1) { + state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]]; + } else { + state->m_remRegBins[i + state_offset] = temp; + } + } + } + + const int scanBeg = scan_pos - 16; + const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg; + const uint8_t* absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg; + + __m128i levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0); + __m128i first_byte = _mm_set1_epi32(0xff); + __m128i ones = _mm_set1_epi32(1); + __m128i fours = _mm_set1_epi32(4); + __m256i all[4]; + uint64_t temp[4]; + const __m256i v_shuffle = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16); + + for (int id = 0; id < 16; id++, nbOut++) { + if (nbOut->num == 0) { + temp[id % 4] = 0; + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + all[id / 4] = 
_mm256_shuffle_epi8(all[id / 4], v_shuffle); + } + continue; + } + __m128i sum_abs = _mm_set1_epi32(0); + __m128i sum_abs_1 = _mm_set1_epi32(0); + __m128i sum_num = _mm_set1_epi32(0); + switch (nbOut->num) { + case 5: + { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4])); + __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones) + ) + ); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 4: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 3: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 2: { + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + case 1: 
{ + __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0])); + __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1); + t = _mm_and_si128(t, first_byte); + sum_abs = _mm_add_epi32(sum_abs, t); + sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones)); + __m128i min_t = _mm_min_epi32( + t, + _mm_add_epi32( + fours, + _mm_and_si128(t, ones))); + sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t); + } + break; + default: + assert(0); + } + sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3); + sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8); + __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs); + template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1); + __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask); + temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0); + if (id % 4 == 3) { + all[id / 4] = _mm256_loadu_si256((__m256i const*)temp); + all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle); + last = template_ctx_init; + } + } + + __m256i* v_src_tmp = all; + + __m256i v_tmp[4]; + v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20); + v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31); + v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20); + v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31); + + __m256i v_tmp16_lo[2]; + __m256i v_tmp16_hi[2]; + v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]); + v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]); + v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]); + + v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], 
_MM_SHUFFLE(3, 1, 2, 0)); + v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8), _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20)); + _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8), _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31)); + + for (int i = 0; i < 4; ++i) { + memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16); + } + } + + __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7)); + __m128i sum_abs1 = _mm_and_si128( + _mm_srli_epi32(last, 3), + _mm_set1_epi32(31)); + + __m128i sum_abs_min = _mm_min_epi32( + _mm_set1_epi32(3), + _mm_srli_epi32( + _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)), + 1)); + + __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); + offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext)); + offsets = _mm_add_epi32(offsets, sum_abs_min); + __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8); + _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits); + + + __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num); + __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4)); + uint32_t sum_gt1_s[4]; + _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1); + for (int i = 0; i < 4; ++i) { + memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0])); + } + } + else { + for (int i = 0; i < 4; i++) { + uvg_dep_quant_update_state_eos( + ctxs, + scan_pos, + cg_pos, + sigCtxOffsetNext, + gtxCtxOffsetNext, + width_in_sbb, + 
height_in_sbb,
+        next_sbb_right,
+        next_sbb_below,
+        decisions,
+        i);
+    }
+  }
+}
+/**
+ * \brief Update the four states at ctxs->m_curr_state_offset from `decisions`, four lanes at a time.
+ *
+ * The four rdCost values are always copied into m_allStates.m_rdCost first.
+ * If no decision has prevId == -2 the state fields are derived here:
+ *  - every prevId >= 0: fields are pulled from the predecessor states
+ *    (m_prev_state_offset + prevId) with byte shuffles and gathers,
+ *  - every prevId == -1: fields are set to fixed start values,
+ *  - a mix of the two: the same updates are done per state in scalar code.
+ * Then min(absLevel, 32) is stored in level slot (scan_pos & 15) of each state and
+ * m_sigFracBits / m_coeffFracBits / m_goRicePar / m_goRiceZero are refreshed, vectorised
+ * when all four m_remRegBins are >= 4 or all four are < 4, scalar otherwise.
+ * If any prevId is -2, every state is handed to uvg_dep_quant_update_state() instead.
+ *
+ * NOTE(review): the 32-bit gathers below address single bytes / 16-bit entries and mask the
+ * result afterwards, so each one touches up to three bytes beyond the addressed element.
+ * Confirm the arrays have enough slack behind the last state.
+ *
+ * \param ctxs              context store; m_prev_state_offset / m_curr_state_offset are read, m_allStates is written
+ * \param numIPos           how many entries of next_nb_info_ssb.inPos are summed (1..5, anything else hits assert(0))
+ * \param scan_pos          scan position; its low four bits select the level slot that is written
+ * \param decisions         rdCost / absLevel / prevId of the four candidates (read with aligned loads)
+ * \param sigCtxOffsetNext  added to the index into m_sigFracBitsArray
+ * \param gtxCtxOffsetNext  added to the index into m_gtxFracBitsArray
+ * \param next_nb_info_ssb  inPos[] byte offsets into a state's level row
+ * \param baseLevel         only forwarded to uvg_dep_quant_update_state()
+ * \param extRiceFlag       must be false on the paths implemented here (true hits assert(0))
+ */
+static INLINE void update_states_avx2(
+  context_store* ctxs,
+  int numIPos,
+  const uint32_t scan_pos,
+  const Decision* decisions,
+  const uint32_t sigCtxOffsetNext,
+  const uint32_t gtxCtxOffsetNext,
+  const NbInfoSbb next_nb_info_ssb,
+  const int baseLevel,
+  const bool extRiceFlag)
+{
+  all_depquant_states* state = &ctxs->m_allStates;
+
+  bool all_non_negative = true;
+  bool all_above_minus_two = true;
+  bool all_minus_one = true;
+  for (int i = 0; i < 4; ++i) {
+    all_non_negative &= decisions->prevId[i] >= 0;
+    all_above_minus_two &= decisions->prevId[i] > -2;
+    all_minus_one &= decisions->prevId[i] == -1;
+  }
+  int state_offset = ctxs->m_curr_state_offset;
+  __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost); /* four 64-bit costs, aligned load */
+  _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
+  if (all_above_minus_two) { /* no decision carries prevId == -2 */
+
+    bool rem_reg_all_gte_4 = true;
+    bool rem_reg_all_lt4 = true;
+
+    __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel);
+    if (all_non_negative) { /* every state continues from a predecessor state */
+      __m128i prv_states = _mm_load_si128((__m128i const*)decisions->prevId);
+      __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
+      prv_states = _mm_add_epi32(prv_states, prev_offset);
+      __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); /* packs the low byte of each 32-bit lane into bytes 0..3, zeroes the rest */
+      __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control); /* predecessor indices as byte-shuffle selectors */
+
+      __m128i sig_sbb = _mm_load_si128((__m128i const*)state->m_numSigSbb);
+      sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states);
+      __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1));
+      has_coeff = _mm_shuffle_epi8(has_coeff, control);
+      sig_sbb = _mm_or_si128(sig_sbb, has_coeff); /* predecessor's value OR min(absLevel, 1) */
+      int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0);
+      memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4); /* only the four packed bytes are written back */
+
+      __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId);
+      ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states);
+      int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0);
+      memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4);
+
+      __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar);
+      go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states);
+      int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+      memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+
+
+      __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8); /* one 64-bit element per predecessor */
+      _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
+
+      __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4);
+      __m128i ones = _mm_set1_epi32(1);
+      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones); /* predecessor's value minus one */
+
+      __m128i reg_bins_sub = _mm_set1_epi32(0);
+      __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2));
+      __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two); /* absLevel if < 2, else 3 */
+
+      __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+      reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four); /* subtract nothing in lanes already below 4 */
+      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub);
+      _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins);
+
+      __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+      int bit_mask = _mm_movemask_epi8(mask);
+      rem_reg_all_gte_4 = (bit_mask == 0xFFFF); /* all 16 byte-mask bits set: every lane > 3 */
+      mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+      bit_mask = _mm_movemask_epi8(mask);
+      rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+      int32_t prv_states_scalar[4];
+      _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states);
+      for (int i = 0; i < 4; ++i) {
+        memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t)); /* copy the predecessor's whole 48-byte row */
+      }
+    }
+    else if (all_minus_one) { /* every prevId == -1: set all four states to fixed start values */
+      memset(&state->m_numSigSbb[state_offset], 1, 4);
+      memset(&state->m_refSbbCtxId[state_offset], -1, 4);
+
+      const int a = (state->effWidth * state->effHeight * 28) / 16;
+
+      __m128i rem_reg_bins = _mm_set1_epi32(a);
+      __m128i sub = _mm_blendv_epi8(
+        _mm_set1_epi32(3),
+        abs_level,
+        _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2))
+      );
+      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub);
+      _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
+
+      __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+      int bit_mask = _mm_movemask_epi8(mask);
+      rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+      mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+      bit_mask = _mm_movemask_epi8(mask);
+      rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+      memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4); /* clears four consecutive 48-byte rows in one call */
+
+    }
+    else { /* prevId >= 0 and prevId == -1 are mixed: same updates, one state at a time */
+      for (int i = 0; i< 4; ++i) {
+        const int decision_id = i;
+        const int state_id = state_offset + i;
+        if (decisions->prevId[decision_id] >= 0) {
+          const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
+          state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id];
+          state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState];
+          state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0];
+          state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1];
+          state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1;
+          state->m_goRicePar[state_id] = state->m_goRicePar[prvState];
+          if (state->m_remRegBins[state_id] >= 4) {
+            state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+          }
+          memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t));
+        } else {
+          state->m_numSigSbb[state_id] = 1;
+          state->m_refSbbCtxId[state_id] = -1;
+          int ctxBinSampleRatio = 28;
+          //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
+          state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+          memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t));
+        }
+        rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4;
+        rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4;
+      }
+    }
+    uint32_t level_offset = scan_pos & 15;
+    __m128i max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
+    uint32_t max_abs_s[4];
+    _mm_storeu_si128((__m128i*)max_abs_s, max_abs);
+    for (int i = 0; i < 4; ++i) {
+      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i];
+      levels[level_offset] = max_abs_s[i]; /* level clamped to 32, stored as one byte */
+    }
+    state->all_gte_four = rem_reg_all_gte_4;
+    state->all_lt_four = rem_reg_all_lt4;
+    if (rem_reg_all_gte_4) { /* all four m_remRegBins >= 4: vectorised context update */
+      const __m128i first_two_bytes = _mm_set1_epi32(0xffff);
+      const __m128i first_byte = _mm_set1_epi32(0xff);
+      const __m128i ones = _mm_set1_epi32(1);
+      const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8; /* unsigned wrap turns level_offset 0 into 15; matches 8 + ((scan_pos - 1) & 15) of the scalar path */
+      const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0); /* byte offset of each state's 48-byte row */
+      const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1); /* the same offsets counted in 16-bit entries */
+      __m128i tinit = _mm_i32gather_epi32(
+        (int *)state->m_absLevelsAndCtxInit[state_offset],
+        _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)),
+        2);
+      tinit = _mm_and_si128(tinit, first_two_bytes); /* keep only the addressed 16-bit entry */
+      __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31));
+      __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7));
+
+      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset];
+      switch (numIPos) { /* cases fall through on purpose: case k adds inPos[k-1] down to inPos[0] */
+        case 5:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int *)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
+              1);
+            t = _mm_and_si128(t, first_byte);
+            __m128i min_arg = _mm_min_epi32(
+              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+              t
+            );
+            sum_abs1 = _mm_add_epi32(
+              sum_abs1,
+              min_arg
+            );
+            sum_num = _mm_add_epi32(
+              sum_num,
+              _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+          }
+        case 4:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
+              1);
+            t = _mm_and_si128(t, first_byte);
+            __m128i min_arg = _mm_min_epi32(
+              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+              t
+            );
+            sum_abs1 = _mm_add_epi32(
+              sum_abs1,
+              min_arg
+            );
+            sum_num = _mm_add_epi32(
+              sum_num,
+              _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+          }
+        case 3:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
+              1);
+            t = _mm_and_si128(t, first_byte);
+            __m128i min_arg = _mm_min_epi32(
+              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+              t
+            );
+            sum_abs1 = _mm_add_epi32(
+              sum_abs1,
+              min_arg
+            );
+            sum_num = _mm_add_epi32(
+              sum_num,
+              _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+          }
+        case 2:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
+              1);
+            t = _mm_and_si128(t, first_byte);
+            __m128i min_arg = _mm_min_epi32(
+              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+              t
+            );
+            sum_abs1 = _mm_add_epi32(
+              sum_abs1,
+              min_arg
+            );
+            sum_num = _mm_add_epi32(
+              sum_num,
+              _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+          }
+        case 1: {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
+              1);
+            t = _mm_and_si128(t, first_byte);
+            __m128i min_arg = _mm_min_epi32(
+              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+              t
+            );
+            sum_abs1 = _mm_add_epi32(
+              sum_abs1,
+              min_arg
+            );
+            sum_num = _mm_add_epi32(
+              sum_num,
+              _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+          } break;
+        default:
+          assert(0);
+      }
+      __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num);
+      __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0); /* 12 64-bit elements between consecutive states' tables */
+      offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
+      __m128i temp = _mm_min_epi32(
+        _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1),
+        _mm_set1_epi32(3));
+      offsets = _mm_add_epi32(offsets, temp); /* + min((sum_abs1 + 1) >> 1, 3) */
+      __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8);
+      _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
+
+      sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
+      sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext));
+      uint32_t sum_gt1_s[4];
+      _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1);
+      for (int i = 0; i < 4; ++i) {
+        memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0]));
+      }
+
+      __m128i sum_abs = _mm_srli_epi32(tinit, 8);
+      sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32)); /* NOTE(review): the scalar path further down does not clamp this value; for stored sums above 32 the two paths can select different g_goRiceParsCoeff entries - confirm which is intended */
+      switch (numIPos) { /* intentional fall-through, as above; lanes are not masked per gather here */
+        case 5:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
+              1);
+            sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 4:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
+              1);
+            sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 3:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
+              1);
+            sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 2:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
+              1);
+            sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 1:
+          {
+            __m128i t = _mm_i32gather_epi32(
+              (int*)levels,
+              _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
+              1);
+            sum_abs = _mm_add_epi32(t, sum_abs);
+          } break;
+        default:
+          assert(0);
+      }
+      sum_abs = _mm_and_si128(sum_abs, first_byte); /* discard the upper bytes that the unmasked 32-bit gathers added in */
+      if (extRiceFlag) {
+        assert(0 && "Not implemented for avx2");
+      } else {
+        __m128i sum_all = _mm_max_epi32(
+          _mm_min_epi32(
+            _mm_set1_epi32(31),
+            _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))),
+          _mm_set1_epi32(0)); /* clamp(sum_abs - 20, 0, 31) */
+        __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4);
+        __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        __m128i go_rice_par = _mm_shuffle_epi8(temp, control);
+        int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+        memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+      }
+    }
+
+    else if (rem_reg_all_lt4) { /* all four m_remRegBins < 4: only m_goRicePar and m_goRiceZero are refreshed */
+      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset];
+      const __m128i last_two_bytes = _mm_set1_epi32(0xffff);
+      const __m128i last_byte = _mm_set1_epi32(0xff);
+      const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8;
+      const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0);
+      const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1);
+      __m128i tinit = _mm_i32gather_epi32(
+        (int*)state->m_absLevelsAndCtxInit[state_offset],
+        _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)),
+        2);
+      tinit = _mm_and_si128(tinit, last_two_bytes);
+      __m128i sum_abs = _mm_srli_epi32(tinit, 8);
+      switch (numIPos) { /* intentional fall-through: case k adds inPos[k-1] down to inPos[0] */
+        case 5: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 4: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 3: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 2: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 1: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        } break;
+        default:
+          assert(0);
+      }
+      if (extRiceFlag) {
+        assert(0 && "Not implemented for avx2");
+      } else {
+        __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs);
+        __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4);
+        __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        __m128i go_rice_par = _mm_shuffle_epi8(temp, control);
+        int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+        memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+
+
+        for (int i = 0; i < 4; ++i) {
+          state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i]; /* the scalar path tests (state_id & 3) < 2; identical when state_offset is a multiple of 4 - TODO confirm */
+
+        }
+
+      }
+
+    }
+    else { /* m_remRegBins >= 4 and < 4 are mixed: scalar update per state */
+      for (int i = 0; i < 4; ++i) {
+        const int state_id = state_offset + i;
+        uint8_t* levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]);
+        if (state->m_remRegBins[state_id] >= 4) {
+          coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)];
+          coeff_t sumAbs1 = (tinit >> 3) & 31;
+          coeff_t sumNum = tinit & 7;
+#define UPDATE(k)                                  \
+  {                                                \
+    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
+    sumAbs1 += MIN(4 + (t & 1), t);                \
+    sumNum += !!t;                                 \
+  }
+          switch (numIPos) { /* intentional fall-through */
+            case 5: UPDATE(4);
+            case 4: UPDATE(3);
+            case 3: UPDATE(2);
+            case 2: UPDATE(1);
+            case 1: UPDATE(0); break;
+            default: assert(0);
+          }
+#undef UPDATE
+          coeff_t sumGt1 = sumAbs1 - sumNum;
+          state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0];
+          state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1];
+          memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0]));
+
+
+          coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8;
+#define UPDATE(k)                                  \
+  {                                                \
+    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
+    sumAbs += t;                                   \
+  }
+          switch (numIPos) { /* intentional fall-through */
+            case 5: UPDATE(4);
+            case 4: UPDATE(3);
+            case 3: UPDATE(2);
+            case 2: UPDATE(1);
+            case 1: UPDATE(0); break;
+            default: assert(0);
+          }
+#undef UPDATE
+          if (extRiceFlag) {
+            assert(0 && "Not implemented for avx2");
+          } else {
+            int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0);
+            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll];
+          }
+        } else {
+          coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8;
+#define UPDATE(k)                                  \
+  {                                                \
+    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
+    sumAbs += t;                                   \
+  }
+          switch (numIPos) { /* intentional fall-through */
+            case 5: UPDATE(4);
+            case 4: UPDATE(3);
+            case 3: UPDATE(2);
+            case 2: UPDATE(1);
+            case 1: UPDATE(0); break;
+            default: assert(0);
+          }
+#undef UPDATE
+          if (extRiceFlag) {
+            assert(0 && "Not implemented for avx2");
+          } else {
+            sumAbs = MIN(31, sumAbs);
+            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs];
+          }
+          state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 1 : 2) << state->m_goRicePar[state_id];
+        }
+      }
+    }
+  } else { /* at least one prevId == -2: defer to the per-state routine for all four states */
+    for (int i = 0; i < 4; ++i) {
+      state->all_gte_four = true; /* NOTE(review): both flags are forced to true on every iteration; whether uvg_dep_quant_update_state() refines them is not visible here - verify */
+      state->all_lt_four = true;
+      uvg_dep_quant_update_state(
+        ctxs,
+        numIPos,
+        scan_pos,
+        decisions,
+        sigCtxOffsetNext,
+        gtxCtxOffsetNext,
+        next_nb_info_ssb,
+        baseLevel,
+        extRiceFlag,
+        i);
+    }
+  }
+}
+/**
+ * \brief AVX2 strategy: take the trellis decision for one scan position and update the four current states.
+ *
+ * Swaps the current and previous state offsets, classifies the position
+ * (SCAN_SOCSBB when (scan_pos & 15) == 15, scan_pos > 16 and scan_pos < effHeight * effWidth - 1;
+ * SCAN_EOCSBB when (scan_pos & 15) == 0, scan_pos > 0 and scan_pos < effHeight * effWidth - 16;
+ * 0 otherwise) and calls xDecide() to fill ctxs->m_trellis[scan_pos]. For scan_pos != 0 the states
+ * are then updated with update_state_eos_avx2() when scan_pos is a multiple of 16, or with
+ * update_states_avx2() otherwise unless zeroOut is set.
+ *
+ * \param re                rate estimator; m_lastBitsX[pos_x] + m_lastBitsY[pos_y] is passed to xDecide()
+ * \param ctxs              context store holding the trellis, the states and the offsets swapped here
+ * \param scan_info         supplies pos_x, pos_y, cg_pos, sig_ctx_offset[], gtx_ctx_offset[], next_sbb_right, next_sbb_below
+ * \param absCoeff          forwarded to xDecide()
+ * \param scan_pos          index into ctxs->m_trellis and input of the classification above
+ * \param width_in_sbb      forwarded to update_state_eos_avx2()
+ * \param height_in_sbb     forwarded to update_state_eos_avx2()
+ * \param next_nb_info_ssb  its .num and the struct itself are passed to update_states_avx2()
+ * \param zeroOut           forwarded to xDecide(); when set, update_states_avx2() is skipped
+ * \param quantCoeff        forwarded to xDecide()
+ * \param effWidth          used only in the position classification
+ * \param effHeight         used only in the position classification
+ * \param is_chroma         index into scan_info->sig_ctx_offset[] and gtx_ctx_offset[]
+ */
+void uvg_dep_quant_decide_and_update_avx2(
+  rate_estimator_t*                        re,
+  context_store*                           ctxs,
+  struct dep_quant_scan_info const* const  scan_info,
+  const coeff_t                            absCoeff,
+  const uint32_t                           scan_pos,
+  const uint32_t                           width_in_sbb,
+  const uint32_t                           height_in_sbb,
+  const NbInfoSbb                          next_nb_info_ssb,
+  bool                                     zeroOut,
+  coeff_t                                  quantCoeff,
+  const uint32_t                           effWidth,
+  const uint32_t                           effHeight,
+  bool                                     is_chroma)
+{
+  Decision* decisions = &ctxs->m_trellis[scan_pos];
+  SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int);
+
+  enum ScanPosType spt = 0;
+  if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1)
+  {
+    spt = SCAN_SOCSBB;
+  }
+  else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16)
+  {
+    spt = SCAN_EOCSBB;
+  }
+
+  xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset);
+
+  if (scan_pos) {
+    if (!(scan_pos & 15)) { /* scan_pos is a multiple of 16 */
+      SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int);
+      update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions);
+      memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); /* mirror entries 0..3 into entries 4..7 */
+      memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t));
+      memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t));
+    } else if (!zeroOut) {
+      update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false); /* baseLevel = 4, extRiceFlag = false */
+    }
+
+    if (spt == SCAN_SOCSBB) {
+      SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int);
+    }
+  }
+}
+
+
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+/**
+ * \brief Register the AVX2 dependent-quantization strategy.
+ *
+ * When built with COMPILE_INTEL_AVX2 on X86_64, registers uvg_dep_quant_decide_and_update_avx2
+ * under the type "dep_quant_decide_and_update" with the name "avx2" and the value 40
+ * (presumably the selection priority - confirm against uvg_strategyselector_register).
+ * In other builds nothing is registered.
+ *
+ * \param opaque    forwarded unchanged to uvg_strategyselector_register()
+ * \param bitdepth  not used by this function
+ * \return          nonzero unless the registration call reported failure
+ */
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
+{
+  bool success = true;
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+  success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2);
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+
+  return success;
+}
diff --git a/src/strategies/avx2/depquant-avx2.h b/src/strategies/avx2/depquant-avx2.h
new file mode 100644
index 00000000..e6db110c
--- /dev/null
+++ b/src/strategies/avx2/depquant-avx2.h
@@ -0,0 +1,46 @@
+#ifndef STRATEGIES_DEPQUANT_AVX2_H_
+#define STRATEGIES_DEPQUANT_AVX2_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Optimizations for AVX2. + */ + +#include "global.h" // IWYU pragma: keep + + +int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_DEPQUANT_AVX2_H_ diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c new file mode 100644 index 00000000..aa2ea99e --- /dev/null +++ b/src/strategies/generic/depquant-generic.c @@ -0,0 +1,238 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+#include "strategies/generic/depquant-generic.h"
+
+#include "dep_quant.h"
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "strategyselector.h"
+#include "transform.h"
+#include "uvg_math.h"
+#include "generic/quant-generic.h"
+static const int32_t g_goRiceBits[4][RICEMAX] = { /* cost table read in checkRdCostStart() as g_goRiceBits[m_goRicePar][min(value, RICEMAX - 1)]; every entry is a multiple of 32768 */
+    {32768,  65536,  98304,  131072, 163840, 196608, 262144, 262144,
+     327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216,
+     393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752,
+     458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+    {65536,  65536,  98304,  98304,  131072, 131072, 163840, 163840,
+     196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912,
+     360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448,
+     425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+    {98304,  98304,  98304,  98304,  131072, 131072, 131072, 131072,
+     163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608,
+     229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144,
+     327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
+    {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072,
+     163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840,
+     196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608,
+     229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
+};
+
+/**
+ * \brief Unconditionally fill slot decision_id of `decision` from state (decision_id + skip_offset).
+ *
+ * rdCost becomes that state's m_rdCost plus m_sbbFracBits[..][0], absLevel becomes 0 and
+ * prevId becomes 4 + m_stateId of that state.
+ *
+ * \param decision     decision record to overwrite
+ * \param state        state store that is read
+ * \param decision_id  slot to write; also the state index before skip_offset is added
+ * \param skip_offset  added to decision_id to select the state
+ */
+static INLINE void checkRdCostSkipSbbZeroOut(
+  Decision* decision,
+  const all_depquant_states* const state,
+  int decision_id,
+  int skip_offset) {
+  int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0];
+  decision->rdCost[decision_id] = rdCost;
+  decision->absLevel[decision_id] = 0;
+  decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset];
+}
+/**
+ * \brief Like checkRdCostSkipSbbZeroOut(), but only replaces the slot when the new cost is lower.
+ *
+ * The candidate cost is m_rdCost + m_sbbFracBits[..][0] of state (skip_offset + decision_id).
+ * If it is below decisions->rdCost[decision_id], the slot gets that cost, absLevel 0 and
+ * prevId 4 + m_stateId of the state; otherwise the slot is left untouched.
+ */
+static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
+{
+  int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0];
+  if (rdCost < decisions->rdCost[decision_id])
+  {
+    decisions->rdCost[decision_id] = rdCost;
+    decisions->absLevel[decision_id] = 0;
+    decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id];
+  }
+}
+/**
+ * \brief Offer pqData's candidate for slot decision_id with prevId = -1.
+ *
+ * cost = pqData->deltaDist[decision_id] + lastOffset + a level cost. Levels below 4 read
+ * state->m_coeffFracBits[absLevel] directly. Larger levels use value = (absLevel - 4) >> 1,
+ * read m_coeffFracBits[absLevel - 2 * value] and add
+ * g_goRiceBits[state->m_goRicePar][min(value, RICEMAX - 1)].
+ * The slot is overwritten (cost, absLevel, prevId = -1) only if the cost is lower than the stored one.
+ */
+static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int
+  decision_id)
+{
+  int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset;
+  if (pqData->absLevel[decision_id] < 4) {
+    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
+  }
+  else {
+    const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1;
+    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)]
+      + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1];
+  }
+  if (rdCost < decisions->rdCost[decision_id]) {
+    decisions->rdCost[decision_id] = rdCost;
+    decisions->absLevel[decision_id] = pqData->absLevel[decision_id];
+    decisions->prevId[decision_id] = -1;
+  }
+}
+
+
+/* Template that xDecide() copies over a Decision before evaluating candidates: all eight costs
+ * are INT64_MAX >> 2 (presumably so later additions cannot overflow - confirm); entries 0..3
+ * start with absLevel -1 / prevId -2, entries 4..7 with absLevel 0 / prevId 4..7. */
+static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2},
+  .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} };
+/**
+ * \brief Fill pqData with the four consecutive quantization candidates for one coefficient.
+ *
+ * The first index is qIdx = max(1, min(qp->m_maxQIdx, (absCoeff * quanCoeff + qp->m_QAdd) >> qp->m_QShift)).
+ * For qIdx, qIdx + 1, qIdx + 2 and qIdx + 3 the function stores, at slot (index & 3),
+ *   deltaDist = (scaledAdd * index + qp->m_DistAdd) >> qp->m_DistShift  and  absLevel = (index + 1) >> 1,
+ * where scaledAdd starts at qIdx * m_DistStepAdd - scaledOrg * m_DistOrgFact and grows by
+ * m_DistStepAdd per step. Four consecutive indices cover every slot 0..3 exactly once.
+ *
+ * \param qp         quantizer parameters (read only)
+ * \param absCoeff   coefficient magnitude
+ * \param pqData     output: deltaDist[4] and absLevel[4]
+ * \param quanCoeff  scaling factor multiplied with absCoeff
+ */
+static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff)
+{
+  int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff;
+  coeff_t qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift)));
+  int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact;
+  int index = qIdx & 3;
+  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
+  pqData->absLevel[index] = (++qIdx) >> 1; /* slot was chosen before the increment */
+  scaledAdd += qp->m_DistStepAdd;
+  index = qIdx & 3;
+  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
+  pqData->absLevel[index] = (++qIdx) >> 1;
+  scaledAdd += qp->m_DistStepAdd;
+  index = qIdx & 3;
+  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
+  pqData->absLevel[index] = (++qIdx) >> 1;
+  scaledAdd += qp->m_DistStepAdd;
+  index = qIdx & 3;
+  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
+  pqData->absLevel[index] = (++qIdx) >> 1;
+}
+
+static void xDecide(
+  all_depquant_states* const all_states,
+  depquant_state* const m_startState,
+  quant_block* qp,
+  const enum ScanPosType spt,
+  const coeff_t absCoeff,
+  const int lastOffset,
+  Decision* decisions,
+  bool zeroOut,
+  coeff_t quanCoeff,
+  const int skip_offset,
+  const int prev_offset)
+{
+  memcpy(decisions, &startDec, sizeof(Decision));
+
+  if (zeroOut) {
+    if (spt == SCAN_EOCSBB) {
+
checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset); + checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset); + } + return; + } + + PQData pqData; + preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + if (spt == SCAN_EOCSBB) { + checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 2, skip_offset); + checkRdCostSkipSbb(all_states, decisions, 3, skip_offset); + } + + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0); + checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2); +} + + +static void uvg_dep_quant_decide_and_update_generic( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma) +{ + Decision* decisions = &ctxs->m_trellis[scan_pos]; + SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int); + + enum ScanPosType spt = 0; + if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1) + { + spt = SCAN_SOCSBB; + } + else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16) + { + spt = SCAN_EOCSBB; + } + + xDecide(&ctxs->m_allStates, 
&ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset); + + if (scan_pos) { + if (!(scan_pos & 15)) { + SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2); + uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3); + memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t)); + memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t)); + memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t)); + } else if (!zeroOut) { + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 0); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 1); + uvg_dep_quant_update_state(ctxs, 
next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 2); + uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, 3); + } + + if (spt == SCAN_SOCSBB) { + SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); + } + } +} + + +int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 40, &uvg_dep_quant_decide_and_update_generic); + + + return success; +} diff --git a/src/strategies/generic/depquant-generic.h b/src/strategies/generic/depquant-generic.h new file mode 100644 index 00000000..488963be --- /dev/null +++ b/src/strategies/generic/depquant-generic.h @@ -0,0 +1,50 @@ +#ifndef STRATEGIES_DEPQUANT_GENERIC_H_ +#define STRATEGIES_DEPQUANT_GENERIC_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. + */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "tables.h" + + +int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_DEPQUANT_GENERIC_H_ diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c new file mode 100644 index 00000000..7ba62163 --- /dev/null +++ b/src/strategies/strategies-depquant.c @@ -0,0 +1,54 @@ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +#include "strategies/strategies-depquant.h" + +#include "strategies/avx2/depquant-avx2.h" +#include "strategies/generic/depquant-generic.h" +#include "strategyselector.h" + + +// Define function pointers. 
+dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; + +/* Registers the generic dep-quant implementation, plus the AVX2 one when the avx2 hardware flag is set; returns the accumulated registration result. */ +int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= uvg_strategy_register_depquant_generic(opaque, bitdepth); + + if (uvg_g_hardware_flags.intel_flags.avx2) { + success &= uvg_strategy_register_depquant_avx2(opaque, bitdepth); + } + return success; +} diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h new file mode 100644 index 00000000..4021c458 --- /dev/null +++ b/src/strategies/strategies-depquant.h @@ -0,0 +1,77 @@ +#ifndef STRATEGIES_DEPQUANT_H_ +#define STRATEGIES_DEPQUANT_H_ +/***************************************************************************** + * This file is part of uvg266 VVC encoder. + * + * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for dependent quantization functions. + */ + +#include "encoder.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "uvg266.h" +#include "dep_quant.h" + +// The type returns void so that it matches the registered generic implementation, which is a void function. +// Declare function types. +typedef void(dep_quant_decide_and_update_func)( + rate_estimator_t* re, + context_store* ctxs, + struct dep_quant_scan_info const* const scan_info, + const coeff_t absCoeff, + const uint32_t scan_pos, + const uint32_t width_in_sbb, + const uint32_t height_in_sbb, + const NbInfoSbb next_nb_info_ssb, + bool zeroOut, + coeff_t quantCoeff, + const uint32_t effWidth, + const uint32_t effHeight, + bool is_chroma); + + + +// Declare function pointers. +extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +// Registers the available dep-quant implementations; defined in strategies-depquant.c. +int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth); + +// Maps the strategy name to the function pointer that the strategy selector fills in. +#define STRATEGIES_DEPQUANT_EXPORTS \ + {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \ + + + +#endif //STRATEGIES_DEPQUANT_H_ diff --git a/src/strategies/strategies-quant.c b/src/strategies/strategies-quant.c index 89baf86e..62c75d6f 100644 --- a/src/strategies/strategies-quant.c +++ b/src/strategies/strategies-quant.c @@ -38,15 +38,16 @@ // Define function pointers.
-quant_func *uvg_quant; -quant_cbcr_func *uvg_quant_cbcr_residual; -quant_residual_func *uvg_quantize_residual; -dequant_func *uvg_dequant; -coeff_abs_sum_func *uvg_coeff_abs_sum; +quant_func *uvg_quant; +quant_cbcr_func *uvg_quant_cbcr_residual; +quant_residual_func *uvg_quantize_residual; +dequant_func *uvg_dequant; +coeff_abs_sum_func *uvg_coeff_abs_sum; fast_coeff_cost_func *uvg_fast_coeff_cost; -int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) { +int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth) +{ bool success = true; success &= uvg_strategy_register_quant_generic(opaque, bitdepth); diff --git a/src/strategyselector.c b/src/strategyselector.c index 477604a9..d6dffa4e 100644 --- a/src/strategyselector.c +++ b/src/strategyselector.c @@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) { fprintf(stderr, "uvg_strategy_register_encode failed!\n"); return 0; } + if (!uvg_strategy_register_depquant(&strategies, bitdepth)) { + fprintf(stderr, "uvg_strategy_register_depquant failed!\n"); + return 0; + } while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); diff --git a/src/strategyselector.h b/src/strategyselector.h index caadfda9..8bbdfbed 100644 --- a/src/strategyselector.h +++ b/src/strategyselector.h @@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st #include "strategies/strategies-intra.h" #include "strategies/strategies-sao.h" #include "strategies/strategies-encode.h" +#include "strategies/strategies-depquant.h" #include "strategies/strategies-alf.h" static const strategy_to_select_t strategies_to_select[] = { @@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = { STRATEGIES_SAO_EXPORTS STRATEGIES_ENCODE_EXPORTS STRATEGIES_ALF_EXPORTS + STRATEGIES_DEPQUANT_EXPORTS { NULL, NULL }, };