From 8caabcde1a16261d18799d61489e206dcb629119 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 7 Apr 2023 15:15:58 +0300 Subject: [PATCH] [avx2] WIP check_rd_costs_avx2 --- src/dep_quant.c | 395 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 383 insertions(+), 12 deletions(-) diff --git a/src/dep_quant.c b/src/dep_quant.c index f97dde4d..2ff848b1 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -39,6 +39,7 @@ #include "transform.h" #include "uvg_math.h" #include "generic/quant-generic.h" +#include #define sm_numCtxSetsSig 3 @@ -89,7 +90,7 @@ typedef struct typedef struct { - coeff_t absLevel[4]; + int32_t absLevel[4]; int64_t deltaDist[4]; } PQData; @@ -558,6 +559,368 @@ static INLINE void checkRdCostSkipSbbZeroOut( decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset]; } + + +static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start) +{ + int32_t a[64] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}; + __m128i offsets = _mm_set_epi32(12, 8, 4, 0); + __m128i r = _mm_i32gather_epi32(a, offsets, 1); + + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + + __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]); + __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]); + + __m256i rd_cost_a = _mm256_loadu_si256(&state->m_rdCost[start]); + __m256i rd_cost_b = rd_cost_a; + __m256i rd_cost_z = rd_cost_a; + + rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist); + rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist); + + bool all_over_or_four = true; + bool all_under_four = true; + for (int i = 0; i < 4; i++) { + all_over_or_four &= state->m_remRegBins[start + i] >= 4; + all_under_four &= state->m_remRegBins[start + i] < 4; + } + + if (all_over_or_four) { + if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits); + } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + + __m128i max_rice = _mm_set1_epi32(15); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(value, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 1)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + } + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256(&rd_costs[0])); + } + + if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) { + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits); + rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits); + } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) { + __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1); + + __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]); + __m128i t = _mm_slli_epi32(value, 1); + offsets = _mm_sub_epi32(offsets, t); + __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 1); + + __m128i max_rice = _mm_set1_epi32(15); + value = _mm_min_epi32(value, max_rice); + __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_tab = _mm_slli_epi32(go_rice_tab, 5); + value = _mm_add_epi32(value, go_rice_tab); + + __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 1)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + int64_t rd_costs[4] = {0, 0, 0, 0}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int pqA = pqAs[i]; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + if (pqDataA->absLevel[pqA] < 4) { + rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + } + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256(&rd_costs[0])); + } + + if (spt == SCAN_ISCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } else if (spt == SCAN_SOCSBB) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + + original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]); + odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1); + rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1); + rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0); + } + else { + if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) { + __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]); + __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1); + __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1); + __m256i even = _mm256_permutevar8x32_epi32(original, even_mask); + __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask); + __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0)); + __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 1)); + rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64); + rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64); + rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64); + } + else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) { + rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[3], decisions->rdCost[3], decisions->rdCost[0], decisions->rdCost[0]); + } + + else { + const int pqAs[4] = {0, 0, 3, 3}; + int64_t temp_rd_cost_a[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_b[4] = {0, 0, 0, 0}; + int64_t temp_rd_cost_z[4] = {0, 0, 0, 0}; + int64_t z_out[4] = {0, 0, 0, 0}; + _mm256_storeu_epi64(z_out, rd_cost_z); + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + if (state->m_numSigSbb[state_offset]) { + temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1]; + temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0]; + } else { + z_out[i] = decisions->rdCost[pqAs[i]]; + } + } + rd_cost_z = _mm256_loadu_epi64(z_out); + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_epi64(temp_rd_cost_a)); + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_loadu_epi64(temp_rd_cost_b)); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_loadu_epi64(temp_rd_cost_z)); + } + } + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); + } else if (all_under_four) { + __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS); + __m128i max_rice = _mm_set1_epi32(15); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(&state->m_goRiceZero[start])); + // RD cost A + { + __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]); + __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_a); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(go_rice_smaller, other, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 1); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp)); + } + // RD cost b + { + __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]); + __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_b); + + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1)); + + __m128i selected = _mm_blendv_epi8(go_rice_smaller, other, cmp); + + + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + __m128i offsets = _mm_add_epi32(selected, go_rice_offset); + __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 1); + __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits); + + rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp)); + } + // RD cost Z + { + __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start])); + go_rice_offset = _mm_slli_epi32(go_rice_offset, 5); + + go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero); + rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_offset)); + } + _mm256_storeu_epi64(temp_rd_cost_a, rd_cost_a); + _mm256_storeu_epi64(temp_rd_cost_b, rd_cost_b); + _mm256_storeu_epi64(temp_rd_cost_z, rd_cost_z); + } else { + const int pqAs[4] = {0, 0, 3, 3}; + const int pqBs[4] = {2, 2, 1, 1}; + const int decision_a[4] = {0, 2, 1, 3}; + for (int i = 0; i < 4; i++) { + const int state_offset = start + i; + const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]]; + const int pqA = pqAs[i]; + const int pqB = pqBs[i]; + int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA]; + int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB]; + int64_t rdCostZ = state->m_rdCost[state_offset]; + if (state->m_remRegBins[state_offset] >= 4) { + if (pqDataA->absLevel[pqA] < 4) { + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (pqDataA->absLevel[pqB] < 4) { + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; + } else { + const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; + } + if (spt == SCAN_ISCSBB) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else if (spt == SCAN_SOCSBB) { + rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0]; + } else if (state->m_numSigSbb[state_offset]) { + rdCostA += state->m_sigFracBits[state_offset][1]; + rdCostB += state->m_sigFracBits[state_offset][1]; + rdCostZ += state->m_sigFracBits[state_offset][0]; + } else { + rdCostZ = decisions->rdCost[decision_a[i]]; + } + } else { + rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)]; + rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)]; + rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]]; + } + temp_rd_cost_a[i] = rdCostA; + temp_rd_cost_b[i] = rdCostB; + temp_rd_cost_z[i] = rdCostZ; + } + rd_cost_a = _mm256_loadu_epi64(temp_rd_cost_a); + rd_cost_b = _mm256_loadu_epi64(temp_rd_cost_b); + rd_cost_z = _mm256_loadu_epi64(temp_rd_cost_z); + } + // Decision 0 + if (temp_rd_cost_a[0] < decisions->rdCost[0]) { + decisions->rdCost[0] = temp_rd_cost_a[0]; + decisions->absLevel[0] = pqDataA->absLevel[0]; + decisions->prevId[0] = state->m_stateId[start]; + } + if (temp_rd_cost_z[0] < decisions->rdCost[0]) { + decisions->rdCost[0] = temp_rd_cost_z[0]; + decisions->absLevel[0] = 0; + decisions->prevId[0] = state->m_stateId[start]; + } + if (temp_rd_cost_b[1] < decisions->rdCost[0]) { + decisions->rdCost[0] = temp_rd_cost_b[1]; + decisions->absLevel[0] = pqDataA->absLevel[2]; + decisions->prevId[0] = state->m_stateId[start + 1]; + } + + // Decision 1 + if (temp_rd_cost_a[1] < decisions->rdCost[2]) { + decisions->rdCost[2] = temp_rd_cost_a[1]; + decisions->absLevel[2] = pqDataA->absLevel[0]; + decisions->prevId[2] = state->m_stateId[start + 1]; + } + if (temp_rd_cost_z[1] < decisions->rdCost[2]) { + decisions->rdCost[2] = temp_rd_cost_z[1]; + decisions->absLevel[2] = 0; + decisions->prevId[2] = state->m_stateId[start + 1]; + } + if (temp_rd_cost_b[0] < decisions->rdCost[2]) { + decisions->rdCost[2] = temp_rd_cost_b[0]; + decisions->absLevel[2] = pqDataA->absLevel[2]; + decisions->prevId[2] = state->m_stateId[start]; + } + + // Decision 2 + if (temp_rd_cost_a[2] < decisions->rdCost[0]) { + decisions->rdCost[2] = temp_rd_cost_a[2]; + decisions->absLevel[2] = pqDataA->absLevel[3]; + decisions->prevId[2] = state->m_stateId[start + 2]; + } + if (temp_rd_cost_z[2] < decisions->rdCost[0]) { + decisions->rdCost[2] = temp_rd_cost_z[2]; + decisions->absLevel[2] = 0; + decisions->prevId[2] = state->m_stateId[start + 2]; + } + if (temp_rd_cost_b[3] < decisions->rdCost[0]) { + decisions->rdCost[2] = temp_rd_cost_b[3]; + decisions->absLevel[2] = pqDataA->absLevel[1]; + decisions->prevId[2] = state->m_stateId[start + 3]; + } + + // Decision 3 + if (temp_rd_cost_a[3] < decisions->rdCost[1]) { + decisions->rdCost[3] = temp_rd_cost_a[3]; + decisions->absLevel[3] = pqDataA->absLevel[3]; + decisions->prevId[3] = state->m_stateId[start + 3]; + } + if (temp_rd_cost_z[3] < decisions->rdCost[1]) { + decisions->rdCost[3] = temp_rd_cost_z[3]; + decisions->absLevel[3] = 0; + decisions->prevId[3] = state->m_stateId[start + 3]; + } + if (temp_rd_cost_b[2] < decisions->rdCost[1]) { + decisions->rdCost[3] = temp_rd_cost_b[2]; + decisions->absLevel[3] = pqDataA->absLevel[1]; + decisions->prevId[3] = state->m_stateId[start + 2]; + } +} + + static void checkRdCosts( const all_depquant_states * const state, const enum ScanPosType spt, @@ -579,18 +942,14 @@ static void checkRdCosts( } else { const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1; - rdCostA += - state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[ - value < RICEMAX ? value : RICEMAX - 1]; + rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } if (pqDataA->absLevel[pqB] < 4) { rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]]; } else { const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1; - rdCostB += - state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[ - value < RICEMAX ? value : RICEMAX - 1]; + rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1]; } if (spt == SCAN_ISCSBB) { rdCostA += state->m_sigFracBits[state_offset][1]; @@ -722,10 +1081,11 @@ static void xDecide( PQData pqData; preQuantCoeff(qp, absCoeff, &pqData, quanCoeff); - checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); - checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); - checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); - checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); + check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset); + //checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0); + //checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1); + //checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2); + //checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3); if (spt == SCAN_EOCSBB) { checkRdCostSkipSbb(all_states, decisions, 0, skip_offset); checkRdCostSkipSbb(all_states, decisions, 1, skip_offset); @@ -1132,11 +1492,22 @@ int uvg_dep_quant( dep_quant_context.m_curr_state_offset = 0; dep_quant_context.m_prev_state_offset = 4; dep_quant_context.m_skip_state_offset = 8; - + const uint32_t lfnstIdx = tree_type != UVG_CHROMA_T || compID == COLOR_Y ? cur_tu->lfnst_idx : cur_tu->cr_lfnst_idx; + int8_t t[4] = {2, 2, 2, 2}; + __m128i pq_abs_a = _mm_set_epi32(16, 0, 16, 0); + __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_epi8(t)); + __m128i cmp = _mm_cmplt_epi32(go_rice_zero, pq_abs_a); + + __m128i max_rice = _mm_set1_epi32(15); + __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice); + + __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1)); + __m128i selected = _mm_blendv_epi8(go_rice_zero, other, cmp); + const int numCoeff = width * height; memset(coeff_out, 0x00, width * height * sizeof(coeff_t));