From dfff9a8030f72568e3f2baf507457583a12e2013 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Mon, 17 Apr 2023 15:14:35 +0300
Subject: [PATCH] [avx2] Move dependent quantization code to strategies

---
 CMakeLists.txt                            |    1 -
 src/dep_quant.c                           | 1464 +--------------------
 src/dep_quant.h                           |  123 ++
 src/strategies/avx2/depquant-avx2.c       | 1389 +++++++++++++++++++
 src/strategies/avx2/depquant-avx2.h       |   46 +
 src/strategies/generic/depquant-generic.c |  238 ++++
 src/strategies/generic/depquant-generic.h |   50 +
 src/strategies/strategies-depquant.c      |   54 +
 src/strategies/strategies-depquant.h      |   77 ++
 src/strategies/strategies-quant.c         |   13 +-
 src/strategyselector.c                    |    4 +
 src/strategyselector.h                    |    2 +
 12 files changed, 1997 insertions(+), 1464 deletions(-)
 create mode 100644 src/strategies/avx2/depquant-avx2.c
 create mode 100644 src/strategies/avx2/depquant-avx2.h
 create mode 100644 src/strategies/generic/depquant-generic.c
 create mode 100644 src/strategies/generic/depquant-generic.h
 create mode 100644 src/strategies/strategies-depquant.c
 create mode 100644 src/strategies/strategies-depquant.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6460743b..d8c37bbc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -144,7 +144,6 @@ target_include_directories(uvg266 PUBLIC src/extras)
 target_include_directories(uvg266 PUBLIC src/strategies)
 
 file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c")
-file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/dep_quant.c")
 file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c")
 file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c")
 
diff --git a/src/dep_quant.c b/src/dep_quant.c
index 39439c40..519e5795 100644
--- a/src/dep_quant.c
+++ b/src/dep_quant.c
@@ -39,10 +39,8 @@
 #include "transform.h"
 #include "uvg_math.h"
 #include "generic/quant-generic.h"
-#include <immintrin.h>
-
-
 
+#include "strategies-depquant.h"
 static const int32_t g_goRiceBits[4][RICEMAX] = {
     { 32768,  65536,  98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
     { 65536,  65536,  98304,  98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
@@ -56,102 +54,6 @@ static const int g_riceShift[5] = { 0, 2, 4, 6, 8 };
 static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2,
                                          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 };
 
-enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };
-
-
-
-
-typedef struct
-{
-  uint8_t* sbbFlags;
-  uint8_t* levels;
-} SbbCtx;
-
-
-typedef struct
-{
-  int32_t absLevel[4];
-  int64_t deltaDist[4];
-} PQData;
-
-typedef struct
-{
-  int64_t ALIGNED(32) rdCost[8];
-  int32_t ALIGNED(32) absLevel[8];
-  int32_t ALIGNED(32) prevId[8];
-} Decision;
-
-
-typedef struct
-{
-  const NbInfoOut* m_nbInfo;
-  uint32_t m_sbbFlagBits[2][2];
-  SbbCtx m_allSbbCtx[8];
-  int m_curr_sbb_ctx_offset;
-  int m_prev_sbb_ctx_offset;
-  uint8_t sbb_memory[8 * 1024];
-  uint8_t level_memory[8* TR_MAX_WIDTH * TR_MAX_WIDTH];
-  int num_coeff;
-} common_context;
-
-
-typedef struct
-{
-  int64_t m_rdCost;
-  uint16_t m_absLevelsAndCtxInit[24]; // 16x8bit for abs levels + 16x16bit for ctx init id
-  int8_t m_numSigSbb;
-  int m_remRegBins;
-  int8_t m_refSbbCtxId;
-  uint32_t m_sbbFracBits[2];
-  uint32_t m_sigFracBits[2];
-  int32_t m_coeffFracBits[6];
-  int8_t m_goRicePar;
-  int8_t m_goRiceZero;
-  int8_t m_stateId;
-  uint32_t *m_sigFracBitsArray[12];
-  int32_t *m_gtxFracBitsArray[21];
-  common_context* m_commonCtx;
-
-  unsigned effWidth;
-  unsigned effHeight;
-} depquant_state;
-
-typedef struct
-{
-  int64_t         ALIGNED(32) m_rdCost[12];
-  uint16_t        ALIGNED(32) m_absLevelsAndCtxInit[12][24]; // 16x8bit for abs levels + 16x16bit for ctx init id
-  int8_t          ALIGNED(16) m_numSigSbb[12];
-  int             ALIGNED(32) m_remRegBins[12];
-  int8_t          ALIGNED(16) m_refSbbCtxId[12];
-  uint32_t        ALIGNED(32) m_sbbFracBits[12][2];
-  uint32_t        ALIGNED(32) m_sigFracBits[12][2];
-  int32_t         ALIGNED(32) m_coeffFracBits[12][6];
-  int8_t          ALIGNED(16) m_goRicePar[12];
-  int8_t          ALIGNED(16) m_goRiceZero[12];
-  int8_t          ALIGNED(16) m_stateId[12];
-  uint32_t        ALIGNED(32) m_sigFracBitsArray[12][12][2];
-  int32_t         ALIGNED(32) m_gtxFracBitsArray[21][6];
-  common_context* m_commonCtx;
-
-  unsigned effWidth;
-  unsigned effHeight;
-
-  bool all_gte_four;
-  bool all_lt_four;
-} all_depquant_states;
-
-typedef struct
-{
-    common_context  m_common_context;
-    all_depquant_states m_allStates;
-    int m_curr_state_offset;
-    int m_prev_state_offset;
-    int m_skip_state_offset;
-    depquant_state       m_startState;
-    quant_block*   m_quant;
-    Decision    m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH];
-} context_store;
-
 
 int uvg_init_nb_info(encoder_control_t * encoder) {
   memset(encoder->m_scanId2NbInfoSbbArray, 0, sizeof(encoder->m_scanId2NbInfoSbbArray));
@@ -556,326 +458,8 @@ static void depquant_state_init(depquant_state* state, uint32_t sig_frac_bits[2]
   state->m_sbbFracBits[1] = 0;
 }
 
-static INLINE void checkRdCostSkipSbbZeroOut(
-  Decision* decision, 
-  const all_depquant_states* const state,
-  int decision_id, 
-  int skip_offset) {
-  int64_t rdCost = state->m_rdCost[decision_id + skip_offset] + state->m_sbbFracBits[decision_id + skip_offset][0];
-  decision->rdCost[decision_id] = rdCost;
-  decision->absLevel[decision_id] = 0;
-  decision->prevId[decision_id] = 4 + state->m_stateId[decision_id + skip_offset];
-}
 
-
-
-static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start)
-{
-  int64_t temp_rd_cost_a[4] = {0, 0, 0, 0};
-  int64_t temp_rd_cost_b[4] = {0, 0, 0, 0};
-  int64_t temp_rd_cost_z[4] = {0, 0, 0, 0};
-
-  __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]);
-  __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]);
-
-  __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]);
-  __m256i rd_cost_b = rd_cost_a;
-  __m256i rd_cost_z = rd_cost_a;
-
-  rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist);
-  rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist);
-
-
-  if (state->all_gte_four) {
-    if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) {
-      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]);
-      __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4);
-      __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits);
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits);
-    } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) {
-      __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1);
-
-      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]);
-      __m128i t = _mm_slli_epi32(value, 1);
-      offsets = _mm_sub_epi32(offsets, t);
-      __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4);
-
-      __m128i max_rice = _mm_set1_epi32(31);
-      value = _mm_min_epi32(value, max_rice);
-      __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start]));
-      go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
-      value = _mm_add_epi32(value, go_rice_tab);
-
-      __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4));
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp));
-    } else {
-      const int pqAs[4] = {0, 0, 3, 3};
-      ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; 
-      for (int i = 0; i < 4; i++) {
-        const int      state_offset = start + i;
-        const int      pqA = pqAs[i];
-        const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]];
-        if (pqDataA->absLevel[pqA] < 4) {
-          rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]];
-        } else {
-          const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1;
-          rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
-        }
-      }
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0]));
-    }
-
-    if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) {
-      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]);
-      __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4);
-      __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits);
-      rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits);
-    } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) {
-      __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1);
-
-      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]);
-      __m128i t = _mm_slli_epi32(value, 1);
-      offsets = _mm_sub_epi32(offsets, t);
-      __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4);
-
-      __m128i max_rice = _mm_set1_epi32(31);
-      value = _mm_min_epi32(value, max_rice);
-      __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start]));
-      go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
-      value = _mm_add_epi32(value, go_rice_tab);
-
-      __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4));
-      rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp));
-    } else {
-      const int pqBs[4] = {2, 2, 1, 1};
-      int64_t rd_costs[4] = {0, 0, 0, 0}; 
-      for (int i = 0; i < 4; i++) {
-        const int      state_offset = start + i;
-        const int      pqB = pqBs[i];
-        const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]];
-        if (pqDataA->absLevel[pqB] < 4) {
-          rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]];
-        } else {
-          const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1;
-          rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
-        }
-      }
-      rd_cost_b =
-        _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0]));
-    }
-
-    if (spt == SCAN_ISCSBB) {
-      __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]);
-      __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1);
-      __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1);
-      __m256i even = _mm256_permutevar8x32_epi32(original, even_mask);
-      __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask);
-      __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0));
-      __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64);
-      rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64);
-      rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64);
-    } else if (spt == SCAN_SOCSBB) {
-      __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]);
-      __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1);
-      __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1);
-      __m256i even = _mm256_permutevar8x32_epi32(original, even_mask);
-      __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask);
-      __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0));
-      __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
-
-      original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]);
-      odd = _mm256_permutevar8x32_epi32(original, odd_mask);
-      __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
-
-      
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1);
-      rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1);
-      rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1);
-
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1);
-      rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1);
-      rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0);
-    }
-    else {
-      if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) {
-        __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]);
-        __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1);
-        __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1);
-        __m256i even = _mm256_permutevar8x32_epi32(original, even_mask);
-        __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask);
-        __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0));
-        __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
-        rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64);
-        rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64);
-        rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64);     
-      }
-      else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) {
-        rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]);
-      }
-
-      else {
-        const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3};
-        _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a);
-        _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b);
-        _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z);
-        for (int i = 0; i < 4; i++) {
-          const int state_offset = start + i;
-          if (state->m_numSigSbb[state_offset]) {
-            temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1];
-            temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1];
-            temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0];
-          } else {
-            temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]];
-          }
-        }
-        rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a);
-        rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b);
-        rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z);
-      }
-    }
-  } else if (state->all_lt_four) {
-    __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS);
-    __m128i max_rice = _mm_set1_epi32(31);
-    __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start]));
-    // RD cost A
-    {
-      __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]);
-      __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero);
-      
-      __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice);
-
-      __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1));
-
-      __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp);
-
-
-      __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
-      go_rice_offset = _mm_slli_epi32(go_rice_offset, 5);
-
-      __m128i offsets = _mm_add_epi32(selected, go_rice_offset);
-      __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4);
-      __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits);
-
-      rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp));
-    }
-    // RD cost b
-    {
-      __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]);
-      __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero);
-
-      __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice);
-
-      __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1));
-
-      __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp);
-
-
-      __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
-      go_rice_offset = _mm_slli_epi32(go_rice_offset, 5);
-
-      __m128i offsets = _mm_add_epi32(selected, go_rice_offset);
-      __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4);
-      __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits);
-
-      rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp));
-    }
-    // RD cost Z
-    {
-      __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
-      go_rice_offset = _mm_slli_epi32(go_rice_offset, 5);
-
-      go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero);
-      __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4);
-      rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab));
-    }
-  } else {
-    const int pqAs[4] = {0, 0, 3, 3};
-    const int pqBs[4] = {2, 2, 1, 1};
-    const int decision_a[4] = {0, 2, 1, 3};
-    for (int i = 0; i < 4; i++) {
-      const int      state_offset = start + i;
-      const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]];
-      const int pqA = pqAs[i];
-      const int pqB = pqBs[i];
-      int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA];
-      int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB];
-      int64_t rdCostZ = state->m_rdCost[state_offset];
-      if (state->m_remRegBins[state_offset] >= 4) {
-        if (pqDataA->absLevel[pqA] < 4) {
-          rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]];
-        } else {
-          const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1;
-          rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
-        }
-        if (pqDataA->absLevel[pqB] < 4) {
-          rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]];
-        } else {
-          const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1;
-          rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
-        }
-        if (spt == SCAN_ISCSBB) {
-          rdCostA += state->m_sigFracBits[state_offset][1];
-          rdCostB += state->m_sigFracBits[state_offset][1];
-          rdCostZ += state->m_sigFracBits[state_offset][0];
-        } else if (spt == SCAN_SOCSBB) {
-          rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1];
-          rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1];
-          rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0];
-        } else if (state->m_numSigSbb[state_offset]) {
-          rdCostA += state->m_sigFracBits[state_offset][1];
-          rdCostB += state->m_sigFracBits[state_offset][1];
-          rdCostZ += state->m_sigFracBits[state_offset][0];
-        } else {
-          rdCostZ = decisions->rdCost[decision_a[i]];
-        }
-      } else {
-        rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)];
-        rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)];
-        rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]];
-      }
-      temp_rd_cost_a[i] = rdCostA;
-      temp_rd_cost_b[i] = rdCostB;
-      temp_rd_cost_z[i] = rdCostZ;
-    }
-    rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a);
-    rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b);
-    rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z);
-  }
-  rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216);
-  rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141);
-  rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216);
-  __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost);
-
-  __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel);
-  __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId);
-  __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20);
-  __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-  decision_data = _mm256_permutevar8x32_epi32(decision_data, mask);
-
-  __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]);
-  __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]);
-  __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0);
-
-  __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b);
-  __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b);
-  __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b);
-
-  __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision);
-  __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision);
-  __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision);
-
-  __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second);
-  __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, final_decision);
-  __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision);
-
-  _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost);
-  final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
-  _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data);
-}
-
-
-static void checkRdCosts(
+void uvg_dep_quant_check_rd_costs(
   const all_depquant_states * const state,
   const enum ScanPosType            spt,
   const PQData *                    pqDataA,
@@ -950,107 +534,6 @@ static void checkRdCosts(
   }
 }
 
-static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
-{
-  int64_t rdCost = state->m_rdCost[skip_offset + decision_id] + state->m_sbbFracBits[skip_offset + decision_id][0];
-  if (rdCost < decisions->rdCost[decision_id])
-  {
-    decisions->rdCost[decision_id] = rdCost;
-    decisions->absLevel[decision_id] = 0;
-    decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id];
-  }
-}
-
-static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int
-                                    decision_id)
-{
-  int64_t rdCost = pqData->deltaDist[decision_id] + lastOffset;
-  if (pqData->absLevel[decision_id] < 4) {
-    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
-  }
-  else {
-    const coeff_t value = (pqData->absLevel[decision_id] - 4) >> 1;
-    rdCost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (value << 1)]
-              + g_goRiceBits[state->m_goRicePar][value < RICEMAX ? value : RICEMAX - 1];
-  }
-  if (rdCost < decisions->rdCost[decision_id]) {
-    decisions->rdCost[decision_id] = rdCost;
-    decisions->absLevel[decision_id] = pqData->absLevel[decision_id];
-    decisions->prevId[decision_id] = -1;
-  }
-}
-
-
-static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff)
-{
-  int64_t scaledOrg = (int64_t)(absCoeff) * quanCoeff;
-  coeff_t  qIdx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaledOrg + qp->m_QAdd) >> qp->m_QShift)));
-  int64_t scaledAdd = qIdx * qp->m_DistStepAdd - scaledOrg * qp->m_DistOrgFact;
-  int index = qIdx & 3;
-  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
-  pqData->absLevel[index] = (++qIdx) >> 1;
-  scaledAdd += qp->m_DistStepAdd;
-  index = qIdx & 3;
-  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
-  pqData->absLevel[index] = (++qIdx) >> 1;
-  scaledAdd += qp->m_DistStepAdd;
-  index = qIdx & 3;
-  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
-  pqData->absLevel[index] = (++qIdx) >> 1;
-  scaledAdd += qp->m_DistStepAdd;
-  index = qIdx & 3;
-  pqData->deltaDist[index] = (scaledAdd * qIdx + qp->m_DistAdd) >> qp->m_DistShift;
-  pqData->absLevel[index] = (++qIdx) >> 1;
-}
-
-
-static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2},
-  .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} };
-
-
-static void xDecide(
-  all_depquant_states* const all_states,
-  depquant_state* const      m_startState,
-  quant_block *              qp,
-  const enum ScanPosType     spt,
-  const coeff_t              absCoeff,
-  const int                  lastOffset,
-  Decision*                  decisions,
-  bool                       zeroOut,
-  coeff_t                    quanCoeff,
-  const int                  skip_offset,
-  const int                  prev_offset)
-{
-  memcpy(decisions, &startDec, sizeof(Decision));
-
-  if (zeroOut) {
-    if (spt == SCAN_EOCSBB) {
-      checkRdCostSkipSbbZeroOut(decisions, all_states, 0, skip_offset);
-      checkRdCostSkipSbbZeroOut(decisions, all_states, 1, skip_offset);
-      checkRdCostSkipSbbZeroOut(decisions, all_states, 2, skip_offset);
-      checkRdCostSkipSbbZeroOut(decisions, all_states, 3, skip_offset);
-    }
-    return;
-  }
-
-  PQData pqData;
-  preQuantCoeff(qp, absCoeff, &pqData, quanCoeff);
-  check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset);
-  //checkRdCosts(all_states, spt, &pqData, decisions, 0, 2, prev_offset + 0);
-  //checkRdCosts(all_states, spt, &pqData, decisions, 2, 0, prev_offset + 1);
-  //checkRdCosts(all_states, spt, &pqData, decisions, 1, 3, prev_offset + 2);
-  //checkRdCosts(all_states, spt, &pqData, decisions, 3, 1, prev_offset + 3);
-  if (spt == SCAN_EOCSBB) {
-    checkRdCostSkipSbb(all_states, decisions, 0, skip_offset);
-    checkRdCostSkipSbb(all_states, decisions, 1, skip_offset);
-    checkRdCostSkipSbb(all_states, decisions, 2, skip_offset);
-    checkRdCostSkipSbb(all_states, decisions, 3, skip_offset);
-  }
-
-  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0);
-  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2);
-}
-
 
 static INLINE unsigned templateAbsCompare(coeff_t sum)
 {
@@ -1146,354 +629,9 @@ static INLINE void update_common_context(
   memset(ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state], 0, 16 * sizeof(uint8_t));
 }
 
-static INLINE void updateStateEOS(
-  context_store*  ctxs,
-  const uint32_t  scan_pos,
-  const uint32_t  cg_pos,
-  const uint32_t  sigCtxOffsetNext,
-  const uint32_t  gtxCtxOffsetNext,
-  const uint32_t  width_in_sbb,
-  const uint32_t  height_in_sbb,
-  const uint32_t  next_sbb_right,
-  const uint32_t  next_sbb_below,
-  const Decision* decisions,
-  int             decision_id);
-
-static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos,
-                                  const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext,
-                                  const uint32_t width_in_sbb, const uint32_t height_in_sbb,
-                                  const uint32_t next_sbb_right, const uint32_t next_sbb_below,
-                                  const Decision* decisions)
-{
-  all_depquant_states* state = &ctxs->m_allStates;
-  bool all_above_minus_two = true;
-  bool all_between_zero_and_three = true;
-  bool all_above_four = true;
-
-  
-  int state_offset = ctxs->m_curr_state_offset;
-  __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost);
-  _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
-  for (int i = 0; i < 4; ++i) {
-    all_above_minus_two &= decisions->prevId[i] > -2;
-    all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4;
-    all_above_four &= decisions->prevId[i] >= 4;
-  }
-  if (all_above_minus_two) {
-    bool all_have_previous_state = true;
-    __m128i prev_state;
-    __m128i prev_state_no_offset;
-    __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel);
-    if (all_above_four) {
-      prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset);
-      prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4));
-      prev_state = _mm_add_epi32(
-        prev_state,
-            prev_state_no_offset
-      );
-      memset(&state->m_numSigSbb[state_offset], 0, 4);
-      for (int i = 0; i < 4; ++i) {
-        memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t));    
-      }
-    } else if (all_between_zero_and_three) {
-      prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
-      prev_state = _mm_add_epi32(
-        prev_state_no_offset,
-        _mm_load_si128((const __m128i*)decisions->prevId)
-      );
-      __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-      __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00));
-      __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb);
-      num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes);
-      num_sig_sbb = _mm_add_epi32(
-        num_sig_sbb,
-        _mm_min_epi32(abs_level, _mm_set1_epi32(1))
-      );
-
-      num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control);
-      int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0);
-      memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4);
-
-      int32_t prev_state_scalar[4];
-      _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state);
-      for (int i = 0; i < 4; ++i) {
-        memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t));
-      }
-    } else {
-      int prev_state_s[4] = {-1, -1, -1, -1};
-      for (int i = 0; i < 4; ++i) {
-        const int decision_id = i;
-        const int curr_state_offset = state_offset + i;
-        if (decisions->prevId[decision_id] >= 4) {
-          prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4);
-          state->m_numSigSbb[curr_state_offset] = 0;
-          memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t));
-        } else if (decisions->prevId[decision_id] >= 0) {
-          prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
-          state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id];
-          memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t));
-        } else {
-          state->m_numSigSbb[curr_state_offset] = 1;
-          memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t));
-          all_have_previous_state = false;
-        }
-      }
-      prev_state = _mm_loadu_si128((__m128i const*)prev_state_s);
-    }
-    uint32_t level_offset = scan_pos & 15;
-    __m128i  max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
-    uint32_t max_abs_s[4];
-    _mm_storeu_si128((__m128i*)max_abs_s, max_abs);
-    for (int i = 0; i < 4; ++i) {
-      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i];
-      levels[level_offset] = max_abs_s[i];
-    }
-
-    // Update common context
-    __m128i last;
-    {
-      const uint32_t numSbb = width_in_sbb * height_in_sbb;
-      common_context* cc = &ctxs->m_common_context;
-      size_t         setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t);
-      int previous_state_array[4];
-      _mm_storeu_si128((__m128i*)previous_state_array, prev_state);
-      for (int curr_state = 0; curr_state < 4; ++curr_state) {
-        uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags;
-        uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels;
-        const int p_state = previous_state_array[curr_state];
-        if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) {
-          const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state];
-          memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t));
-          memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize);
-        } else {
-          memset(sbbFlags, 0, numSbb * sizeof(uint8_t));
-          memset(levels + scan_pos, 0, setCpSize);
-        }
-        sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset];
-        memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t));
-      }
-
-      __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0);
-      __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right);
-      __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m);
-      __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0);
-
-      __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below));
-      __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0);
-
-      __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below);
-      sig_sbb         = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff));
-      sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1));
-      __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8);
-      _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits);
-
-      memset(&state->m_numSigSbb[state_offset], 0, 4);
-      memset(&state->m_goRicePar[state_offset], 0, 4);
-
-      uint8_t states[4] = {0, 1, 2, 3};
-      memcpy(&state->m_refSbbCtxId[state_offset], states, 4);
-      if (all_have_previous_state) {
-        __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4);
-        _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
-      } else {
-        const int temp = (state->effWidth * state->effHeight * 28) / 16;
-        for (int i = 0; i < 4; ++i) {
-          if (previous_state_array[i] != -1) {
-            state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]];
-          } else {
-            state->m_remRegBins[i + state_offset] = temp;
-          }
-        }
-      }
-      
-      const int        scanBeg = scan_pos - 16;
-      const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg;
-      const uint8_t*   absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg;
-
-      __m128i          levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0);
-      __m128i          first_byte = _mm_set1_epi32(0xff);
-      __m128i          ones = _mm_set1_epi32(1);
-      __m128i         fours = _mm_set1_epi32(4);
-      __m256i          all[4];
-      uint64_t         temp[4];
-      const __m256i v_shuffle = _mm256_set_epi8(15, 14,  7,  6, 13, 12,  5,  4, 11, 10,  3,  2,  9,  8,  1,  0,
-                                                31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16);
-
-      for (int id = 0; id < 16; id++, nbOut++) {
-        if (nbOut->num == 0) {
-          temp[id % 4] = 0;
-          if (id % 4 == 3) {
-            all[id / 4] = _mm256_loadu_si256((__m256i const*)temp);
-            all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle);
-          }
-          continue;
-        }
-        __m128i sum_abs = _mm_set1_epi32(0);
-        __m128i sum_abs_1 = _mm_set1_epi32(0);
-        __m128i sum_num = _mm_set1_epi32(0);
-        switch (nbOut->num) {
-        case 5:
-          {
-            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4]));
-            __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1);
-            t = _mm_and_si128(t, first_byte);
-            sum_abs = _mm_add_epi32(sum_abs, t);
-            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
-            __m128i min_t = _mm_min_epi32(
-              t,
-              _mm_add_epi32(
-                fours,
-                _mm_and_si128(t, ones)
-              )
-            );
-            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
-          }
-        case 4: {
-            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3]));
-            __m128i t     = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
-            t = _mm_and_si128(t, first_byte);
-            sum_abs = _mm_add_epi32(sum_abs, t);
-            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
-            __m128i min_t = _mm_min_epi32(
-              t,
-              _mm_add_epi32(
-                fours,
-                _mm_and_si128(t, ones)));
-            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
-        }
-        case 3: {
-            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2]));
-            __m128i t     = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
-            t = _mm_and_si128(t, first_byte);
-            sum_abs = _mm_add_epi32(sum_abs, t);
-            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
-            __m128i min_t = _mm_min_epi32(
-              t,
-              _mm_add_epi32(
-                fours,
-                _mm_and_si128(t, ones)));
-            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
-        }
-        case 2: {
-            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1]));
-            __m128i t     = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
-            t = _mm_and_si128(t, first_byte);
-            sum_abs = _mm_add_epi32(sum_abs, t);
-            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
-            __m128i min_t = _mm_min_epi32(
-              t,
-              _mm_add_epi32(
-                fours,
-                _mm_and_si128(t, ones)));
-            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
-        }
-        case 1: {
-            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0]));
-            __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
-            t = _mm_and_si128(t, first_byte);
-            sum_abs = _mm_add_epi32(sum_abs, t);
-            sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
-            __m128i min_t = _mm_min_epi32(
-              t,
-              _mm_add_epi32(
-                fours,
-                _mm_and_si128(t, ones)));
-            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
-        }
-            break;
-        default:
-          assert(0);
-        }
-        sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3);
-        sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8);
-        __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs);
-        template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1);
-        __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);
-        __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask);
-        temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0);
-        if (id % 4 == 3) {
-          all[id / 4] = _mm256_loadu_si256((__m256i const*)temp);
-          all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle);
-          last = template_ctx_init;
-        }
-      }
-
-      __m256i* v_src_tmp = all;
-
-      __m256i v_tmp[4];
-      v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20);
-      v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31);
-      v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20);
-      v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31);
-
-      __m256i v_tmp16_lo[2];
-      __m256i v_tmp16_hi[2];
-      v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]);
-      v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]);
-      v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]);
-      v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]);
-
-      v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0));
-      v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0));
-      v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0));
-      v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0));
-
-      _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8),  _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20));
-      _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8),  _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31));
-      _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8),  _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20));
-      _mm256_store_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8),  _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31));
-
-      for (int i = 0; i < 4; ++i) {
-        memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16);
-      }
-    }
-
-    __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7));
-    __m128i sum_abs1 = _mm_and_si128(
-      _mm_srli_epi32(last, 3),
-      _mm_set1_epi32(31));
-
-    __m128i sum_abs_min = _mm_min_epi32(
-      _mm_set1_epi32(3),
-      _mm_srli_epi32(
-        _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)),
-        1));
-
-    __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0);
-    offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
-    offsets         = _mm_add_epi32(offsets, sum_abs_min);
-    __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8);
-    _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
 
 
-    __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num);
-    __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
-    uint32_t sum_gt1_s[4];
-    _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1);
-    for (int i = 0; i < 4; ++i) {
-      memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0]));
-    }
-  }
-  else {
-    for (int i = 0; i < 4; i++) {
-      updateStateEOS(
-        ctxs,
-        scan_pos,
-        cg_pos,
-        sigCtxOffsetNext,
-        gtxCtxOffsetNext,
-        width_in_sbb,
-        height_in_sbb,
-        next_sbb_right,
-        next_sbb_below,
-        decisions,
-        i);
-    }
-  }
-}
-
-
-static INLINE void updateStateEOS(
+void uvg_dep_quant_update_state_eos(
   context_store*   ctxs,
   const uint32_t   scan_pos,
   const uint32_t   cg_pos,
@@ -1542,542 +680,9 @@ static INLINE void updateStateEOS(
            state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0]));
   }
 }
-static INLINE void updateState(
-  context_store*  ctxs,
-  int             numIPos,
-  const uint32_t  scan_pos,
-  const Decision* decisions,
-  const uint32_t  sigCtxOffsetNext,
-  const uint32_t  gtxCtxOffsetNext,
-  const NbInfoSbb next_nb_info_ssb,
-  const int       baseLevel,
-  const bool      extRiceFlag,
-  int             decision_id);
-
-static INLINE void update_states_avx2(
-  context_store*  ctxs,
-  int             numIPos,
-  const uint32_t  scan_pos,
-  const Decision* decisions,
-  const uint32_t  sigCtxOffsetNext,
-  const uint32_t  gtxCtxOffsetNext,
-  const NbInfoSbb next_nb_info_ssb,
-  const int       baseLevel,
-  const bool      extRiceFlag)
-{
-  all_depquant_states* state = &ctxs->m_allStates;
-
-  bool all_non_negative = true;
-  bool all_above_minus_two = true;
-  bool all_minus_one = true;
-  for (int i = 0; i < 4; ++i) {
-    all_non_negative &= decisions->prevId[i] >= 0;
-    all_above_minus_two &= decisions->prevId[i] > -2;
-    all_minus_one &= decisions->prevId[i] == -1;
-  }
-  int state_offset = ctxs->m_curr_state_offset;
-  __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost);
-  _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
-  if (all_above_minus_two) {
-
-    bool    rem_reg_all_gte_4 = true;
-    bool    rem_reg_all_lt4 = true;
-
-    __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel);
-    if (all_non_negative) {
-      __m128i prv_states  = _mm_load_si128((__m128i const*)decisions->prevId);
-      __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
-      prv_states = _mm_add_epi32(prv_states, prev_offset);
-      __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-      __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control);
-      
-      __m128i sig_sbb   = _mm_load_si128((__m128i const*)state->m_numSigSbb);
-      sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states);
-      __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1));
-      has_coeff         = _mm_shuffle_epi8(has_coeff, control);
-      sig_sbb           = _mm_or_si128(sig_sbb, has_coeff);
-      int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0);
-      memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4);
-      
-      __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId);
-      ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states);
-      int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0);
-      memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4);
-      
-      __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar);
-      go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states);
-      int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
-      memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
-
-      
-      __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8);
-      _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
-
-      __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4);
-      __m128i ones = _mm_set1_epi32(1);
-      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones);
-
-      __m128i reg_bins_sub = _mm_set1_epi32(0);
-      __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2));
-      __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two);
-
-      __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
-      reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four);
-      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub);
-      _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins);
-
-      __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); 
-      int     bit_mask = _mm_movemask_epi8(mask);           
-      rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
-      mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
-      bit_mask = _mm_movemask_epi8(mask); 
-      rem_reg_all_lt4 = (bit_mask == 0xFFFF);
-
-      int32_t prv_states_scalar[4];
-      _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states);
-      for (int i = 0; i < 4; ++i) {
-        memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t));        
-      }
-    }
-    else if (all_minus_one) {
-      memset(&state->m_numSigSbb[state_offset], 1, 4);
-      memset(&state->m_refSbbCtxId[state_offset], -1, 4);
-
-      const int a = (state->effWidth * state->effHeight * 28) / 16;
-
-      __m128i   rem_reg_bins = _mm_set1_epi32(a);
-      __m128i   sub = _mm_blendv_epi8(
-        _mm_set1_epi32(3),
-        abs_level,
-        _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2))
-      );
-      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub);
-      _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
-
-      __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
-      int     bit_mask = _mm_movemask_epi8(mask);
-      rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
-      mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
-      bit_mask = _mm_movemask_epi8(mask);
-      rem_reg_all_lt4 = (bit_mask == 0xFFFF);
-      
-      memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4);
-      
-    }
-    else {
-      for (int i = 0; i< 4; ++i) {
-        const int decision_id = i;
-        const int state_id = state_offset + i;
-        if (decisions->prevId[decision_id] >= 0) {
-          const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
-          state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id];
-          state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState];
-          state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0];
-          state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1];
-          state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1;
-          state->m_goRicePar[state_id] = state->m_goRicePar[prvState];
-          if (state->m_remRegBins[state_id] >= 4) {
-            state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
-          }
-          memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t));
-        } else {
-          state->m_numSigSbb[state_id] = 1;
-          state->m_refSbbCtxId[state_id] = -1;
-          int ctxBinSampleRatio = 28;
-          //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
-          state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
-          memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t));
-        }
-        rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4;
-        rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4;
-      }
-    }
-    uint32_t level_offset = scan_pos & 15;
-    __m128i   max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
-    uint32_t max_abs_s[4];
-    _mm_storeu_si128((__m128i*)max_abs_s, max_abs);
-    for (int i = 0; i < 4; ++i) {
-      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i];
-      levels[level_offset] = max_abs_s[i];
-    }
-    state->all_gte_four = rem_reg_all_gte_4;
-    state->all_lt_four = rem_reg_all_lt4;
-    if (rem_reg_all_gte_4) {
-      const __m128i  first_two_bytes = _mm_set1_epi32(0xffff);
-      const __m128i  first_byte = _mm_set1_epi32(0xff);
-      const __m128i  ones = _mm_set1_epi32(1);
-      const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8;
-      const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0);
-      const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1);
-      __m128i        tinit = _mm_i32gather_epi32(
-        (int *)state->m_absLevelsAndCtxInit[state_offset],
-        _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)),
-        2);
-      tinit = _mm_and_si128(tinit, first_two_bytes);
-      __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31));
-      __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7));
-
-      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset];
-      switch (numIPos) {
-      case 5:
-        {
-          __m128i t = _mm_i32gather_epi32(
-            (int *)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
-            1);
-          t = _mm_and_si128(t, first_byte);
-          __m128i min_arg = _mm_min_epi32(
-            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
-            t
-          );
-          sum_abs1 = _mm_add_epi32(
-            sum_abs1,
-            min_arg
-          );
-          sum_num = _mm_add_epi32(
-            sum_num,
-            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
-        }
-      case 4:
-        {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
-            1);
-          t = _mm_and_si128(t, first_byte);
-          __m128i min_arg = _mm_min_epi32(
-            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
-            t
-          );
-          sum_abs1 = _mm_add_epi32(
-            sum_abs1,
-            min_arg
-          );
-          sum_num = _mm_add_epi32(
-            sum_num,
-            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
-        }
-      case 3:
-        {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
-            1);
-          t = _mm_and_si128(t, first_byte);
-          __m128i min_arg = _mm_min_epi32(
-            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
-            t
-          );
-          sum_abs1 = _mm_add_epi32(
-            sum_abs1,
-            min_arg
-          );
-          sum_num = _mm_add_epi32(
-            sum_num,
-            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
-        }
-      case 2:
-        {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
-            1);
-          t = _mm_and_si128(t, first_byte);
-        __m128i min_arg = _mm_min_epi32(
-              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
-              t
-            );
-          sum_abs1 = _mm_add_epi32(
-            sum_abs1,
-            min_arg
-          );
-          sum_num = _mm_add_epi32(
-            sum_num,
-            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
-        }
-      case 1: {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
-            1);
-          t = _mm_and_si128(t, first_byte);
-          __m128i min_arg = _mm_min_epi32(
-            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
-            t
-          );
-          sum_abs1 = _mm_add_epi32(
-            sum_abs1,
-            min_arg
-            );
-          sum_num = _mm_add_epi32(
-            sum_num,
-            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
-        } break;
-      default:
-          assert(0);
-      }
-      __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num);
-      __m128i  offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0);
-      offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
-      __m128i temp = _mm_min_epi32(
-        _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1),
-        _mm_set1_epi32(3));
-      offsets = _mm_add_epi32(offsets, temp);
-      __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8);
-      _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
-
-      sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
-      sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext));
-      uint32_t sum_gt1_s[4];
-      _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1);
-      for (int i = 0; i < 4; ++i) {
-        memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0]));
-      }
-
-      __m128i sum_abs = _mm_srli_epi32(tinit, 8);
-      sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32));
-      switch (numIPos) {
-        case 5:
-          {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
-            1);
-          sum_abs = _mm_add_epi32(t, sum_abs);
-          }
-        case 4:
-          {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
-            1);
-          sum_abs = _mm_add_epi32(t, sum_abs);
-          }
-        case 3:
-          {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
-            1);
-          sum_abs = _mm_add_epi32(t, sum_abs);
-          }
-        case 2:
-          {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
-            1);
-          sum_abs = _mm_add_epi32(t, sum_abs);
-          }
-        case 1:
-          {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
-            1);
-          sum_abs = _mm_add_epi32(t, sum_abs);
-          } break;
-        default:
-          assert(0);
-      }
-      sum_abs = _mm_and_si128(sum_abs, first_byte);
-      if (extRiceFlag) {
-        assert(0 && "Not implemented for avx2");
-      } else {
-        __m128i sum_all = _mm_max_epi32(
-          _mm_min_epi32(
-            _mm_set1_epi32(31),
-            _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))),
-          _mm_set1_epi32(0));
-        __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4);
-        __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        __m128i go_rice_par = _mm_shuffle_epi8(temp, control);
-        int     go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
-        memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
-      }
-    }
-
-    else if (rem_reg_all_lt4) {
-      uint8_t*       levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset];
-      const __m128i  last_two_bytes = _mm_set1_epi32(0xffff);
-      const __m128i  last_byte = _mm_set1_epi32(0xff);
-      const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8;
-      const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0);
-      const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1);
-      __m128i       tinit = _mm_i32gather_epi32(
-        (int*)state->m_absLevelsAndCtxInit[state_offset],
-        _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)),
-        2);
-      tinit = _mm_and_si128(tinit, last_two_bytes);
-      __m128i sum_abs = _mm_srli_epi32(tinit, 8);
-      switch (numIPos) {
-        case 5: {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
-            1);
-          t = _mm_and_si128(t, last_byte);
-          sum_abs = _mm_add_epi32(sum_abs, t);
-        }
-        case 4: {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
-            1);
-          t = _mm_and_si128(t, last_byte);
-          sum_abs = _mm_add_epi32(sum_abs, t);
-        }
-        case 3: {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
-            1);
-          t = _mm_and_si128(t, last_byte);
-          sum_abs = _mm_add_epi32(sum_abs, t);
-        }
-        case 2: {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
-            1);
-          t = _mm_and_si128(t, last_byte);
-          sum_abs = _mm_add_epi32(sum_abs, t);
-        }
-        case 1: {
-          __m128i t = _mm_i32gather_epi32(
-            (int*)levels,
-            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
-            1);
-          t = _mm_and_si128(t, last_byte);
-          sum_abs = _mm_add_epi32(sum_abs, t);
-        } break;
-        default:
-          assert(0);
-      }
-      if (extRiceFlag) {
-        assert(0 && "Not implemented for avx2");
-      } else {
-        __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs);
-        __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4);
-        __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-        __m128i go_rice_par = _mm_shuffle_epi8(temp, control);
-        int     go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
-        memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
-
-        
-        for (int i = 0; i < 4; ++i) {
-          state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i];
-          
-        }
-
-      }
-
-    }
-    else {
-      for (int i = 0; i < 4; ++i) {
-        const int state_id = state_offset + i;
-        uint8_t*  levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]);
-        if (state->m_remRegBins[state_id] >= 4) {
-          coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)];
-          coeff_t sumAbs1 = (tinit >> 3) & 31;
-          coeff_t sumNum = tinit & 7;
-#define UPDATE(k)                                  \
-  {                                                \
-    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
-    sumAbs1 += MIN(4 + (t & 1), t);                \
-    sumNum += !!t;                                 \
-  }
-          switch (numIPos) {
-            case 5: UPDATE(4);
-            case 4: UPDATE(3);
-            case 3: UPDATE(2);
-            case 2: UPDATE(1);
-            case 1: UPDATE(0); break;
-            default: assert(0);
-          }
-#undef UPDATE
-          coeff_t sumGt1 = sumAbs1 - sumNum;
-          state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0];
-          state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1];
-          memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0]));
 
 
-          coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8;
-#define UPDATE(k)                                  \
-  {                                                \
-    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
-    sumAbs += t;                                   \
-  }
-          switch (numIPos) {
-            case 5: UPDATE(4);
-            case 4: UPDATE(3);
-            case 3: UPDATE(2);
-            case 2: UPDATE(1);
-            case 1: UPDATE(0); break;
-            default: assert(0);
-          }
-#undef UPDATE
-          if (extRiceFlag) {
-            unsigned currentShift = templateAbsCompare(sumAbs);
-            sumAbs = sumAbs >> currentShift;
-            int sumAll = MAX(MIN(31, (int)sumAbs - (int)baseLevel), 0);
-            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll];
-            state->m_goRicePar[state_id] += currentShift;
-          } else {
-            int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0);
-            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll];
-          }
-        } else {
-          coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8;
-#define UPDATE(k)                                  \
-  {                                                \
-    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
-    sumAbs += t;                                   \
-  }
-          switch (numIPos) {
-            case 5: UPDATE(4);
-            case 4: UPDATE(3);
-            case 3: UPDATE(2);
-            case 2: UPDATE(1);
-            case 1: UPDATE(0); break;
-            default: assert(0);
-          }
-#undef UPDATE
-          if (extRiceFlag) {
-            unsigned currentShift = templateAbsCompare(sumAbs);
-            sumAbs = sumAbs >> currentShift;
-            sumAbs = MIN(31, sumAbs);
-            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs];
-            state->m_goRicePar[state_id] += currentShift;
-          } else {
-            sumAbs = MIN(31, sumAbs);
-            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs];
-          }
-          state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 1 : 2) << state->m_goRicePar[state_id];
-        }
-      }
-    }
-  } else {
-    for (int i = 0; i < 4; ++i) {
-      state->all_gte_four = true;
-      state->all_lt_four = true;
-      updateState(
-        ctxs,
-        numIPos,
-        scan_pos,
-        decisions,
-        sigCtxOffsetNext,
-        gtxCtxOffsetNext,
-        next_nb_info_ssb,
-        baseLevel,
-        extRiceFlag,
-        i);
-    }
-  }
-}
-
-
-static INLINE void updateState(
+void uvg_dep_quant_update_state(
   context_store * ctxs,
   int             numIPos,
   const uint32_t  scan_pos,
@@ -2090,7 +695,7 @@ static INLINE void updateState(
   int             decision_id) {
   all_depquant_states* state = &ctxs->m_allStates;
   int state_id = ctxs->m_curr_state_offset + decision_id;
-  // state->m_rdCost[state_id] = decisions->rdCost[decision_id];
+  state->m_rdCost[state_id] = decisions->rdCost[decision_id];
   if (decisions->prevId[decision_id] > -2) {
     if (decisions->prevId[decision_id] >= 0) {
       const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
@@ -2200,61 +805,6 @@ static INLINE void updateState(
 }
 
 static bool same[13];
-static void xDecideAndUpdate(
-  rate_estimator_t*                         re,
-  context_store*                          ctxs,
-  struct dep_quant_scan_info const* const scan_info,
-  const coeff_t                           absCoeff,
-  const uint32_t                          scan_pos,
-  const uint32_t                          width_in_sbb,
-  const uint32_t                          height_in_sbb,
-  const NbInfoSbb                         next_nb_info_ssb,
-  bool                                    zeroOut,
-  coeff_t                                 quantCoeff,
-  const uint32_t                          effWidth,
-  const uint32_t                          effHeight,
-  bool                                    is_chroma)
-{
-  Decision* decisions = &ctxs->m_trellis[scan_pos];
-  SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int);
-
-  enum ScanPosType spt = 0;
-  if ((scan_pos & 15) == 15 && scan_pos > 16 && scan_pos < effHeight * effWidth - 1)
-  {
-    spt = SCAN_SOCSBB;
-  }
-  else if ((scan_pos & 15) == 0 && scan_pos > 0 && scan_pos < effHeight * effWidth - 16)
-  {
-    spt = SCAN_EOCSBB;
-  }
-
-  xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff, re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y], decisions, zeroOut, quantCoeff,ctxs->m_skip_state_offset, ctxs->m_prev_state_offset);
-
-  if (scan_pos) {
-    if (!(scan_pos & 15)) {
-      SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int);
-      update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions);
-      //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 0);
-      //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 1);
-      //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 2);
-      //updateStateEOS(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, 3);
-      memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t));
-      memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t));
-      memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t));
-    } else if (!zeroOut) {
-      update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false);
-    /*  updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 0);
-      updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 1);
-      updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 2);
-      updateState(ctxs, next_nb_info_ssb.num, scan_pos, decisions, sigCtxOffsetNext, gtxCtxOffsetNext, next_nb_info_ssb, 4, false, 3);*/
-    }
-
-    if (spt == SCAN_SOCSBB) {
-      SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int);
-    }
-  }
-}
-
 
 int uvg_dep_quant(
   const encoder_state_t* const state,
@@ -2419,7 +969,7 @@ int uvg_dep_quant(
     if (enableScalingLists) {
       init_quant_block(state, dep_quant_context.m_quant, cur_tu, log2_tr_width, log2_tr_height, compID, needs_block_size_trafo_scale, q_coeff[blkpos]);
 
-      xDecideAndUpdate(
+      uvg_dep_quant_decide_and_update(
         rate_estimator,
         ctxs,
         scan_info,
@@ -2436,7 +986,7 @@ int uvg_dep_quant(
         ); //tu.cu->slice->getReverseLastSigCoeffFlag());
     }
     else {
-      xDecideAndUpdate(
+      uvg_dep_quant_decide_and_update(
         rate_estimator,
         ctxs,
         scan_info,
diff --git a/src/dep_quant.h b/src/dep_quant.h
index ebb54d31..676d1bab 100644
--- a/src/dep_quant.h
+++ b/src/dep_quant.h
@@ -46,6 +46,8 @@
 
 typedef struct encoder_control_t encoder_control_t;
 
+enum ScanPosType { SCAN_ISCSBB = 0, SCAN_SOCSBB = 1, SCAN_EOCSBB = 2 };  // position class passed to the RD-cost checks; presumably inside / start of / end of a coefficient sub-block - confirm
+
 struct dep_quant_scan_info
 {
   uint8_t sig_ctx_offset[2];
@@ -97,6 +99,91 @@ typedef struct
   uint16_t outPos[5];
 } NbInfoOut;
 
+typedef struct {  // four quantization candidates; the RD checks pair indices {0,3} as option A and {2,1} as option B
+  int32_t absLevel[4];  // candidate levels (compared against 4 and used as fraction-bit indices in the RD checks)
+  int64_t deltaDist[4];  // added to a state's m_rdCost when forming the candidate's RD cost
+} PQData;
+
+typedef struct {  // trellis decisions; context_store.m_trellis is an array of these
+  int64_t ALIGNED(32) rdCost[8];  // rdCost[decision_id] is copied into the state's m_rdCost by uvg_dep_quant_update_state
+  int32_t ALIGNED(32) absLevel[8];
+  int32_t ALIGNED(32) prevId[8];  // >= 0: state index relative to m_prev_state_offset; negative values take other branches in uvg_dep_quant_update_state
+} Decision;
+
+
+typedef struct {  // two byte-buffer pointers; common_context keeps eight of these in m_allSbbCtx
+  uint8_t* sbbFlags;
+  uint8_t* levels;
+} SbbCtx;
+
+typedef struct {  // embedded in context_store and referenced from the state structs via m_commonCtx
+  const NbInfoOut* m_nbInfo;
+  uint32_t         m_sbbFlagBits[2][2];
+  SbbCtx           m_allSbbCtx[8];
+  int              m_curr_sbb_ctx_offset;  // presumably an offset into m_allSbbCtx - confirm
+  int              m_prev_sbb_ctx_offset;  // presumably an offset into m_allSbbCtx - confirm
+  uint8_t          sbb_memory[8 * 1024];  // presumably backing storage for the sbbFlags pointers of m_allSbbCtx - confirm
+  uint8_t          level_memory[8 * TR_MAX_WIDTH * TR_MAX_WIDTH];  // presumably backing storage for the levels pointers of m_allSbbCtx - confirm
+  int              num_coeff;
+} common_context;
+
+
+typedef struct {  // single-state layout using the same m_* field names as all_depquant_states; context_store uses it for m_startState
+  int64_t  m_rdCost;
+  uint16_t m_absLevelsAndCtxInit
+    [24]; // 48 bytes: 16 x 8-bit abs levels (first 8 elements) + 16 x 16-bit ctx init values
+  int8_t          m_numSigSbb;
+  int             m_remRegBins;
+  int8_t          m_refSbbCtxId;
+  uint32_t        m_sbbFracBits[2];
+  uint32_t        m_sigFracBits[2];
+  int32_t         m_coeffFracBits[6];
+  int8_t          m_goRicePar;
+  int8_t          m_goRiceZero;
+  int8_t          m_stateId;
+  uint32_t*       m_sigFracBitsArray[12];  // pointers here, whereas all_depquant_states stores the values inline
+  int32_t*        m_gtxFracBitsArray[21];  // pointers here, whereas all_depquant_states stores the values inline
+  common_context* m_commonCtx;
+
+  unsigned        effWidth;
+  unsigned        effHeight;
+} depquant_state;
+typedef struct {  // structure-of-arrays state store: 12 states per field, addressed as <state offset> + 0..3 (presumably three banks of four - confirm)
+  int64_t  ALIGNED(32) m_rdCost[12];  // check_rd_costs_avx2 loads four consecutive entries with one 256-bit load
+  uint16_t ALIGNED(32) m_absLevelsAndCtxInit
+    [12][24]; // per state: 16 x 8-bit abs levels (first 8 elements) + 16 x 16-bit ctx init values; the state update decodes a ctx init value as bits 0-2, bits 3-7 and bits 8-15
+  int8_t          ALIGNED(16) m_numSigSbb[12];  // tested for non-zero in the RD checks to decide whether the sig fraction bits are added
+  int             ALIGNED(32) m_remRegBins[12];  // the scalar RD check branches on m_remRegBins >= 4
+  int8_t          ALIGNED(16) m_refSbbCtxId[12];
+  uint32_t        ALIGNED(32) m_sbbFracBits[12][2];  // [1] is added to all three RD costs in the SCAN_SOCSBB case
+  uint32_t        ALIGNED(32) m_sigFracBits[12][2];  // [0] is added to the zero-candidate cost, [1] to the A and B costs
+  int32_t         ALIGNED(32) m_coeffFracBits[12][6];  // 6 entries per state; the AVX2 RD check gathers across four consecutive states using element offsets 0/6/12/18
+  int8_t          ALIGNED(16) m_goRicePar[12];  // selects the row of g_goRiceBits in the RD checks
+  int8_t          ALIGNED(16) m_goRiceZero[12];
+  int8_t          ALIGNED(16) m_stateId[12];
+  uint32_t        ALIGNED(32) m_sigFracBitsArray[12][12][2];
+  int32_t         ALIGNED(32) m_gtxFracBitsArray[21][6];  // rows are copied whole into m_coeffFracBits by the state update
+  common_context* m_commonCtx;
+
+  unsigned        effWidth;
+  unsigned        effHeight;
+
+  bool            all_gte_four;  // tested first by check_rd_costs_avx2; presumably "all four current states have m_remRegBins >= 4" - confirm where it is set
+  bool            all_lt_four;  // tested second by check_rd_costs_avx2; presumably "all four current states have m_remRegBins < 4" - confirm where it is set
+} all_depquant_states;
+
+typedef struct {  // top-level dep-quant working set handed to the update/decide functions as ctxs
+  common_context      m_common_context;
+  all_depquant_states m_allStates;
+  int                 m_curr_state_offset;  // base index into m_allStates: state_id = m_curr_state_offset + decision_id
+  int                 m_prev_state_offset;  // base index used to resolve Decision.prevId into a previous state
+  int                 m_skip_state_offset;
+  depquant_state      m_startState;
+  quant_block*        m_quant;
+  Decision            m_trellis[TR_MAX_WIDTH * TR_MAX_WIDTH];  // presumably one Decision per scan position - confirm
+} context_store;
+
+
 int uvg_init_nb_info(encoder_control_t* encoder);
 void uvg_dealloc_nb_info(encoder_control_t* encoder);
 
@@ -122,4 +209,40 @@ int uvg_dep_quant(
   enum uvg_tree_type tree_type,
   int* absSum,
   const bool enableScalingLists);
+
+
+void uvg_dep_quant_update_state(  // updates the state at ctxs->m_curr_state_offset + decision_id from decisions->*[decision_id]; formerly the static updateState
+  context_store*  ctxs,
+  int             numIPos,
+  const uint32_t  scan_pos,
+  const Decision* decisions,
+  const uint32_t  sigCtxOffsetNext,
+  const uint32_t  gtxCtxOffsetNext,
+  const NbInfoSbb next_nb_info_ssb,
+  const int       baseLevel,
+  const bool      extRiceFlag,
+  int             decision_id);  // index into decisions->rdCost / prevId and offset of the updated state
+
+
+void uvg_dep_quant_update_state_eos(  // NOTE(review): definition not in view; presumably the per-decision_id end-of-sub-block state update - confirm
+  context_store*  ctxs,
+  const uint32_t  scan_pos,
+  const uint32_t  cg_pos,
+  const uint32_t  sigCtxOffsetNext,
+  const uint32_t  gtxCtxOffsetNext,
+  const uint32_t  width_in_sbb,
+  const uint32_t  height_in_sbb,
+  const uint32_t  next_sbb_right,
+  const uint32_t  next_sbb_below,
+  const Decision* decisions,
+  int             decision_id);
+
+void uvg_dep_quant_check_rd_costs(  // NOTE(review): definition not in view; presumably the single-state scalar form of the RD cost check - confirm
+  const all_depquant_states* const state,
+  const enum ScanPosType           spt,
+  const PQData*                    pqDataA,
+  Decision*                        decisions,
+  const int                        decisionA,
+  const int                        decisionB,
+  const int                        state_offset);
 #endif
diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c
new file mode 100644
index 00000000..86056de4
--- /dev/null
+++ b/src/strategies/avx2/depquant-avx2.c
@@ -0,0 +1,1389 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
+/*
+* \file
+*/
+
+#include "strategies/avx2/depquant-avx2.h"
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+#include "dep_quant.h"
+
+#include <immintrin.h>
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "transform.h"
+#include "generic/quant-generic.h"
+#include "uvg_math.h"
+static const int32_t g_goRiceBits[4][RICEMAX] = {  // row = Go-Rice parameter, column = value clamped to RICEMAX - 1; also gathered as a flat array at (par << 5) + value, which assumes RICEMAX == 32
+    { 32768,  65536,  98304, 131072, 163840, 196608, 262144, 262144, 327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+    { 65536,  65536,  98304,  98304, 131072, 131072, 163840, 163840, 196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448, 425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+    { 98304,  98304,  98304,  98304, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144, 327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
+    {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608, 229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
+};
+
+static const int g_riceT[4] = { 32,128, 512, 2048 };  // NOTE(review): no use of this table is visible in the reviewed part of the file
+static const int g_riceShift[5] = { 0, 2, 4, 6, 8 };  // NOTE(review): no use of this table is visible in the reviewed part of the file
+
+static const uint32_t g_goRiceParsCoeff[32] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2,  // 32-entry lookup yielding a Go-Rice parameter (0..3)
+                                         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3 };  // the state-update code clamps the index to 0..31 before the lookup
+
+static void check_rd_costs_avx2(const all_depquant_states* const state, const enum ScanPosType spt, const PQData* pqDataA, Decision* decisions, int start)
+{
+  int64_t temp_rd_cost_a[4] = {0, 0, 0, 0};
+  int64_t temp_rd_cost_b[4] = {0, 0, 0, 0};
+  int64_t temp_rd_cost_z[4] = {0, 0, 0, 0};
+
+  __m256i pq_a_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[0], pqDataA->deltaDist[0], pqDataA->deltaDist[3], pqDataA->deltaDist[3]);
+  __m256i pq_b_delta_dist = _mm256_setr_epi64x(pqDataA->deltaDist[2], pqDataA->deltaDist[2], pqDataA->deltaDist[1], pqDataA->deltaDist[1]);
+
+  __m256i rd_cost_a = _mm256_load_si256((__m256i const*)&state->m_rdCost[start]);
+  __m256i rd_cost_b = rd_cost_a;
+  __m256i rd_cost_z = rd_cost_a;
+
+  rd_cost_a = _mm256_add_epi64(rd_cost_a, pq_a_delta_dist);
+  rd_cost_b = _mm256_add_epi64(rd_cost_b, pq_b_delta_dist);
+
+
+  if (state->all_gte_four) {
+    if (pqDataA->absLevel[0] < 4 && pqDataA->absLevel[3] < 4) {
+      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]);
+      __m128i coeff_frac_bits = _mm_i32gather_epi32(&state->m_coeffFracBits[start][0], offsets, 4);
+      __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits);
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, ext_frac_bits);
+    } else if (pqDataA->absLevel[0] >= 4 && pqDataA->absLevel[3] >= 4) {
+      __m128i value = _mm_set_epi32((pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[3] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1, (pqDataA->absLevel[0] - 4) >> 1);
+
+      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[3], 12 + pqDataA->absLevel[3], 6 + pqDataA->absLevel[0], 0 + pqDataA->absLevel[0]);
+      __m128i t = _mm_slli_epi32(value, 1);
+      offsets = _mm_sub_epi32(offsets, t);
+      __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4);
+
+      __m128i max_rice = _mm_set1_epi32(31);
+      value = _mm_min_epi32(value, max_rice);
+      __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start]));
+      go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
+      value = _mm_add_epi32(value, go_rice_tab);
+
+      __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4));
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp));
+    } else {
+      const int pqAs[4] = {0, 0, 3, 3};
+      ALIGNED(32) int64_t rd_costs[4] = {0, 0, 0, 0}; 
+      for (int i = 0; i < 4; i++) {
+        const int      state_offset = start + i;
+        const int      pqA = pqAs[i];
+        const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]];
+        if (pqDataA->absLevel[pqA] < 4) {
+          rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]];
+        } else {
+          const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1;
+          rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
+        }
+      }
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_loadu_si256((__m256i const *)&rd_costs[0]));
+    }
+
+    if (pqDataA->absLevel[1] < 4 && pqDataA->absLevel[2] < 4) {
+      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]);
+      __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4);
+      __m256i ext_frac_bits = _mm256_cvtepi32_epi64(coeff_frac_bits);
+      rd_cost_b = _mm256_add_epi64(rd_cost_b, ext_frac_bits);
+    } else if (pqDataA->absLevel[1] >= 4 && pqDataA->absLevel[2] >= 4) {
+      __m128i value = _mm_set_epi32((pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[1] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1, (pqDataA->absLevel[2] - 4) >> 1);
+
+      __m128i offsets = _mm_set_epi32(18 + pqDataA->absLevel[1], 12 + pqDataA->absLevel[1], 6 + pqDataA->absLevel[2], 0 + pqDataA->absLevel[2]);
+      __m128i t = _mm_slli_epi32(value, 1);
+      offsets = _mm_sub_epi32(offsets, t);
+      __m128i coeff_frac_bits = _mm_i32gather_epi32(state->m_coeffFracBits[start], offsets, 4);
+
+      __m128i max_rice = _mm_set1_epi32(31);
+      value = _mm_min_epi32(value, max_rice);
+      __m128i go_rice_tab = _mm_cvtepi8_epi32(_mm_loadu_si32(&state->m_goRicePar[start]));
+      go_rice_tab = _mm_slli_epi32(go_rice_tab, 5);
+      value = _mm_add_epi32(value, go_rice_tab);
+
+      __m128i temp = _mm_add_epi32(coeff_frac_bits, _mm_i32gather_epi32(&g_goRiceBits[0][0], value, 4));
+      rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp));
+    } else {
+      const int pqBs[4] = {2, 2, 1, 1};
+      int64_t rd_costs[4] = {0, 0, 0, 0}; 
+      for (int i = 0; i < 4; i++) {
+        const int      state_offset = start + i;
+        const int      pqB = pqBs[i];
+        const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]];
+        if (pqDataA->absLevel[pqB] < 4) {
+          rd_costs[i] = state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]];
+        } else {
+          const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1;
+          rd_costs[i] += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
+        }
+      }
+      rd_cost_b =
+        _mm256_add_epi64(rd_cost_b, _mm256_loadu_si256((__m256i const *) & rd_costs[0]));
+    }
+
+    if (spt == SCAN_ISCSBB) {
+      __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]);
+      __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1);
+      __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1);
+      __m256i even = _mm256_permutevar8x32_epi32(original, even_mask);
+      __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask);
+      __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0));
+      __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64);
+      rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64);
+      rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64);
+    } else if (spt == SCAN_SOCSBB) {
+      __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]);
+      __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1);
+      __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1);
+      __m256i even = _mm256_permutevar8x32_epi32(original, even_mask);
+      __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask);
+      __m256i m_sigFracBits_0 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0));
+      __m256i m_sigFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
+
+      original = _mm256_loadu_si256((__m256i const*)state->m_sbbFracBits[start]);
+      odd = _mm256_permutevar8x32_epi32(original, odd_mask);
+      __m256i m_sbbFracBits_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
+
+      
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sbbFracBits_1);
+      rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sbbFracBits_1);
+      rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sbbFracBits_1);
+
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, m_sigFracBits_1);
+      rd_cost_b = _mm256_add_epi64(rd_cost_b, m_sigFracBits_1);
+      rd_cost_z = _mm256_add_epi64(rd_cost_z, m_sigFracBits_0);
+    }
+    else {
+      if (state->m_numSigSbb[start] && state->m_numSigSbb[start + 1] && state->m_numSigSbb[start + 2] && state->m_numSigSbb[start + 3]) {
+        __m256i original = _mm256_loadu_si256((__m256i const*)state->m_sigFracBits[start]);
+        __m256i even_mask = _mm256_setr_epi32(0, 2, 4, 6, -1, -1, -1, -1);
+        __m256i odd_mask = _mm256_setr_epi32(1, 3, 5, 7, -1, -1, -1, -1);
+        __m256i even = _mm256_permutevar8x32_epi32(original, even_mask);
+        __m256i odd = _mm256_permutevar8x32_epi32(original, odd_mask);
+        __m256i even_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(even, 0));
+        __m256i odd_64 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(odd, 0));
+        rd_cost_a = _mm256_add_epi64(rd_cost_a, odd_64);
+        rd_cost_b = _mm256_add_epi64(rd_cost_b, odd_64);
+        rd_cost_z = _mm256_add_epi64(rd_cost_z, even_64);     
+      }
+      else if (!state->m_numSigSbb[start] && !state->m_numSigSbb[start + 1] && !state->m_numSigSbb[start + 2] && !state->m_numSigSbb[start + 3]) {
+        rd_cost_z = _mm256_setr_epi64x(decisions->rdCost[0], decisions->rdCost[0], decisions->rdCost[3], decisions->rdCost[3]);
+      }
+
+      else {
+        const int ALIGNED(32) pqAs[4] = {0, 0, 3, 3};
+        _mm256_store_si256((__m256i*)temp_rd_cost_a, rd_cost_a);
+        _mm256_store_si256((__m256i*)temp_rd_cost_b, rd_cost_b);
+        _mm256_store_si256((__m256i*)temp_rd_cost_z, rd_cost_z);
+        for (int i = 0; i < 4; i++) {
+          const int state_offset = start + i;
+          if (state->m_numSigSbb[state_offset]) {
+            temp_rd_cost_a[i] += state->m_sigFracBits[state_offset][1];
+            temp_rd_cost_b[i] += state->m_sigFracBits[state_offset][1];
+            temp_rd_cost_z[i] += state->m_sigFracBits[state_offset][0];
+          } else {
+            temp_rd_cost_z[i] = decisions->rdCost[pqAs[i]];
+          }
+        }
+        rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a);
+        rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b);
+        rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z);
+      }
+    }
+  } else if (state->all_lt_four) {
+    __m128i scale_bits = _mm_set1_epi32(1 << SCALE_BITS);
+    __m128i max_rice = _mm_set1_epi32(31);
+    __m128i go_rice_zero = _mm_cvtepi8_epi32(_mm_loadu_si128((const __m128i*)&state->m_goRiceZero[start]));
+    // RD cost A
+    {
+      __m128i pq_abs_a = _mm_set_epi32(pqDataA->absLevel[3], pqDataA->absLevel[3], pqDataA->absLevel[0], pqDataA->absLevel[0]);
+      __m128i cmp = _mm_cmpgt_epi32(pq_abs_a, go_rice_zero);
+      
+      __m128i go_rice_smaller = _mm_min_epi32(pq_abs_a, max_rice);
+
+      __m128i other = _mm_sub_epi32(pq_abs_a, _mm_set1_epi32(1));
+
+      __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp);
+
+
+      __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
+      go_rice_offset = _mm_slli_epi32(go_rice_offset, 5);
+
+      __m128i offsets = _mm_add_epi32(selected, go_rice_offset);
+      __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4);
+      __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits);
+
+      rd_cost_a = _mm256_add_epi64(rd_cost_a, _mm256_cvtepi32_epi64(temp));
+    }
+    // RD cost b
+    {
+      __m128i pq_abs_b = _mm_set_epi32(pqDataA->absLevel[1], pqDataA->absLevel[1], pqDataA->absLevel[2], pqDataA->absLevel[2]);
+      __m128i cmp = _mm_cmpgt_epi32(pq_abs_b, go_rice_zero);
+
+      __m128i go_rice_smaller = _mm_min_epi32(pq_abs_b, max_rice);
+
+      __m128i other = _mm_sub_epi32(pq_abs_b, _mm_set1_epi32(1));
+
+      __m128i selected = _mm_blendv_epi8(other, go_rice_smaller, cmp);
+
+
+      __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
+      go_rice_offset = _mm_slli_epi32(go_rice_offset, 5);
+
+      __m128i offsets = _mm_add_epi32(selected, go_rice_offset);
+      __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], offsets, 4);
+      __m128i temp = _mm_add_epi32(go_rice_tab, scale_bits);
+
+      rd_cost_b = _mm256_add_epi64(rd_cost_b, _mm256_cvtepi32_epi64(temp));
+    }
+    // RD cost Z
+    {
+      __m128i go_rice_offset = _mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)&state->m_goRicePar[start]));
+      go_rice_offset = _mm_slli_epi32(go_rice_offset, 5);
+
+      go_rice_offset = _mm_add_epi32(go_rice_offset, go_rice_zero);
+      __m128i go_rice_tab = _mm_i32gather_epi32(&g_goRiceBits[0][0], go_rice_offset, 4);
+      rd_cost_z = _mm256_add_epi64(rd_cost_z, _mm256_cvtepi32_epi64(go_rice_tab));
+    }
+  } else {
+    const int pqAs[4] = {0, 0, 3, 3};
+    const int pqBs[4] = {2, 2, 1, 1};
+    const int decision_a[4] = {0, 2, 1, 3};
+    for (int i = 0; i < 4; i++) {
+      const int      state_offset = start + i;
+      const int32_t* goRiceTab = g_goRiceBits[state->m_goRicePar[state_offset]];
+      const int pqA = pqAs[i];
+      const int pqB = pqBs[i];
+      int64_t rdCostA = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqA];
+      int64_t rdCostB = state->m_rdCost[state_offset] + pqDataA->deltaDist[pqB];
+      int64_t rdCostZ = state->m_rdCost[state_offset];
+      if (state->m_remRegBins[state_offset] >= 4) {
+        if (pqDataA->absLevel[pqA] < 4) {
+          rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA]];
+        } else {
+          const coeff_t value = (pqDataA->absLevel[pqA] - 4) >> 1;
+          rdCostA += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqA] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
+        }
+        if (pqDataA->absLevel[pqB] < 4) {
+          rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB]];
+        } else {
+          const coeff_t value = (pqDataA->absLevel[pqB] - 4) >> 1;
+          rdCostB += state->m_coeffFracBits[state_offset][pqDataA->absLevel[pqB] - (value << 1)] + goRiceTab[value < RICEMAX ? value : RICEMAX - 1];
+        }
+        if (spt == SCAN_ISCSBB) {
+          rdCostA += state->m_sigFracBits[state_offset][1];
+          rdCostB += state->m_sigFracBits[state_offset][1];
+          rdCostZ += state->m_sigFracBits[state_offset][0];
+        } else if (spt == SCAN_SOCSBB) {
+          rdCostA += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1];
+          rdCostB += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][1];
+          rdCostZ += state->m_sbbFracBits[state_offset][1] + state->m_sigFracBits[state_offset][0];
+        } else if (state->m_numSigSbb[state_offset]) {
+          rdCostA += state->m_sigFracBits[state_offset][1];
+          rdCostB += state->m_sigFracBits[state_offset][1];
+          rdCostZ += state->m_sigFracBits[state_offset][0];
+        } else {
+          rdCostZ = decisions->rdCost[decision_a[i]];
+        }
+      } else {
+        rdCostA += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqA] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqA] - 1 : (pqDataA->absLevel[pqA] < RICEMAX ? pqDataA->absLevel[pqA] : RICEMAX - 1)];
+        rdCostB += (1 << SCALE_BITS) + goRiceTab[pqDataA->absLevel[pqB] <= state->m_goRiceZero[state_offset] ? pqDataA->absLevel[pqB] - 1 : (pqDataA->absLevel[pqB] < RICEMAX ? pqDataA->absLevel[pqB] : RICEMAX - 1)];
+        rdCostZ += goRiceTab[state->m_goRiceZero[state_offset]];
+      }
+      temp_rd_cost_a[i] = rdCostA;
+      temp_rd_cost_b[i] = rdCostB;
+      temp_rd_cost_z[i] = rdCostZ;
+    }
+    rd_cost_a = _mm256_loadu_si256((__m256i*)temp_rd_cost_a);
+    rd_cost_b = _mm256_loadu_si256((__m256i*)temp_rd_cost_b);
+    rd_cost_z = _mm256_loadu_si256((__m256i*)temp_rd_cost_z);
+  }
+  rd_cost_a = _mm256_permute4x64_epi64(rd_cost_a, 216);
+  rd_cost_b = _mm256_permute4x64_epi64(rd_cost_b, 141);
+  rd_cost_z = _mm256_permute4x64_epi64(rd_cost_z, 216);
+  __m256i rd_cost_decision = _mm256_load_si256((__m256i*)decisions->rdCost);
+
+  __m256i decision_abs_coeff = _mm256_load_si256((__m256i*)decisions->absLevel);
+  __m256i decision_prev_state = _mm256_load_si256((__m256i*)decisions->prevId);
+  __m256i decision_data = _mm256_permute2x128_si256(decision_abs_coeff, decision_prev_state, 0x20);
+  __m256i mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+  decision_data = _mm256_permutevar8x32_epi32(decision_data, mask);
+
+  __m256i a_data = _mm256_set_epi32(3, pqDataA->absLevel[3], 1, pqDataA->absLevel[0], 2, pqDataA->absLevel[3], 0, pqDataA->absLevel[0]);
+  __m256i b_data = _mm256_set_epi32(2, pqDataA->absLevel[1], 0, pqDataA->absLevel[2], 3, pqDataA->absLevel[1], 1, pqDataA->absLevel[2]);
+  __m256i z_data = _mm256_set_epi32(3, 0, 1, 0, 2, 0, 0, 0);
+
+  __m256i a_vs_b = _mm256_cmpgt_epi64(rd_cost_a, rd_cost_b);
+  __m256i cheaper_first = _mm256_blendv_epi8(rd_cost_a, rd_cost_b, a_vs_b);
+  __m256i cheaper_first_data = _mm256_blendv_epi8(a_data, b_data, a_vs_b);
+
+  __m256i z_vs_decision = _mm256_cmpgt_epi64(rd_cost_z, rd_cost_decision);
+  __m256i cheaper_second = _mm256_blendv_epi8(rd_cost_z, rd_cost_decision, z_vs_decision);
+  __m256i cheaper_second_data = _mm256_blendv_epi8(z_data, decision_data, z_vs_decision);
+
+  __m256i final_decision = _mm256_cmpgt_epi64(cheaper_first, cheaper_second);
+  __m256i final_rd_cost = _mm256_blendv_epi8(cheaper_first, cheaper_second, final_decision);
+  __m256i final_data = _mm256_blendv_epi8(cheaper_first_data, cheaper_second_data, final_decision);
+
+  _mm256_store_si256((__m256i*)decisions->rdCost, final_rd_cost);
+  final_data = _mm256_permutevar8x32_epi32(final_data, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+  _mm256_storeu2_m128i((__m128i *)decisions->prevId, (__m128i *)decisions->absLevel, final_data);
+}
+
+
+// Unconditionally fills decision slot `decision_id` with the skip candidate read from state `decision_id + skip_offset`.
+static INLINE void checkRdCostSkipSbbZeroOut(Decision* decision, const all_depquant_states* const state,
+                                             int decision_id, int skip_offset)
+{
+  const int     skip_state = decision_id + skip_offset;
+  // Cost of the source state plus its m_sbbFracBits[..][0] entry.
+  const int64_t skip_cost  = state->m_rdCost[skip_state] + state->m_sbbFracBits[skip_state][0];
+  decision->rdCost[decision_id]   = skip_cost;
+  decision->absLevel[decision_id] = 0;
+  decision->prevId[decision_id]   = 4 + state->m_stateId[skip_state]; // prevId >= 4 is resolved against m_skip_state_offset by the state update
+}
+
+
+// Replaces decision slot `decision_id` with the skip candidate from state `skip_offset + decision_id` only when it is strictly cheaper.
+static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
+{
+  const int     skip_state = skip_offset + decision_id;
+  const int64_t skip_cost  = state->m_rdCost[skip_state] + state->m_sbbFracBits[skip_state][0];
+  if (skip_cost >= decisions->rdCost[decision_id]) return; // the stored decision is at least as cheap: keep it
+  decisions->rdCost[decision_id]   = skip_cost;
+  decisions->absLevel[decision_id] = 0;
+  decisions->prevId[decision_id]   = 4 + state->m_stateId[skip_state];
+}
+
+// Candidate with no predecessor state (prevId = -1): cost = deltaDist + lastOffset + level bits; stored only if strictly cheaper.
+static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset, const PQData *pqData, Decision *decisions, int
+                                    decision_id)
+{
+  int64_t cost = pqData->deltaDist[decision_id] + lastOffset;
+  if (pqData->absLevel[decision_id] >= 4) {
+    // Levels of 4 and above: part of the level is costed through g_goRiceBits, with the table index capped at RICEMAX - 1.
+    const coeff_t rice_idx = (pqData->absLevel[decision_id] - 4) >> 1;
+    cost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (rice_idx << 1)]
+            + g_goRiceBits[state->m_goRicePar][rice_idx < RICEMAX ? rice_idx : RICEMAX - 1];
+  } else {
+    cost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
+  }
+  if (cost >= decisions->rdCost[decision_id]) return;
+  decisions->rdCost[decision_id]   = cost;
+  decisions->absLevel[decision_id] = pqData->absLevel[decision_id];
+  decisions->prevId[decision_id]   = -1;
+}
+
+// Fills pqData with four consecutive quantization candidates for the coefficient magnitude absCoeff.
+// The candidate with index q is written to slot (q & 3): deltaDist is its distortion term, absLevel is (q + 1) >> 1.
+static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff)
+{
+  const int64_t scaled_org = (int64_t)(absCoeff) * quanCoeff;
+  // First candidate index: limited to at most m_maxQIdx and then to at least 1.
+  coeff_t q_idx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaled_org + qp->m_QAdd) >> qp->m_QShift)));
+  int64_t scaled_add = q_idx * qp->m_DistStepAdd - scaled_org * qp->m_DistOrgFact;
+
+  for (int candidate = 0; candidate < 4; ++candidate) {
+    if (candidate > 0) {
+      // Each further candidate advances the distortion term by one m_DistStepAdd.
+      scaled_add += qp->m_DistStepAdd;
+    }
+    const int slot = q_idx & 3;
+    pqData->deltaDist[slot] = (scaled_add * q_idx + qp->m_DistAdd) >> qp->m_DistShift;
+    // Increment first: the stored level uses the incremented index, which is also the next candidate.
+    ++q_idx;
+    pqData->absLevel[slot] = q_idx >> 1;
+  }
+}
+
+
+static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2}, // Template that xDecide copies into `decisions` before testing candidates; every slot starts at a very large cost (presumably >> 2 leaves headroom for later additions — confirm).
+  .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} }; // Slots 0-3 start with prevId -2 / absLevel -1; slots 4-7 start with prevId 4..7 / absLevel 0.
+
+
+// Builds the decision table for one coefficient, starting from a copy of startDec.
+// zeroOut: only the skip candidates are written (and only at SCAN_EOCSBB) before returning.
+// Otherwise: pre-quantize absCoeff, run the vectorized RD check, then (at SCAN_EOCSBB) the conditional
+// skip candidates, and finally the start candidates for decision slots 0 and 2.
+static void xDecide(
+  all_depquant_states* const all_states,
+  depquant_state* const      m_startState,
+  quant_block *              qp,
+  const enum ScanPosType     spt,
+  const coeff_t              absCoeff,
+  const int                  lastOffset,
+  Decision*                  decisions,
+  bool                       zeroOut,
+  coeff_t                    quanCoeff,
+  const int                  skip_offset,
+  const int                  prev_offset)
+{
+  const bool end_of_sbb = (spt == SCAN_EOCSBB);
+  memcpy(decisions, &startDec, sizeof(Decision));
+
+  if (zeroOut) {
+    if (end_of_sbb) {
+      for (int slot = 0; slot < 4; ++slot) {
+        checkRdCostSkipSbbZeroOut(decisions, all_states, slot, skip_offset);
+      }
+    }
+    return;
+  }
+
+  PQData pqData;
+  preQuantCoeff(qp, absCoeff, &pqData, quanCoeff);
+  // A single call evaluates the states at prev_offset + 0..3 together.
+  check_rd_costs_avx2(all_states, spt, &pqData, decisions, prev_offset);
+  if (end_of_sbb) {
+    for (int slot = 0; slot < 4; ++slot) {
+      checkRdCostSkipSbb(all_states, decisions, slot, skip_offset);
+    }
+  }
+
+  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0);
+  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2);
+}
+
+
+// AVX2 end-of-sub-block state update handling the four decisions in `decisions` together.
+// If any prevId is below -1, the work is delegated to uvg_dep_quant_update_state_eos, one decision at a time.
+// Otherwise it (1) derives m_numSigSbb and the abs-level bytes from the previous / skip states, (2) refreshes
+// the common context (sbbFlags, levels) of the four current states, (3) rebuilds the template context init
+// values for the 16 scan positions starting at scan_pos - 16 and (4) selects m_sigFracBits / m_coeffFracBits.
+static void update_state_eos_avx2(context_store* ctxs, const uint32_t scan_pos, const uint32_t cg_pos,
+                                  const uint32_t sigCtxOffsetNext, const uint32_t gtxCtxOffsetNext,
+                                  const uint32_t width_in_sbb, const uint32_t height_in_sbb,
+                                  const uint32_t next_sbb_right, const uint32_t next_sbb_below,
+                                  const Decision* decisions)
+{
+  all_depquant_states* state = &ctxs->m_allStates;
+  bool all_above_minus_two = true;
+  bool all_between_zero_and_three = true;
+  bool all_above_four = true;
+
+
+  int state_offset = ctxs->m_curr_state_offset;
+  __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost);
+  _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
+  // Classify the four predecessors so that the uniform cases can be handled with vector code.
+  for (int i = 0; i < 4; ++i) {
+    all_above_minus_two &= decisions->prevId[i] > -2;
+    all_between_zero_and_three &= decisions->prevId[i] >= 0 && decisions->prevId[i] < 4;
+    all_above_four &= decisions->prevId[i] >= 4;
+  }
+  if (all_above_minus_two) {
+    bool all_have_previous_state = true;
+    __m128i prev_state;
+    __m128i prev_state_no_offset;
+    __m128i abs_level = _mm_load_si128((const __m128i*)decisions->absLevel);
+    if (all_above_four) {
+      // Every decision comes from a skip state: m_numSigSbb and the abs-level bytes start from zero.
+      prev_state = _mm_set1_epi32(ctxs->m_skip_state_offset);
+      prev_state_no_offset = _mm_sub_epi32(_mm_load_si128((const __m128i*)decisions->prevId), _mm_set1_epi32(4));
+      prev_state = _mm_add_epi32(
+        prev_state,
+            prev_state_no_offset
+      );
+      memset(&state->m_numSigSbb[state_offset], 0, 4);
+      for (int i = 0; i < 4; ++i) {
+        memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16 * sizeof(uint8_t));    
+      }
+    } else if (all_between_zero_and_three) {
+      // Every decision continues one of the states at m_prev_state_offset + prevId.
+      prev_state_no_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
+      prev_state = _mm_add_epi32(
+        prev_state_no_offset,
+        _mm_load_si128((const __m128i*)decisions->prevId)
+      );
+      __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+      // Byte-gather m_numSigSbb[prev_state]: index bytes forced to 0xff make the shuffle write zero there.
+      __m128i prev_state_with_ff_high_bytes = _mm_or_si128(prev_state, _mm_set1_epi32(0xffffff00));
+      __m128i num_sig_sbb = _mm_load_si128((const __m128i*)state->m_numSigSbb);
+      num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, prev_state_with_ff_high_bytes);
+      num_sig_sbb = _mm_add_epi32(
+        num_sig_sbb,
+        _mm_min_epi32(abs_level, _mm_set1_epi32(1))
+      );
+
+      num_sig_sbb = _mm_shuffle_epi8(num_sig_sbb, control);
+      int num_sig_sbb_s = _mm_extract_epi32(num_sig_sbb, 0);
+      memcpy(&state->m_numSigSbb[state_offset], &num_sig_sbb_s, 4);
+
+      int32_t prev_state_scalar[4];
+      _mm_storeu_si128((__m128i*)prev_state_scalar, prev_state);
+      for (int i = 0; i < 4; ++i) {
+        memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prev_state_scalar[i]], 16 * sizeof(uint8_t));
+      }
+    } else {
+      // Mixed predecessors: resolve each decision separately; -1 marks "no previous state".
+      int prev_state_s[4] = {-1, -1, -1, -1};
+      for (int i = 0; i < 4; ++i) {
+        const int decision_id = i;
+        const int curr_state_offset = state_offset + i;
+        if (decisions->prevId[decision_id] >= 4) {
+          prev_state_s[i] = ctxs->m_skip_state_offset + (decisions->prevId[decision_id] - 4);
+          state->m_numSigSbb[curr_state_offset] = 0;
+          memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t));
+        } else if (decisions->prevId[decision_id] >= 0) {
+          prev_state_s[i] = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
+          state->m_numSigSbb[curr_state_offset] = state->m_numSigSbb[prev_state_s[i]] + !!decisions->absLevel[decision_id];
+          memcpy(state->m_absLevelsAndCtxInit[curr_state_offset], state->m_absLevelsAndCtxInit[prev_state_s[i]], 16 * sizeof(uint8_t));
+        } else {
+          state->m_numSigSbb[curr_state_offset] = 1;
+          memset(state->m_absLevelsAndCtxInit[curr_state_offset], 0, 16 * sizeof(uint8_t));
+          all_have_previous_state = false;
+        }
+      }
+      prev_state = _mm_loadu_si128((__m128i const*)prev_state_s);
+    }
+    // Record the level of this scan position (capped at 32) in each state's abs-level bytes.
+    uint32_t level_offset = scan_pos & 15;
+    __m128i  max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
+    uint32_t max_abs_s[4];
+    _mm_storeu_si128((__m128i*)max_abs_s, max_abs);
+    for (int i = 0; i < 4; ++i) {
+      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i];
+      levels[level_offset] = max_abs_s[i];
+    }
+
+    // Update common context
+    // `last` holds the template ctx init of the latest id % 4 == 3 position (id 15 once the loop below is done).
+    // It is zero-initialized and reset for empty neighbourhoods so it is never read stale or uninitialized.
+    __m128i last = _mm_setzero_si128();
+    {
+      const uint32_t numSbb = width_in_sbb * height_in_sbb;
+      common_context* cc = &ctxs->m_common_context;
+      size_t         setCpSize = cc->m_nbInfo[scan_pos - 1].maxDist * sizeof(uint8_t);
+      int previous_state_array[4];
+      _mm_storeu_si128((__m128i*)previous_state_array, prev_state);
+      for (int curr_state = 0; curr_state < 4; ++curr_state) {
+        uint8_t* sbbFlags = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].sbbFlags;
+        uint8_t* levels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset + (curr_state)].levels;
+        const int p_state = previous_state_array[curr_state];
+        if (p_state != -1 && ctxs->m_allStates.m_refSbbCtxId[p_state] >= 0) {
+          const int prev_sbb = cc->m_prev_sbb_ctx_offset + ctxs->m_allStates.m_refSbbCtxId[p_state];
+          memcpy(sbbFlags, cc->m_allSbbCtx[prev_sbb].sbbFlags, numSbb * sizeof(uint8_t));
+          memcpy(levels + scan_pos, cc->m_allSbbCtx[prev_sbb].levels + scan_pos, setCpSize);
+        } else {
+          memset(sbbFlags, 0, numSbb * sizeof(uint8_t));
+          memset(levels + scan_pos, 0, setCpSize);
+        }
+        sbbFlags[cg_pos] = !!ctxs->m_allStates.m_numSigSbb[curr_state + state_offset];
+        memcpy(levels + scan_pos, ctxs->m_allStates.m_absLevelsAndCtxInit[curr_state + state_offset], 16 * sizeof(uint8_t));
+      }
+
+      // NOTE(review): the gathers index from state 0's sbbFlags in steps of numSbb bytes — confirm the buffers are laid out that way.
+      __m128i sbb_offsets = _mm_set_epi32(3 * numSbb, 2 * numSbb, 1 * numSbb, 0);
+      __m128i next_sbb_right_m = _mm_set1_epi32(next_sbb_right);
+      __m128i sbb_offsets_right = _mm_add_epi32(sbb_offsets, next_sbb_right_m);
+      __m128i sbb_right = next_sbb_right ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_right, 1) : _mm_set1_epi32(0);
+
+      __m128i sbb_offsets_below = _mm_add_epi32(sbb_offsets, _mm_set1_epi32(next_sbb_below));
+      __m128i sbb_below = next_sbb_below ? _mm_i32gather_epi32((const int *)cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].sbbFlags, sbb_offsets_below, 1) : _mm_set1_epi32(0);
+
+      __m128i sig_sbb = _mm_or_si128(sbb_right, sbb_below);
+      sig_sbb         = _mm_and_si128(sig_sbb, _mm_set1_epi32(0xff));
+      sig_sbb = _mm_min_epi32(sig_sbb, _mm_set1_epi32(1));
+      __m256i sbb_frac_bits = _mm256_i32gather_epi64((int64_t *)cc->m_sbbFlagBits[0], sig_sbb, 8);
+      _mm256_store_si256((__m256i*)state->m_sbbFracBits[state_offset], sbb_frac_bits);
+
+      // m_numSigSbb and m_goRicePar restart at zero for the four current states.
+      memset(&state->m_numSigSbb[state_offset], 0, 4);
+      memset(&state->m_goRicePar[state_offset], 0, 4);
+
+      uint8_t states[4] = {0, 1, 2, 3};
+      memcpy(&state->m_refSbbCtxId[state_offset], states, 4);
+      if (all_have_previous_state) {
+        __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prev_state, 4);
+        _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
+      } else {
+        const int temp = (state->effWidth * state->effHeight * 28) / 16;
+        for (int i = 0; i < 4; ++i) {
+          if (previous_state_array[i] != -1) {
+            state->m_remRegBins[i + state_offset] = state->m_remRegBins[previous_state_array[i]];
+          } else {
+            state->m_remRegBins[i + state_offset] = temp;
+          }
+        }
+      }
+
+      const int        scanBeg = scan_pos - 16;
+      const NbInfoOut* nbOut = cc->m_nbInfo + scanBeg;
+      const uint8_t*   absLevels = cc->m_allSbbCtx[cc->m_curr_sbb_ctx_offset].levels + scanBeg;
+
+      __m128i          levels_offsets = _mm_set_epi32(cc->num_coeff * 3, cc->num_coeff * 2, cc->num_coeff * 1, 0);
+      __m128i          first_byte = _mm_set1_epi32(0xff);
+      __m128i          ones = _mm_set1_epi32(1);
+      __m128i         fours = _mm_set1_epi32(4);
+      __m256i          all[4];
+      uint64_t         temp[4];
+      const __m256i v_shuffle = _mm256_set_epi8(15, 14,  7,  6, 13, 12,  5,  4, 11, 10,  3,  2,  9,  8,  1,  0,
+                                                31, 30, 23, 22, 29, 28, 21, 20, 27, 26, 19, 18, 25, 24, 17, 16);
+
+      // One template ctx init per position (16-bit, four states per position); every four positions fill one `all[]` vector.
+      for (int id = 0; id < 16; id++, nbOut++) {
+        if (nbOut->num == 0) {
+          temp[id % 4] = 0;
+          if (id % 4 == 3) {
+            all[id / 4] = _mm256_loadu_si256((__m256i const*)temp);
+            all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle);
+            last = _mm_setzero_si128(); // no neighbours: the template ctx init of this position is 0
+          }
+          continue;
+        }
+        __m128i sum_abs = _mm_set1_epi32(0);
+        __m128i sum_abs_1 = _mm_set1_epi32(0);
+        __m128i sum_num = _mm_set1_epi32(0);
+        switch (nbOut->num) {
+        case 5:
+          {
+            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[4]));
+            __m128i t = _mm_i32gather_epi32((const int *)absLevels, offset, 1);
+            t = _mm_and_si128(t, first_byte);
+            sum_abs = _mm_add_epi32(sum_abs, t);
+            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
+            __m128i min_t = _mm_min_epi32(t, _mm_add_epi32(fours, _mm_and_si128(t, ones)));
+            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
+          } // fallthrough
+        case 4: {
+            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[3]));
+            __m128i t     = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
+            t = _mm_and_si128(t, first_byte);
+            sum_abs = _mm_add_epi32(sum_abs, t);
+            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
+            __m128i min_t = _mm_min_epi32(t, _mm_add_epi32(fours, _mm_and_si128(t, ones)));
+            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
+        } // fallthrough
+        case 3: {
+            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[2]));
+            __m128i t     = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
+            t = _mm_and_si128(t, first_byte);
+            sum_abs = _mm_add_epi32(sum_abs, t);
+            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
+            __m128i min_t = _mm_min_epi32(t, _mm_add_epi32(fours, _mm_and_si128(t, ones)));
+            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
+        } // fallthrough
+        case 2: {
+            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[1]));
+            __m128i t     = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
+            t = _mm_and_si128(t, first_byte);
+            sum_abs = _mm_add_epi32(sum_abs, t);
+            sum_num   = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
+            __m128i min_t = _mm_min_epi32(t, _mm_add_epi32(fours, _mm_and_si128(t, ones)));
+            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
+        } // fallthrough
+        case 1: {
+            __m128i offset = _mm_add_epi32(levels_offsets, _mm_set1_epi32(nbOut->outPos[0]));
+            __m128i t = _mm_i32gather_epi32((const int*)absLevels, offset, 1);
+            t = _mm_and_si128(t, first_byte);
+            sum_abs = _mm_add_epi32(sum_abs, t);
+            sum_num = _mm_add_epi32(sum_num, _mm_min_epi32(t, ones));
+            __m128i min_t = _mm_min_epi32(t, _mm_add_epi32(fours, _mm_and_si128(t, ones)));
+            sum_abs_1 = _mm_add_epi32(sum_abs_1, min_t);
+        }
+            break;
+        default:
+          assert(0);
+        }
+        // Pack: sum_num in the low bits, sum_abs_1 shifted left by 3, min(sum_abs, 127) shifted left by 8.
+        sum_abs_1 = _mm_slli_epi32(sum_abs_1, 3);
+        sum_abs = _mm_slli_epi32(_mm_min_epi32(_mm_set1_epi32(127), sum_abs), 8);
+        __m128i template_ctx_init = _mm_add_epi32(sum_num, sum_abs);
+        template_ctx_init = _mm_add_epi32(template_ctx_init, sum_abs_1);
+        __m128i shuffle_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0);
+        __m128i shuffled_template_ctx_init = _mm_shuffle_epi8(template_ctx_init, shuffle_mask);
+        temp[id % 4] = _mm_extract_epi64(shuffled_template_ctx_init, 0);
+        if (id % 4 == 3) {
+          all[id / 4] = _mm256_loadu_si256((__m256i const*)temp);
+          all[id / 4] = _mm256_shuffle_epi8(all[id / 4], v_shuffle);
+          last = template_ctx_init;
+        }
+      }
+
+      // Rearrange the four vectors so that each stored 256-bit row holds the 16 values of a single state.
+      __m256i* v_src_tmp = all;
+
+      __m256i v_tmp[4];
+      v_tmp[0] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x20);
+      v_tmp[1] = _mm256_permute2x128_si256(v_src_tmp[0], v_src_tmp[1], 0x31);
+      v_tmp[2] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x20);
+      v_tmp[3] = _mm256_permute2x128_si256(v_src_tmp[2], v_src_tmp[3], 0x31);
+
+      __m256i v_tmp16_lo[2];
+      __m256i v_tmp16_hi[2];
+      v_tmp16_lo[0] = _mm256_unpacklo_epi32(v_tmp[0], v_tmp[1]);
+      v_tmp16_lo[1] = _mm256_unpacklo_epi32(v_tmp[2], v_tmp[3]);
+      v_tmp16_hi[0] = _mm256_unpackhi_epi32(v_tmp[0], v_tmp[1]);
+      v_tmp16_hi[1] = _mm256_unpackhi_epi32(v_tmp[2], v_tmp[3]);
+
+      v_tmp[0] = _mm256_permute4x64_epi64(v_tmp16_lo[0], _MM_SHUFFLE(3, 1, 2, 0));
+      v_tmp[1] = _mm256_permute4x64_epi64(v_tmp16_lo[1], _MM_SHUFFLE(3, 1, 2, 0));
+      v_tmp[2] = _mm256_permute4x64_epi64(v_tmp16_hi[0], _MM_SHUFFLE(3, 1, 2, 0));
+      v_tmp[3] = _mm256_permute4x64_epi64(v_tmp16_hi[1], _MM_SHUFFLE(3, 1, 2, 0));
+
+      // The destinations are four consecutive rows; update_states_avx2 copies 48 bytes per row, and with a 48-byte
+      // stride "row + 8" cannot be 32-byte aligned for every row, so the stores must be unaligned.
+      _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset] + 8),  _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x20));
+      _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 1] + 8),  _mm256_permute2x128_si256(v_tmp[0], v_tmp[1], 0x31));
+      _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 2] + 8),  _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x20));
+      _mm256_storeu_si256((__m256i*)(state->m_absLevelsAndCtxInit[state_offset + 3] + 8),  _mm256_permute2x128_si256(v_tmp[2], v_tmp[3], 0x31));
+
+      for (int i = 0; i < 4; ++i) {
+        memset(state->m_absLevelsAndCtxInit[state_offset + i], 0, 16);
+      }
+    }
+
+    // Unpack `last`: sum_num is the low 3 bits, sum_abs1 the 5 bits above them.
+    __m128i sum_num = _mm_and_si128(last, _mm_set1_epi32(7));
+    __m128i sum_abs1 = _mm_and_si128(
+      _mm_srli_epi32(last, 3),
+      _mm_set1_epi32(31));
+
+    __m128i sum_abs_min = _mm_min_epi32(
+      _mm_set1_epi32(3),
+      _mm_srli_epi32(
+        _mm_add_epi32(sum_abs1, _mm_set1_epi32(1)),
+        1));
+
+    __m128i offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0);
+    offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
+    offsets         = _mm_add_epi32(offsets, sum_abs_min);
+    __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)&state->m_sigFracBitsArray[state_offset][0][0], offsets, 8);
+    _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
+
+
+    __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num);
+    __m128i min_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
+    uint32_t sum_gt1_s[4];
+    _mm_storeu_si128((__m128i*)sum_gt1_s, min_gt1);
+    for (int i = 0; i < 4; ++i) {
+      memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i] + gtxCtxOffsetNext], sizeof(state->m_coeffFracBits[0]));
+    }
+  }
+  else {
+    for (int i = 0; i < 4; i++) {
+      uvg_dep_quant_update_state_eos(
+        ctxs,
+        scan_pos,
+        cg_pos,
+        sigCtxOffsetNext,
+        gtxCtxOffsetNext,
+        width_in_sbb,
+        height_in_sbb,
+        next_sbb_right,
+        next_sbb_below,
+        decisions,
+        i);
+    }
+  }
+}
+
+static INLINE void update_states_avx2(
+  context_store*  ctxs,
+  int             numIPos,
+  const uint32_t  scan_pos,
+  const Decision* decisions,
+  const uint32_t  sigCtxOffsetNext,
+  const uint32_t  gtxCtxOffsetNext,
+  const NbInfoSbb next_nb_info_ssb,
+  const int       baseLevel,
+  const bool      extRiceFlag)
+{
+  all_depquant_states* state = &ctxs->m_allStates;
+
+  bool all_non_negative = true;
+  bool all_above_minus_two = true;
+  bool all_minus_one = true;
+  for (int i = 0; i < 4; ++i) {
+    all_non_negative &= decisions->prevId[i] >= 0;
+    all_above_minus_two &= decisions->prevId[i] > -2;
+    all_minus_one &= decisions->prevId[i] == -1;
+  }
+  int state_offset = ctxs->m_curr_state_offset;
+  __m256i rd_cost = _mm256_load_si256((__m256i const*)decisions->rdCost);
+  _mm256_store_si256((__m256i *)& ctxs->m_allStates.m_rdCost[state_offset], rd_cost);
+  if (all_above_minus_two) {
+
+    bool    rem_reg_all_gte_4 = true;
+    bool    rem_reg_all_lt4 = true;
+
+    __m128i abs_level = _mm_load_si128((__m128i const*)decisions->absLevel);
+    if (all_non_negative) {
+      __m128i prv_states  = _mm_load_si128((__m128i const*)decisions->prevId);
+      __m128i prev_offset = _mm_set1_epi32(ctxs->m_prev_state_offset);
+      prv_states = _mm_add_epi32(prv_states, prev_offset);
+      __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+      __m128i shuffled_prev_states = _mm_shuffle_epi8(prv_states, control);
+      
+      __m128i sig_sbb   = _mm_load_si128((__m128i const*)state->m_numSigSbb);
+      sig_sbb = _mm_shuffle_epi8(sig_sbb, shuffled_prev_states);
+      __m128i has_coeff = _mm_min_epi32(abs_level, _mm_set1_epi32(1));
+      has_coeff         = _mm_shuffle_epi8(has_coeff, control);
+      sig_sbb           = _mm_or_si128(sig_sbb, has_coeff);
+      int sig_sbb_i = _mm_extract_epi32(sig_sbb, 0);
+      memcpy(&state->m_numSigSbb[state_offset], &sig_sbb_i, 4);
+      
+      __m128i ref_sbb_ctx_idx = _mm_load_si128((__m128i const*)state->m_refSbbCtxId);
+      ref_sbb_ctx_idx = _mm_shuffle_epi8(ref_sbb_ctx_idx, shuffled_prev_states);
+      int ref_sbb_ctx = _mm_extract_epi32(ref_sbb_ctx_idx, 0);
+      memcpy(&state->m_refSbbCtxId[state_offset], &ref_sbb_ctx, 4);
+      
+      __m128i go_rice_par = _mm_load_si128((__m128i const*)state->m_goRicePar);
+      go_rice_par = _mm_shuffle_epi8(go_rice_par, shuffled_prev_states);
+      int go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+      memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+
+      
+      __m256i sbb_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sbbFracBits[0], prv_states, 8);
+      _mm256_store_si256((__m256i*)&state->m_sbbFracBits[state_offset][0], sbb_frac_bits);
+
+      __m128i rem_reg_bins = _mm_i32gather_epi32(state->m_remRegBins, prv_states, 4);
+      __m128i ones = _mm_set1_epi32(1);
+      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, ones);
+
+      __m128i reg_bins_sub = _mm_set1_epi32(0);
+      __m128i abs_level_smaller_than_two = _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2));
+      __m128i secondary = _mm_blendv_epi8(_mm_set1_epi32(3), abs_level, abs_level_smaller_than_two);
+
+      __m128i rem_reg_bins_smaller_than_four = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+      reg_bins_sub = _mm_blendv_epi8(secondary, reg_bins_sub, rem_reg_bins_smaller_than_four);
+      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, reg_bins_sub);
+      _mm_store_si128((__m128i*)&state->m_remRegBins[state_offset], rem_reg_bins);
+
+      __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3)); 
+      int     bit_mask = _mm_movemask_epi8(mask);           
+      rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+      mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+      bit_mask = _mm_movemask_epi8(mask); 
+      rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+
+      int32_t prv_states_scalar[4];
+      _mm_storeu_si128((__m128i*)prv_states_scalar, prv_states);
+      for (int i = 0; i < 4; ++i) {
+        memcpy(state->m_absLevelsAndCtxInit[state_offset + i], state->m_absLevelsAndCtxInit[prv_states_scalar[i]], 48 * sizeof(uint8_t));        
+      }
+    }
+    else if (all_minus_one) {
+      memset(&state->m_numSigSbb[state_offset], 1, 4);
+      memset(&state->m_refSbbCtxId[state_offset], -1, 4);
+
+      const int a = (state->effWidth * state->effHeight * 28) / 16;
+
+      __m128i   rem_reg_bins = _mm_set1_epi32(a);
+      __m128i   sub = _mm_blendv_epi8(
+        _mm_set1_epi32(3),
+        abs_level,
+        _mm_cmplt_epi32(abs_level, _mm_set1_epi32(2))
+      );
+      rem_reg_bins = _mm_sub_epi32(rem_reg_bins, sub);
+      _mm_store_si128((__m128i*) & state->m_remRegBins[state_offset], rem_reg_bins);
+
+      __m128i mask = _mm_cmpgt_epi32(rem_reg_bins, _mm_set1_epi32(3));
+      int     bit_mask = _mm_movemask_epi8(mask);
+      rem_reg_all_gte_4 = (bit_mask == 0xFFFF);
+      mask = _mm_cmplt_epi32(rem_reg_bins, _mm_set1_epi32(4));
+      bit_mask = _mm_movemask_epi8(mask);
+      rem_reg_all_lt4 = (bit_mask == 0xFFFF);
+      
+      memset(state->m_absLevelsAndCtxInit[state_offset], 0, 48 * sizeof(uint8_t) * 4);
+      
+    }
+    else {
+      for (int i = 0; i< 4; ++i) {
+        const int decision_id = i;
+        const int state_id = state_offset + i;
+        if (decisions->prevId[decision_id] >= 0) {
+          const int prvState = ctxs->m_prev_state_offset + decisions->prevId[decision_id];
+          state->m_numSigSbb[state_id] = (state->m_numSigSbb[prvState]) || !!decisions->absLevel[decision_id];
+          state->m_refSbbCtxId[state_id] = state->m_refSbbCtxId[prvState];
+          state->m_sbbFracBits[state_id][0] = state->m_sbbFracBits[prvState][0];
+          state->m_sbbFracBits[state_id][1] = state->m_sbbFracBits[prvState][1];
+          state->m_remRegBins[state_id] = state->m_remRegBins[prvState] - 1;
+          state->m_goRicePar[state_id] = state->m_goRicePar[prvState];
+          if (state->m_remRegBins[state_id] >= 4) {
+            state->m_remRegBins[state_id] -= (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+          }
+          memcpy(state->m_absLevelsAndCtxInit[state_id], state->m_absLevelsAndCtxInit[prvState], 48 * sizeof(uint8_t));
+        } else {
+          state->m_numSigSbb[state_id] = 1;
+          state->m_refSbbCtxId[state_id] = -1;
+          int ctxBinSampleRatio = 28;
+          //(scanInfo.chType == CHANNEL_TYPE_LUMA) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
+          state->m_remRegBins[state_id] = (state->effWidth * state->effHeight * ctxBinSampleRatio) / 16 - (decisions->absLevel[decision_id] < 2 ? (unsigned)decisions->absLevel[decision_id] : 3);
+          memset(state->m_absLevelsAndCtxInit[state_id], 0, 48 * sizeof(uint8_t));
+        }
+        rem_reg_all_gte_4 &= state->m_remRegBins[state_id] >= 4;
+        rem_reg_all_lt4 &= state->m_remRegBins[state_id] < 4;
+      }
+    }
+    uint32_t level_offset = scan_pos & 15;
+    __m128i   max_abs = _mm_min_epi32(abs_level, _mm_set1_epi32(32));
+    uint32_t max_abs_s[4];
+    _mm_storeu_si128((__m128i*)max_abs_s, max_abs);
+    for (int i = 0; i < 4; ++i) {
+      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset + i];
+      levels[level_offset] = max_abs_s[i];
+    }
+    state->all_gte_four = rem_reg_all_gte_4;
+    state->all_lt_four = rem_reg_all_lt4;
+    if (rem_reg_all_gte_4) {
+      const __m128i  first_two_bytes = _mm_set1_epi32(0xffff);
+      const __m128i  first_byte = _mm_set1_epi32(0xff);
+      const __m128i  ones = _mm_set1_epi32(1);
+      const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8;
+      const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0);
+      const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1);
+      __m128i        tinit = _mm_i32gather_epi32(
+        (int *)state->m_absLevelsAndCtxInit[state_offset],
+        _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)),
+        2);
+      tinit = _mm_and_si128(tinit, first_two_bytes);
+      __m128i sum_abs1 = _mm_and_si128(_mm_srli_epi32(tinit, 3), _mm_set1_epi32(31));
+      __m128i sum_num = _mm_and_si128(tinit, _mm_set1_epi32(7));
+
+      uint8_t* levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset];
+      switch (numIPos) {
+      case 5:
+        {
+          __m128i t = _mm_i32gather_epi32(
+            (int *)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
+            1);
+          t = _mm_and_si128(t, first_byte);
+          __m128i min_arg = _mm_min_epi32(
+            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+            t
+          );
+          sum_abs1 = _mm_add_epi32(
+            sum_abs1,
+            min_arg
+          );
+          sum_num = _mm_add_epi32(
+            sum_num,
+            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+        }
+      case 4:
+        {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
+            1);
+          t = _mm_and_si128(t, first_byte);
+          __m128i min_arg = _mm_min_epi32(
+            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+            t
+          );
+          sum_abs1 = _mm_add_epi32(
+            sum_abs1,
+            min_arg
+          );
+          sum_num = _mm_add_epi32(
+            sum_num,
+            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+        }
+      case 3:
+        {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
+            1);
+          t = _mm_and_si128(t, first_byte);
+          __m128i min_arg = _mm_min_epi32(
+            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+            t
+          );
+          sum_abs1 = _mm_add_epi32(
+            sum_abs1,
+            min_arg
+          );
+          sum_num = _mm_add_epi32(
+            sum_num,
+            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+        }
+      case 2:
+        {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
+            1);
+          t = _mm_and_si128(t, first_byte);
+        __m128i min_arg = _mm_min_epi32(
+              _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+              t
+            );
+          sum_abs1 = _mm_add_epi32(
+            sum_abs1,
+            min_arg
+          );
+          sum_num = _mm_add_epi32(
+            sum_num,
+            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+        }
+      case 1: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
+            1);
+          t = _mm_and_si128(t, first_byte);
+          __m128i min_arg = _mm_min_epi32(
+            _mm_add_epi32(_mm_set1_epi32(4), _mm_and_si128(t, ones)),
+            t
+          );
+          sum_abs1 = _mm_add_epi32(
+            sum_abs1,
+            min_arg
+            );
+          sum_num = _mm_add_epi32(
+            sum_num,
+            _mm_min_epi32(_mm_and_si128(t, first_byte), ones));
+        } break;
+      default:
+          assert(0);
+      }
+      __m128i sum_gt1 = _mm_sub_epi32(sum_abs1, sum_num);
+      __m128i  offsets = _mm_set_epi32(12 * 3, 12 * 2, 12 * 1, 12 * 0);
+      offsets = _mm_add_epi32(offsets, _mm_set1_epi32(sigCtxOffsetNext));
+      __m128i temp = _mm_min_epi32(
+        _mm_srli_epi32(_mm_add_epi32(sum_abs1, ones), 1),
+        _mm_set1_epi32(3));
+      offsets = _mm_add_epi32(offsets, temp);
+      __m256i sig_frac_bits = _mm256_i32gather_epi64((const int64_t *)state->m_sigFracBitsArray[state_offset][0], offsets, 8);
+      _mm256_store_si256((__m256i*)&state->m_sigFracBits[state_offset][0], sig_frac_bits);
+
+      sum_gt1 = _mm_min_epi32(sum_gt1, _mm_set1_epi32(4));
+      sum_gt1 = _mm_add_epi32(sum_gt1, _mm_set1_epi32(gtxCtxOffsetNext));
+      uint32_t sum_gt1_s[4];
+      _mm_storeu_si128((__m128i*)sum_gt1_s, sum_gt1);
+      for (int i = 0; i < 4; ++i) {
+        memcpy(state->m_coeffFracBits[state_offset + i], state->m_gtxFracBitsArray[sum_gt1_s[i]], sizeof(state->m_coeffFracBits[0]));
+      }
+
+      __m128i sum_abs = _mm_srli_epi32(tinit, 8);
+      sum_abs = _mm_min_epi32(sum_abs, _mm_set1_epi32(32));
+      switch (numIPos) {
+        case 5:
+          {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
+            1);
+          sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 4:
+          {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
+            1);
+          sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 3:
+          {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
+            1);
+          sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 2:
+          {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
+            1);
+          sum_abs = _mm_add_epi32(t, sum_abs);
+          }
+        case 1:
+          {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
+            1);
+          sum_abs = _mm_add_epi32(t, sum_abs);
+          } break;
+        default:
+          assert(0);
+      }
+      sum_abs = _mm_and_si128(sum_abs, first_byte);
+      if (extRiceFlag) {
+        assert(0 && "Not implemented for avx2");
+      } else {
+        __m128i sum_all = _mm_max_epi32(
+          _mm_min_epi32(
+            _mm_set1_epi32(31),
+            _mm_sub_epi32(sum_abs, _mm_set1_epi32(20))),
+          _mm_set1_epi32(0));
+        __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4);
+        __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        __m128i go_rice_par = _mm_shuffle_epi8(temp, control);
+        int     go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+        memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+      }
+    }
+
+    else if (rem_reg_all_lt4) {
+      uint8_t*       levels = (uint8_t*)state->m_absLevelsAndCtxInit[state_offset];
+      const __m128i  last_two_bytes = _mm_set1_epi32(0xffff);
+      const __m128i  last_byte = _mm_set1_epi32(0xff);
+      const uint32_t tinit_offset = MIN(level_offset - 1u, 15u) + 8;
+      const __m128i levels_start_offsets = _mm_set_epi32(48 * 3, 48 * 2, 48 * 1, 48 * 0);
+      const __m128i ctx_start_offsets = _mm_srli_epi32(levels_start_offsets, 1);
+      __m128i       tinit = _mm_i32gather_epi32(
+        (int*)state->m_absLevelsAndCtxInit[state_offset],
+        _mm_add_epi32(ctx_start_offsets, _mm_set1_epi32(tinit_offset)),
+        2);
+      tinit = _mm_and_si128(tinit, last_two_bytes);
+      __m128i sum_abs = _mm_srli_epi32(tinit, 8);
+      switch (numIPos) {
+        case 5: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[4])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 4: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[3])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 3: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[2])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 2: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[1])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        }
+        case 1: {
+          __m128i t = _mm_i32gather_epi32(
+            (int*)levels,
+            _mm_add_epi32(levels_start_offsets, _mm_set1_epi32(next_nb_info_ssb.inPos[0])),
+            1);
+          t = _mm_and_si128(t, last_byte);
+          sum_abs = _mm_add_epi32(sum_abs, t);
+        } break;
+        default:
+          assert(0);
+      }
+      if (extRiceFlag) {
+        assert(0 && "Not implemented for avx2");
+      } else {
+        __m128i sum_all = _mm_min_epi32(_mm_set1_epi32(31), sum_abs);
+        __m128i temp = _mm_i32gather_epi32(g_goRiceParsCoeff, sum_all, 4);
+        __m128i control = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+        __m128i go_rice_par = _mm_shuffle_epi8(temp, control);
+        int     go_rice_par_i = _mm_extract_epi32(go_rice_par, 0);
+        memcpy(&state->m_goRicePar[state_offset], &go_rice_par_i, 4);
+
+        
+        for (int i = 0; i < 4; ++i) {
+          state->m_goRiceZero[state_offset + i] = (i < 2 ? 1 : 2) << state->m_goRicePar[state_offset + i];
+          
+        }
+
+      }
+
+    }
+    else {
+      for (int i = 0; i < 4; ++i) {
+        const int state_id = state_offset + i;
+        uint8_t*  levels = (uint8_t*)(state->m_absLevelsAndCtxInit[state_id]);
+        if (state->m_remRegBins[state_id] >= 4) {
+          coeff_t tinit = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)];
+          coeff_t sumAbs1 = (tinit >> 3) & 31;
+          coeff_t sumNum = tinit & 7;
+#define UPDATE(k)                                  \
+  {                                                \
+    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
+    sumAbs1 += MIN(4 + (t & 1), t);                \
+    sumNum += !!t;                                 \
+  }
+          switch (numIPos) {
+            case 5: UPDATE(4);
+            case 4: UPDATE(3);
+            case 3: UPDATE(2);
+            case 2: UPDATE(1);
+            case 1: UPDATE(0); break;
+            default: assert(0);
+          }
+#undef UPDATE
+          coeff_t sumGt1 = sumAbs1 - sumNum;
+          state->m_sigFracBits[state_id][0] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][0];
+          state->m_sigFracBits[state_id][1] = state->m_sigFracBitsArray[state_id][sigCtxOffsetNext + MIN((sumAbs1 + 1) >> 1, 3)][1];
+          memcpy(state->m_coeffFracBits[state_id], state->m_gtxFracBitsArray[gtxCtxOffsetNext + (sumGt1 < 4 ? sumGt1 : 4)], sizeof(state->m_coeffFracBits[0]));
+
+
+          coeff_t sumAbs = state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)] >> 8;
+#define UPDATE(k)                                  \
+  {                                                \
+    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
+    sumAbs += t;                                   \
+  }
+          switch (numIPos) {
+            case 5: UPDATE(4);
+            case 4: UPDATE(3);
+            case 3: UPDATE(2);
+            case 2: UPDATE(1);
+            case 1: UPDATE(0); break;
+            default: assert(0);
+          }
+#undef UPDATE
+          if (extRiceFlag) {
+            assert(0 && "Not implemented for avx2");
+          } else {
+            int sumAll = MAX(MIN(31, (int)sumAbs - 4 * 5), 0);
+            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAll];
+          }
+        } else {
+          coeff_t sumAbs = (state->m_absLevelsAndCtxInit[state_id][8 + ((scan_pos - 1) & 15)]) >> 8;
+#define UPDATE(k)                                  \
+  {                                                \
+    coeff_t t = levels[next_nb_info_ssb.inPos[k]]; \
+    sumAbs += t;                                   \
+  }
+          switch (numIPos) {
+            case 5: UPDATE(4);
+            case 4: UPDATE(3);
+            case 3: UPDATE(2);
+            case 2: UPDATE(1);
+            case 1: UPDATE(0); break;
+            default: assert(0);
+          }
+#undef UPDATE
+          if (extRiceFlag) {
+            assert(0 && "Not implemented for avx2");
+          } else {
+            sumAbs = MIN(31, sumAbs);
+            state->m_goRicePar[state_id] = g_goRiceParsCoeff[sumAbs];
+          }
+          state->m_goRiceZero[state_id] = ((state_id & 3) < 2 ? 1 : 2) << state->m_goRicePar[state_id];
+        }
+      }
+    }
+  } else {
+    for (int i = 0; i < 4; ++i) {
+      state->all_gte_four = true;
+      state->all_lt_four = true;
+      uvg_dep_quant_update_state(
+        ctxs,
+        numIPos,
+        scan_pos,
+        decisions,
+        sigCtxOffsetNext,
+        gtxCtxOffsetNext,
+        next_nb_info_ssb,
+        baseLevel,
+        extRiceFlag,
+        i);
+    }
+  }
+}
+
+/** AVX2 strategy for "dep_quant_decide_and_update": runs xDecide for one scan position and then
+ *  updates the states (end-of-16 update when scan_pos % 16 == 0, otherwise unless zeroOut). */
+void uvg_dep_quant_decide_and_update_avx2(
+  rate_estimator_t*                         re,
+  context_store*                          ctxs,
+  struct dep_quant_scan_info const* const scan_info,
+  const coeff_t                           absCoeff,
+  const uint32_t                          scan_pos,
+  const uint32_t                          width_in_sbb,
+  const uint32_t                          height_in_sbb,
+  const NbInfoSbb                         next_nb_info_ssb,
+  bool                                    zeroOut,
+  coeff_t                                 quantCoeff,
+  const uint32_t                          effWidth,
+  const uint32_t                          effHeight,
+  bool                                    is_chroma)
+{
+  Decision* decisions = &ctxs->m_trellis[scan_pos];
+  SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int);
+
+  const uint32_t pos_in_sbb = scan_pos & 15;
+  const uint32_t num_coeffs = effHeight * effWidth;
+  enum ScanPosType spt = 0;
+  if (pos_in_sbb == 15 && scan_pos > 16 && scan_pos < num_coeffs - 1) {
+    spt = SCAN_SOCSBB;
+  } else if (pos_in_sbb == 0 && scan_pos > 0 && scan_pos < num_coeffs - 16) {
+    spt = SCAN_EOCSBB;
+  }
+
+  xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff,
+          re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y],
+          decisions, zeroOut, quantCoeff, ctxs->m_skip_state_offset, ctxs->m_prev_state_offset);
+
+  if (scan_pos == 0) { return; } // scan position 0 gets no state update
+  if (pos_in_sbb == 0) {
+    SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int);
+    update_state_eos_avx2(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions);
+    // Duplicate decision entries 0..3 into entries 4..7.
+    memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t));
+    memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t));
+    memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t));
+  } else if (!zeroOut) {
+    update_states_avx2(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false);
+  }
+  if (spt == SCAN_SOCSBB) { SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); }
+}
+
+
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+
+// Registers the AVX2 "dep_quant_decide_and_update" strategy when AVX2 support is compiled in.
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
+{
+  bool all_registered = true; // stays true when the block below is compiled out
+#if COMPILE_INTEL_AVX2 && defined X86_64
+  all_registered &= uvg_strategyselector_register(
+    opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2);
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
+  return all_registered;
+}
diff --git a/src/strategies/avx2/depquant-avx2.h b/src/strategies/avx2/depquant-avx2.h
new file mode 100644
index 00000000..e6db110c
--- /dev/null
+++ b/src/strategies/avx2/depquant-avx2.h
@@ -0,0 +1,46 @@
+#ifndef STRATEGIES_DEPQUANT_AVX2_H_
+#define STRATEGIES_DEPQUANT_AVX2_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Optimizations for AVX2.
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+/** Registers the AVX2 dep-quant strategy with the selector passed as \p opaque; returns the registration status. */
+int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_DEPQUANT_AVX2_H_
diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c
new file mode 100644
index 00000000..aa2ea99e
--- /dev/null
+++ b/src/strategies/generic/depquant-generic.c
@@ -0,0 +1,238 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+#include "strategies/generic/depquant-generic.h"
+
+#include "dep_quant.h"
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "intra.h"
+#include "rdo.h"
+#include "strategyselector.h"
+#include "transform.h"
+#include "uvg_math.h"
+#include "generic/quant-generic.h"
+static const int32_t g_goRiceBits[4][RICEMAX] = { // [m_goRicePar][value clamped to RICEMAX - 1]; added to the RD cost in checkRdCostStart
+  {32768,  65536,  98304,  131072, 163840, 196608, 262144, 262144, // row 0 (m_goRicePar == 0)
+   327680, 327680, 327680, 327680, 393216, 393216, 393216, 393216,
+   393216, 393216, 393216, 393216, 458752, 458752, 458752, 458752,
+   458752, 458752, 458752, 458752, 458752, 458752, 458752, 458752},
+  {65536,  65536,  98304,  98304,  131072, 131072, 163840, 163840, // row 1 (m_goRicePar == 1)
+   196608, 196608, 229376, 229376, 294912, 294912, 294912, 294912,
+   360448, 360448, 360448, 360448, 360448, 360448, 360448, 360448,
+   425984, 425984, 425984, 425984, 425984, 425984, 425984, 425984},
+  {98304,  98304,  98304,  98304,  131072, 131072, 131072, 131072, // row 2 (m_goRicePar == 2)
+   163840, 163840, 163840, 163840, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 262144, 262144, 262144, 262144,
+   327680, 327680, 327680, 327680, 327680, 327680, 327680, 327680},
+  {131072, 131072, 131072, 131072, 131072, 131072, 131072, 131072, // row 3 (m_goRicePar == 3)
+   163840, 163840, 163840, 163840, 163840, 163840, 163840, 163840,
+   196608, 196608, 196608, 196608, 196608, 196608, 196608, 196608,
+   229376, 229376, 229376, 229376, 229376, 229376, 229376, 229376},
+}; // every entry is a multiple of 32768 (1 << 15); presumably bit counts in 15-bit fixed point - TODO confirm
+
+
+/** Zero-out variant of the skip check: unconditionally overwrites decision entry \p decision_id with
+ *  the candidate taken from state index decision_id + skip_offset (absLevel 0, prevId 4 + m_stateId). */
+static INLINE void checkRdCostSkipSbbZeroOut(Decision* decision, const all_depquant_states* const state,
+                                             int decision_id, int skip_offset) {
+  const int     skip_id = decision_id + skip_offset;
+  const int64_t cost = state->m_rdCost[skip_id] + state->m_sbbFracBits[skip_id][0];
+  decision->prevId[decision_id] = 4 + state->m_stateId[skip_id];
+  decision->absLevel[decision_id] = 0;
+  decision->rdCost[decision_id] = cost;
+}
+
+/** Replaces decision entry \p decision_id with the skip candidate (absLevel 0) only when that candidate is strictly cheaper. */
+static INLINE void checkRdCostSkipSbb(const all_depquant_states* const state, Decision * decisions, int decision_id, int skip_offset)
+{
+  const int     skip_id = skip_offset + decision_id;
+  const int64_t cost = state->m_rdCost[skip_id] + state->m_sbbFracBits[skip_id][0];
+  if (cost >= decisions->rdCost[decision_id]) { return; }
+  decisions->rdCost[decision_id] = cost;
+  decisions->absLevel[decision_id] = 0;
+  decisions->prevId[decision_id] = 4 + state->m_stateId[skip_offset + decision_id];
+}
+
+/** Evaluates the candidate that uses the start state (stored with prevId = -1): its cost is
+ *  deltaDist + lastOffset + the level bits from \p state; it is kept only if strictly cheaper. */
+static INLINE void checkRdCostStart(const depquant_state* const state, int32_t lastOffset,
+                                    const PQData *pqData, Decision *decisions, int decision_id)
+{
+  int64_t cost = pqData->deltaDist[decision_id] + lastOffset;
+  if (pqData->absLevel[decision_id] >= 4) {
+    const coeff_t rest = (pqData->absLevel[decision_id] - 4) >> 1; // index below is 4 or 5 (parity of absLevel)
+    cost += state->m_coeffFracBits[pqData->absLevel[decision_id] - (rest << 1)]
+            + g_goRiceBits[state->m_goRicePar][rest < RICEMAX ? rest : RICEMAX - 1];
+  } else {
+    cost += state->m_coeffFracBits[pqData->absLevel[decision_id]];
+  }
+  if (cost >= decisions->rdCost[decision_id]) { return; }
+  decisions->rdCost[decision_id] = cost;
+  decisions->absLevel[decision_id] = pqData->absLevel[decision_id];
+  decisions->prevId[decision_id] = -1;
+}
+
+
+// Template that xDecide copies into each Decision first: rdCost is INT64_MAX >> 2 in every entry, so the `<` cost checks can replace it.
+static const Decision startDec = { .rdCost = {INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2, INT64_MAX >> 2},
+  .absLevel = {-1, -1, -1, -1, 0, 0, 0, 0}, .prevId = {-2, -2, -2, -2, 4, 5, 6, 7} };
+
+/**
+ * Fills all four slots of \p pqData with candidates for four consecutive quantization indices.
+ *
+ * The first index is (absCoeff * quanCoeff + m_QAdd) >> m_QShift, capped at m_maxQIdx and then
+ * raised to at least 1. An index q lands in slot (q & 3), so the four consecutive indices use
+ * every slot exactly once. Per index: absLevel = (q + 1) >> 1 and
+ * deltaDist = (dist_scale * q + m_DistAdd) >> m_DistShift, where dist_scale starts at
+ * q0 * m_DistStepAdd - scaled * m_DistOrgFact and grows by m_DistStepAdd for each later index.
+ */
+static INLINE void preQuantCoeff(const quant_block * const qp, const coeff_t absCoeff, PQData* pqData, coeff_t quanCoeff)
+{
+  const int64_t scaled = (int64_t)(absCoeff) * quanCoeff;
+  coeff_t       q_idx = MAX(1, (coeff_t)MIN(qp->m_maxQIdx, ((scaled + qp->m_QAdd) >> qp->m_QShift)));
+  int64_t       dist_scale = q_idx * qp->m_DistStepAdd - scaled * qp->m_DistOrgFact;
+  for (int n = 0; n < 4; ++n) {
+    if (n > 0) { dist_scale += qp->m_DistStepAdd; }
+    const int slot = q_idx & 3;
+    pqData->deltaDist[slot] = (dist_scale * q_idx + qp->m_DistAdd) >> qp->m_DistShift;
+    pqData->absLevel[slot] = (++q_idx) >> 1;
+  }
+}
+
+/** Resets \p decisions to startDec, then keeps the cheapest candidate for each decision entry.
+ *  With zeroOut, only the skip candidates are taken (and only at SCAN_EOCSBB); otherwise the
+ *  quantization candidates, the skip candidates and the start-state candidates compete. */
+static void xDecide(
+  all_depquant_states* const all_states,
+  depquant_state* const      m_startState,
+  quant_block*               qp,
+  const enum ScanPosType     spt,
+  const coeff_t              absCoeff,
+  const int                  lastOffset,
+  Decision*                  decisions,
+  bool                       zeroOut,
+  coeff_t                    quanCoeff,
+  const int                  skip_offset,
+  const int                  prev_offset)
+{
+  memcpy(decisions, &startDec, sizeof *decisions);
+
+  if (zeroOut) {
+    if (spt == SCAN_EOCSBB) {
+      for (int id = 0; id < 4; ++id) {
+        checkRdCostSkipSbbZeroOut(decisions, all_states, id, skip_offset);
+      }
+    }
+    return;
+  }
+
+  PQData pqData;
+  preQuantCoeff(qp, absCoeff, &pqData, quanCoeff);
+  static const int decision_pairs[4][2] = { {0, 2}, {2, 0}, {1, 3}, {3, 1} }; // pairs for state prev_offset + 0..3
+  for (int s = 0; s < 4; ++s) {
+    uvg_dep_quant_check_rd_costs(all_states, spt, &pqData, decisions, decision_pairs[s][0], decision_pairs[s][1], prev_offset + s);
+  }
+  if (spt == SCAN_EOCSBB) {
+    for (int id = 0; id < 4; ++id) {
+      checkRdCostSkipSbb(all_states, decisions, id, skip_offset);
+    }
+  }
+  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 0);
+  checkRdCostStart(m_startState, lastOffset, &pqData, decisions, 2);
+}
+
+
+/** Generic strategy for "dep_quant_decide_and_update": runs xDecide for one scan position and
+ *  then updates each of the four decision states in turn (end-of-16 update when
+ *  scan_pos % 16 == 0, regular update otherwise unless zeroOut is set). */
+static void uvg_dep_quant_decide_and_update_generic(
+  rate_estimator_t*                         re,
+  context_store*                          ctxs,
+  struct dep_quant_scan_info const* const scan_info,
+  const coeff_t                           absCoeff,
+  const uint32_t                          scan_pos,
+  const uint32_t                          width_in_sbb,
+  const uint32_t                          height_in_sbb,
+  const NbInfoSbb                         next_nb_info_ssb,
+  bool                                    zeroOut,
+  coeff_t                                 quantCoeff,
+  const uint32_t                          effWidth,
+  const uint32_t                          effHeight,
+  bool                                    is_chroma)
+{
+  Decision* decisions = &ctxs->m_trellis[scan_pos];
+  SWAP(ctxs->m_curr_state_offset, ctxs->m_prev_state_offset, int);
+
+  const uint32_t pos_in_sbb = scan_pos & 15;
+  const uint32_t num_coeffs = effHeight * effWidth;
+  enum ScanPosType spt = 0;
+  if (pos_in_sbb == 15 && scan_pos > 16 && scan_pos < num_coeffs - 1) {
+    spt = SCAN_SOCSBB;
+  } else if (pos_in_sbb == 0 && scan_pos > 0 && scan_pos < num_coeffs - 16) {
+    spt = SCAN_EOCSBB;
+  }
+
+  xDecide(&ctxs->m_allStates, &ctxs->m_startState, ctxs->m_quant, spt, absCoeff,
+          re->m_lastBitsX[scan_info->pos_x] + re->m_lastBitsY[scan_info->pos_y],
+          decisions, zeroOut, quantCoeff, ctxs->m_skip_state_offset, ctxs->m_prev_state_offset);
+
+  if (scan_pos == 0) { return; } // scan position 0 gets no state update
+  if (pos_in_sbb == 0) {
+    SWAP(ctxs->m_common_context.m_curr_sbb_ctx_offset, ctxs->m_common_context.m_prev_sbb_ctx_offset, int);
+    for (int decision_id = 0; decision_id < 4; ++decision_id) {
+      uvg_dep_quant_update_state_eos(ctxs, scan_pos, scan_info->cg_pos, scan_info->sig_ctx_offset[is_chroma], scan_info->gtx_ctx_offset[is_chroma],
+                                     width_in_sbb, height_in_sbb, scan_info->next_sbb_right, scan_info->next_sbb_below, decisions, decision_id);
+    }
+    memcpy(decisions->prevId + 4, decisions->prevId, 4 * sizeof(int32_t));
+    memcpy(decisions->absLevel + 4, decisions->absLevel, 4 * sizeof(int32_t));
+    memcpy(decisions->rdCost + 4, decisions->rdCost, 4 * sizeof(int64_t));
+  } else if (!zeroOut) {
+    for (int decision_id = 0; decision_id < 4; ++decision_id) {
+      uvg_dep_quant_update_state(ctxs, next_nb_info_ssb.num, scan_pos, decisions, scan_info->sig_ctx_offset[is_chroma],
+                                 scan_info->gtx_ctx_offset[is_chroma], next_nb_info_ssb, 4, false, decision_id);
+    }
+  }
+  if (spt == SCAN_SOCSBB) { SWAP(ctxs->m_skip_state_offset, ctxs->m_prev_state_offset, int); }
+}
+
+
+/** Registers the generic "dep_quant_decide_and_update" strategy with the selector passed as \p opaque. */
+int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
+{
+  // bitdepth is not consulted here.
+  bool all_registered = true;
+  all_registered &= uvg_strategyselector_register(
+    opaque, "dep_quant_decide_and_update", "generic", 40, &uvg_dep_quant_decide_and_update_generic);
+  return all_registered;
+}
diff --git a/src/strategies/generic/depquant-generic.h b/src/strategies/generic/depquant-generic.h
new file mode 100644
index 00000000..488963be
--- /dev/null
+++ b/src/strategies/generic/depquant-generic.h
@@ -0,0 +1,50 @@
+#ifndef STRATEGIES_DEPQUANT_GENERIC_H_
+#define STRATEGIES_DEPQUANT_GENERIC_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Generic C implementations of optimized functions.
+ */
+
+#include "cu.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "uvg266.h"
+#include "tables.h"
+
+// Registers the generic dep quant implementation; called from uvg_strategy_register_depquant.
+int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth);
+
+#endif //STRATEGIES_DEPQUANT_GENERIC_H_
diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c
new file mode 100644
index 00000000..7ba62163
--- /dev/null
+++ b/src/strategies/strategies-depquant.c
@@ -0,0 +1,54 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+#include "strategies/strategies-depquant.h"
+
+#include "strategies/avx2/depquant-avx2.h"
+#include "strategies/generic/depquant-generic.h"
+#include "strategyselector.h"
+
+
+// Define function pointers; the strategy selector assigns them through STRATEGIES_DEPQUANT_EXPORTS.
+dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
+
+
+int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth)
+{
+  // The generic C version is registered unconditionally.
+  bool all_ok = true;
+  all_ok &= uvg_strategy_register_depquant_generic(opaque, bitdepth);
+  // The AVX2 version is added only when the hardware flags report AVX2.
+  if (uvg_g_hardware_flags.intel_flags.avx2) {
+    all_ok &= uvg_strategy_register_depquant_avx2(opaque, bitdepth);
+  }
+  return all_ok;
+}
diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h
new file mode 100644
index 00000000..4021c458
--- /dev/null
+++ b/src/strategies/strategies-depquant.h
@@ -0,0 +1,77 @@
+#ifndef STRATEGIES_DEPQUANT_H_
+#define STRATEGIES_DEPQUANT_H_
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * 
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ * 
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+/**
+ * \ingroup Optimization
+ * \file
+ * Interface for dep quant functions.
+ */
+
+#include "encoder.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "uvg266.h"
+#include "dep_quant.h"
+
+
+// Function type that implementations registered as "dep_quant_decide_and_update" are called through.
+typedef int(dep_quant_decide_and_update_func)(
+  rate_estimator_t*                       re,
+  context_store*                          ctxs,
+  struct dep_quant_scan_info const* const scan_info,
+  const coeff_t                           absCoeff,
+  const uint32_t                          scan_pos,
+  const uint32_t                          width_in_sbb,
+  const uint32_t                          height_in_sbb,
+  const NbInfoSbb                         next_nb_info_ssb,
+  bool                                    zeroOut,
+  coeff_t                                 quantCoeff,
+  const uint32_t                          effWidth,
+  const uint32_t                          effHeight,
+  bool                                    is_chroma);
+
+
+
+// Declare function pointers.
+extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
+// Registers the dep quant implementations; uvg_strategyselector_init treats a zero return as failure.
+int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth);
+
+// Entries spliced into strategies_to_select: strategy type name paired with the pointer to fill in.
+#define STRATEGIES_DEPQUANT_EXPORTS \
+  {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \
+
+
+
+#endif //STRATEGIES_DEPQUANT_H_
diff --git a/src/strategies/strategies-quant.c b/src/strategies/strategies-quant.c
index 89baf86e..62c75d6f 100644
--- a/src/strategies/strategies-quant.c
+++ b/src/strategies/strategies-quant.c
@@ -38,15 +38,16 @@
 
 
 // Define function pointers.
-quant_func *uvg_quant;
-quant_cbcr_func *uvg_quant_cbcr_residual;
-quant_residual_func *uvg_quantize_residual;
-dequant_func *uvg_dequant;
-coeff_abs_sum_func *uvg_coeff_abs_sum;
+quant_func           *uvg_quant;
+quant_cbcr_func      *uvg_quant_cbcr_residual;
+quant_residual_func  *uvg_quantize_residual;
+dequant_func         *uvg_dequant;
+coeff_abs_sum_func   *uvg_coeff_abs_sum;
 fast_coeff_cost_func *uvg_fast_coeff_cost;
 
 
-int uvg_strategy_register_quant(void* opaque, uint8_t bitdepth) {
+int uvg_strategy_register_quant(void *opaque, uint8_t bitdepth)
+{
   bool success = true;
 
   success &= uvg_strategy_register_quant_generic(opaque, bitdepth);
diff --git a/src/strategyselector.c b/src/strategyselector.c
index 477604a9..d6dffa4e 100644
--- a/src/strategyselector.c
+++ b/src/strategyselector.c
@@ -107,6 +107,10 @@ int uvg_strategyselector_init(int32_t cpuid, uint8_t bitdepth) {
     fprintf(stderr, "uvg_strategy_register_encode failed!\n");
     return 0;
   }
+  if (!uvg_strategy_register_depquant(&strategies, bitdepth)) {
+    fprintf(stderr, "uvg_strategy_register_depquant failed!\n");
+    return 0;
+  }
   
   while(cur_strategy_to_select->fptr) {
     *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
diff --git a/src/strategyselector.h b/src/strategyselector.h
index caadfda9..8bbdfbed 100644
--- a/src/strategyselector.h
+++ b/src/strategyselector.h
@@ -108,6 +108,7 @@ int uvg_strategyselector_register(void *opaque, const char *type, const char *st
 #include "strategies/strategies-intra.h"
 #include "strategies/strategies-sao.h"
 #include "strategies/strategies-encode.h"
+#include "strategies/strategies-depquant.h"
 #include "strategies/strategies-alf.h"
 
 static const strategy_to_select_t strategies_to_select[] = {
@@ -120,6 +121,7 @@ static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_SAO_EXPORTS
   STRATEGIES_ENCODE_EXPORTS
   STRATEGIES_ALF_EXPORTS
+  STRATEGIES_DEPQUANT_EXPORTS
   { NULL, NULL },
 };