diff --git a/CMakeLists.txt b/CMakeLists.txt index d8c37bbc..cafb8fd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,7 +143,7 @@ target_include_directories(uvg266 PUBLIC src) target_include_directories(uvg266 PUBLIC src/extras) target_include_directories(uvg266 PUBLIC src/strategies) -file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") +file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c" "src/dep_quant.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c") diff --git a/src/dep_quant.c b/src/dep_quant.c index 99a15df3..87799e35 100644 --- a/src/dep_quant.c +++ b/src/dep_quant.c @@ -32,6 +32,8 @@ #include "dep_quant.h" +#include + #include "cu.h" #include "encoderstate.h" #include "intra.h" @@ -804,7 +806,6 @@ void uvg_dep_quant_update_state( } } -static bool same[13]; int uvg_dep_quant( const encoder_state_t* const state, @@ -889,14 +890,63 @@ int uvg_dep_quant( height >= 4) { firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15; } - const int32_t default_quant_coeff = dep_quant_context.m_quant->m_QScale; - const int32_t thres = dep_quant_context.m_quant->m_thresLast; - for (; firstTestPos >= 0; firstTestPos--) { - coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff)); - if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) { - break; + //uvg_find_first_non_zero_coeff(srcCoeff, enableScalingLists, dep_quant_context, scan, q_coeff, &firstTestPos, width, height); + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; + int temp = firstTestPos; + if (enableScalingLists) { + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } else { + coeff_t thresTmp = thres / (4 * default_quant_coeff); + if (temp >= 16 && height >= 4) { + __m256i th = _mm256_set1_epi16(thresTmp); + temp -= 15; + for (; temp >= 0; temp -= 16) { + __m256i sbb_data; + if (width <= 4) { + sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); + } else if (width == 8) { + uint32_t i = scan[temp]; + __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); + __m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i + 12]); + sbb_data = _mm256_blend_epi32(first, second, 204); + } else { + int16_t temp_d[16]; + uint32_t i = scan[temp]; + memcpy(temp_d, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 4, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 8, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 12, &srcCoeff[i], 8); + + sbb_data = _mm256_loadu_si256((__m256i const*)temp_d); + } + sbb_data = _mm256_abs_epi16(sbb_data); + + __m256i a = _mm256_cmpgt_epi16(sbb_data, th); + if (!_mm256_testz_si256(a, a)) { + if (temp >= 0) { + temp += 15; + } + break; + } + } + } + for (; temp >= 0; temp--) { + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } } } + + firstTestPos = temp; if (firstTestPos < 0) { return 0; } @@ -961,7 +1011,7 @@ int uvg_dep_quant( const uint32_t height_in_sbb = MAX(height >> 2, 1); const uint32_t width_in_sbb = MAX(width >> 2, 1); - + //===== populate trellis ===== for (int scanIdx = firstTestPos; scanIdx >= 0; scanIdx--) { uint32_t blkpos = scan[scanIdx]; diff --git a/src/strategies/avx2/depquant-avx2.c b/src/strategies/avx2/depquant-avx2.c index 601f04da..51f4e7d8 100644 --- a/src/strategies/avx2/depquant-avx2.c +++ b/src/strategies/avx2/depquant-avx2.c @@ -37,6 +37,8 @@ #include "strategies/avx2/depquant-avx2.h" #include "strategyselector.h" +#define COMPILE_INTEL_AVX2 1 + #if COMPILE_INTEL_AVX2 && defined X86_64 #include "dep_quant.h" @@ -1359,6 +1361,76 @@ void uvg_dep_quant_decide_and_update_avx2( } +void uvg_find_first_non_zero_avx2( + const coeff_t* srcCoeff, + const bool enableScalingLists, + context_store dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + const int width, + const int height) +{ + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; + int temp = *firstTestPos; + if (enableScalingLists) { + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } else { + coeff_t thresTmp = thres / (4 * default_quant_coeff); + if (temp >= 16 && height >= 4) { + __m256i th = _mm256_set1_epi16(thresTmp); + temp -= 15; + for (; temp >= 0; temp -= 16) { + __m256i sbb_data; + if (width <= 4) { + sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]); + } else if (width == 8) { + uint32_t i = scan[temp]; + __m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]); + __m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i+ 12]); + sbb_data = _mm256_blend_epi32(first, second, 204); + } else { + int16_t temp_d[16]; + uint32_t i = scan[temp]; + memcpy(temp_d, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 4, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 8, &srcCoeff[i], 8); + i += width; + memcpy(temp_d + 12, &srcCoeff[i], 8); + + sbb_data = _mm256_loadu_si256((__m256i const*)temp_d); + } + sbb_data = _mm256_abs_epi16(sbb_data); + + __m256i a = _mm256_cmpgt_epi16(sbb_data, th); + if (!_mm256_testz_si256(a, a)) + { + if (temp >= 0) { + temp += 15; + } + break; + } + } + } + for (;temp >= 0; temp--) { + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + } + + *firstTestPos = temp; +} + + #endif //COMPILE_INTEL_AVX2 && defined X86_64 int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth) @@ -1367,6 +1439,7 @@ int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2); + success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "avx2", 40, &uvg_find_first_non_zero_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; diff --git a/src/strategies/generic/depquant-generic.c b/src/strategies/generic/depquant-generic.c index aa2ea99e..f1103054 100644 --- a/src/strategies/generic/depquant-generic.c +++ b/src/strategies/generic/depquant-generic.c @@ -227,12 +227,26 @@ static void uvg_dep_quant_decide_and_update_generic( } +void uvg_find_first_non_zero_generic(const coeff_t* srcCoeff, const bool enableScalingLists, context_store dep_quant_context, const uint32_t* const scan, const int32_t* q_coeff, int* firstTestPos, int width, int height) +{ + const int default_quant_coeff = dep_quant_context.m_quant->m_QScale; + const int32_t thres = dep_quant_context.m_quant->m_thresLast; + int temp = *firstTestPos; + for (; temp >= 0; (temp)--) { + coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[(temp)]])) : (thres / (4 * default_quant_coeff)); + if (abs(srcCoeff[scan[(temp)]]) > thresTmp) { + break; + } + } + *firstTestPos = temp; +} + int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth) { bool success = true; - success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 40, &uvg_dep_quant_decide_and_update_generic); - + success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic); + success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic); return success; } diff --git a/src/strategies/strategies-depquant.c b/src/strategies/strategies-depquant.c index 7ba62163..d0eac087 100644 --- a/src/strategies/strategies-depquant.c +++ b/src/strategies/strategies-depquant.c @@ -39,6 +39,7 @@ // Define function pointers. dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth) diff --git a/src/strategies/strategies-depquant.h b/src/strategies/strategies-depquant.h index 4021c458..6a49dc35 100644 --- a/src/strategies/strategies-depquant.h +++ b/src/strategies/strategies-depquant.h @@ -61,16 +61,27 @@ typedef int(dep_quant_decide_and_update_func)( const uint32_t effHeight, bool is_chroma); +typedef void(find_first_non_zero_coeff_func)( + const coeff_t* srcCoeff, + const bool enableScalingLists, + context_store dep_quant_context, + const uint32_t* const scan, + const int32_t* q_coeff, + int* firstTestPos, + int width, + int height); // Declare function pointers. extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update; +extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff; int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth); #define STRATEGIES_DEPQUANT_EXPORTS \ {"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \ + {"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \