mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-23 18:14:06 +00:00
[avx2] Try to do lnz decision with avx2
This commit is contained in:
parent
cf6f03b73b
commit
dda972c665
|
@ -143,7 +143,7 @@ target_include_directories(uvg266 PUBLIC src)
|
||||||
target_include_directories(uvg266 PUBLIC src/extras)
|
target_include_directories(uvg266 PUBLIC src/extras)
|
||||||
target_include_directories(uvg266 PUBLIC src/strategies)
|
target_include_directories(uvg266 PUBLIC src/strategies)
|
||||||
|
|
||||||
file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c")
|
file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c" "src/dep_quant.c")
|
||||||
file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c")
|
file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c")
|
||||||
file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c")
|
file(GLOB LIB_SOURCES_STRATEGIES_SSE42 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse42/*.c")
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,8 @@
|
||||||
|
|
||||||
#include "dep_quant.h"
|
#include "dep_quant.h"
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
#include "cu.h"
|
#include "cu.h"
|
||||||
#include "encoderstate.h"
|
#include "encoderstate.h"
|
||||||
#include "intra.h"
|
#include "intra.h"
|
||||||
|
@ -804,7 +806,6 @@ void uvg_dep_quant_update_state(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool same[13];
|
|
||||||
|
|
||||||
int uvg_dep_quant(
|
int uvg_dep_quant(
|
||||||
const encoder_state_t* const state,
|
const encoder_state_t* const state,
|
||||||
|
@ -889,14 +890,63 @@ int uvg_dep_quant(
|
||||||
height >= 4) {
|
height >= 4) {
|
||||||
firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15;
|
firstTestPos =((width == 4 && height == 4) || (width == 8 && height == 8)) ? 7 : 15;
|
||||||
}
|
}
|
||||||
const int32_t default_quant_coeff = dep_quant_context.m_quant->m_QScale;
|
//uvg_find_first_non_zero_coeff(srcCoeff, enableScalingLists, dep_quant_context, scan, q_coeff, &firstTestPos, width, height);
|
||||||
const int32_t thres = dep_quant_context.m_quant->m_thresLast;
|
const int default_quant_coeff = dep_quant_context.m_quant->m_QScale;
|
||||||
for (; firstTestPos >= 0; firstTestPos--) {
|
const int32_t thres = dep_quant_context.m_quant->m_thresLast;
|
||||||
coeff_t thresTmp = (enableScalingLists) ? (thres / (4 * q_coeff[scan[firstTestPos]])) : (thres / (4 * default_quant_coeff));
|
int temp = firstTestPos;
|
||||||
if (abs(srcCoeff[scan[firstTestPos]]) > thresTmp) {
|
if (enableScalingLists) {
|
||||||
break;
|
for (; temp >= 0; (temp)--) {
|
||||||
|
coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]);
|
||||||
|
if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
coeff_t thresTmp = thres / (4 * default_quant_coeff);
|
||||||
|
if (temp >= 16 && height >= 4) {
|
||||||
|
__m256i th = _mm256_set1_epi16(thresTmp);
|
||||||
|
temp -= 15;
|
||||||
|
for (; temp >= 0; temp -= 16) {
|
||||||
|
__m256i sbb_data;
|
||||||
|
if (width <= 4) {
|
||||||
|
sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]);
|
||||||
|
} else if (width == 8) {
|
||||||
|
uint32_t i = scan[temp];
|
||||||
|
__m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]);
|
||||||
|
__m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i + 12]);
|
||||||
|
sbb_data = _mm256_blend_epi32(first, second, 204);
|
||||||
|
} else {
|
||||||
|
int16_t temp_d[16];
|
||||||
|
uint32_t i = scan[temp];
|
||||||
|
memcpy(temp_d, &srcCoeff[i], 8);
|
||||||
|
i += width;
|
||||||
|
memcpy(temp_d + 4, &srcCoeff[i], 8);
|
||||||
|
i += width;
|
||||||
|
memcpy(temp_d + 8, &srcCoeff[i], 8);
|
||||||
|
i += width;
|
||||||
|
memcpy(temp_d + 12, &srcCoeff[i], 8);
|
||||||
|
|
||||||
|
sbb_data = _mm256_loadu_si256((__m256i const*)temp_d);
|
||||||
|
}
|
||||||
|
sbb_data = _mm256_abs_epi16(sbb_data);
|
||||||
|
|
||||||
|
__m256i a = _mm256_cmpgt_epi16(sbb_data, th);
|
||||||
|
if (!_mm256_testz_si256(a, a)) {
|
||||||
|
if (temp >= 0) {
|
||||||
|
temp += 15;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (; temp >= 0; temp--) {
|
||||||
|
if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
firstTestPos = temp;
|
||||||
if (firstTestPos < 0) {
|
if (firstTestPos < 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,8 @@
|
||||||
#include "strategies/avx2/depquant-avx2.h"
|
#include "strategies/avx2/depquant-avx2.h"
|
||||||
#include "strategyselector.h"
|
#include "strategyselector.h"
|
||||||
|
|
||||||
|
#define COMPILE_INTEL_AVX2 1
|
||||||
|
|
||||||
#if COMPILE_INTEL_AVX2 && defined X86_64
|
#if COMPILE_INTEL_AVX2 && defined X86_64
|
||||||
#include "dep_quant.h"
|
#include "dep_quant.h"
|
||||||
|
|
||||||
|
@ -1359,6 +1361,76 @@ void uvg_dep_quant_decide_and_update_avx2(
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void uvg_find_first_non_zero_avx2(
|
||||||
|
const coeff_t* srcCoeff,
|
||||||
|
const bool enableScalingLists,
|
||||||
|
context_store dep_quant_context,
|
||||||
|
const uint32_t* const scan,
|
||||||
|
const int32_t* q_coeff,
|
||||||
|
int* firstTestPos,
|
||||||
|
const int width,
|
||||||
|
const int height)
|
||||||
|
{
|
||||||
|
const int default_quant_coeff = dep_quant_context.m_quant->m_QScale;
|
||||||
|
const int32_t thres = dep_quant_context.m_quant->m_thresLast;
|
||||||
|
int temp = *firstTestPos;
|
||||||
|
if (enableScalingLists) {
|
||||||
|
for (; temp >= 0; (temp)--) {
|
||||||
|
coeff_t thresTmp = thres / (4 * q_coeff[scan[(temp)]]);
|
||||||
|
if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
coeff_t thresTmp = thres / (4 * default_quant_coeff);
|
||||||
|
if (temp >= 16 && height >= 4) {
|
||||||
|
__m256i th = _mm256_set1_epi16(thresTmp);
|
||||||
|
temp -= 15;
|
||||||
|
for (; temp >= 0; temp -= 16) {
|
||||||
|
__m256i sbb_data;
|
||||||
|
if (width <= 4) {
|
||||||
|
sbb_data = _mm256_loadu_si256((__m256i const*)&srcCoeff[scan[temp]]);
|
||||||
|
} else if (width == 8) {
|
||||||
|
uint32_t i = scan[temp];
|
||||||
|
__m256i first = _mm256_loadu_si256((__m256i const*)&srcCoeff[i]);
|
||||||
|
__m256i second = _mm256_loadu_si256((__m256i const*)&srcCoeff[i+ 12]);
|
||||||
|
sbb_data = _mm256_blend_epi32(first, second, 204);
|
||||||
|
} else {
|
||||||
|
int16_t temp_d[16];
|
||||||
|
uint32_t i = scan[temp];
|
||||||
|
memcpy(temp_d, &srcCoeff[i], 8);
|
||||||
|
i += width;
|
||||||
|
memcpy(temp_d + 4, &srcCoeff[i], 8);
|
||||||
|
i += width;
|
||||||
|
memcpy(temp_d + 8, &srcCoeff[i], 8);
|
||||||
|
i += width;
|
||||||
|
memcpy(temp_d + 12, &srcCoeff[i], 8);
|
||||||
|
|
||||||
|
sbb_data = _mm256_loadu_si256((__m256i const*)temp_d);
|
||||||
|
}
|
||||||
|
sbb_data = _mm256_abs_epi16(sbb_data);
|
||||||
|
|
||||||
|
__m256i a = _mm256_cmpgt_epi16(sbb_data, th);
|
||||||
|
if (!_mm256_testz_si256(a, a))
|
||||||
|
{
|
||||||
|
if (temp >= 0) {
|
||||||
|
temp += 15;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (;temp >= 0; temp--) {
|
||||||
|
if (abs(srcCoeff[scan[(temp)]]) > thresTmp) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*firstTestPos = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif //COMPILE_INTEL_AVX2 && defined X86_64
|
#endif //COMPILE_INTEL_AVX2 && defined X86_64
|
||||||
|
|
||||||
int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
|
int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
|
||||||
|
@ -1367,6 +1439,7 @@ int uvg_strategy_register_depquant_avx2(void* opaque, uint8_t bitdepth)
|
||||||
|
|
||||||
#if COMPILE_INTEL_AVX2 && defined X86_64
|
#if COMPILE_INTEL_AVX2 && defined X86_64
|
||||||
success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2);
|
success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "avx2", 40, &uvg_dep_quant_decide_and_update_avx2);
|
||||||
|
success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "avx2", 40, &uvg_find_first_non_zero_avx2);
|
||||||
#endif //COMPILE_INTEL_AVX2 && defined X86_64
|
#endif //COMPILE_INTEL_AVX2 && defined X86_64
|
||||||
|
|
||||||
return success;
|
return success;
|
||||||
|
|
|
@ -227,12 +227,26 @@ static void uvg_dep_quant_decide_and_update_generic(
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * \brief Scalar search for the highest scan position whose coefficient
 *        magnitude exceeds the "last position" threshold.
 *
 * Walks from *firstTestPos down to scan position 0 and stops at the first
 * position p for which
 *   abs(srcCoeff[scan[p]]) > m_thresLast / (4 * scale)
 * with scale = q_coeff[scan[p]] when scaling lists are enabled and
 * scale = m_QScale otherwise.
 *
 * \param srcCoeff            Transform coefficients, indexed by block position.
 * \param enableScalingLists  Selects per-coefficient (q_coeff) thresholds.
 * \param dep_quant_context   Provides m_quant->m_QScale and m_quant->m_thresLast.
 * \param scan                Maps scan position to block position.
 * \param q_coeff             Per-coefficient scales; read only with scaling lists.
 * \param firstTestPos        In: scan position to start from.
 *                            Out: position where the search stopped, -1 when
 *                            no coefficient exceeded its threshold.
 * \param width               Unused by this implementation.
 * \param height              Unused by this implementation.
 */
void uvg_find_first_non_zero_generic(
  const coeff_t* srcCoeff,
  const bool enableScalingLists,
  context_store dep_quant_context,
  const uint32_t* const scan,
  const int32_t* q_coeff,
  int* firstTestPos,
  int width,
  int height)
{
  const int flat_scale = dep_quant_context.m_quant->m_QScale;
  const int32_t last_threshold = dep_quant_context.m_quant->m_thresLast;

  int pos = *firstTestPos;
  while (pos >= 0) {
    const uint32_t blk_pos = scan[pos];

    // The divisor is evaluated per position on purpose: it keeps the
    // division out of the way when the loop body never runs.
    coeff_t limit;
    if (enableScalingLists) {
      limit = last_threshold / (4 * q_coeff[blk_pos]);
    } else {
      limit = last_threshold / (4 * flat_scale);
    }

    if (abs(srcCoeff[blk_pos]) > limit) {
      break;
    }
    pos--;
  }

  *firstTestPos = pos;
}
|
||||||
|
|
||||||
int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
|
int uvg_strategy_register_depquant_generic(void* opaque, uint8_t bitdepth)
|
||||||
{
|
{
|
||||||
bool success = true;
|
bool success = true;
|
||||||
|
|
||||||
success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 40, &uvg_dep_quant_decide_and_update_generic);
|
success &= uvg_strategyselector_register(opaque, "dep_quant_decide_and_update", "generic", 0, &uvg_dep_quant_decide_and_update_generic);
|
||||||
|
success &= uvg_strategyselector_register(opaque, "find_first_non_zero_coeff", "generic", 0, &uvg_find_first_non_zero_generic);
|
||||||
|
|
||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
|
|
||||||
// Define function pointers.
|
// Define function pointers.
|
||||||
dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
|
dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
|
||||||
|
find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff;
|
||||||
|
|
||||||
|
|
||||||
int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth)
|
int uvg_strategy_register_depquant(void *opaque, uint8_t bitdepth)
|
||||||
|
|
|
@ -61,16 +61,27 @@ typedef int(dep_quant_decide_and_update_func)(
|
||||||
const uint32_t effHeight,
|
const uint32_t effHeight,
|
||||||
bool is_chroma);
|
bool is_chroma);
|
||||||
|
|
||||||
|
/**
 * \brief Signature of the implementations registered under the strategy
 *        name "find_first_non_zero_coeff".
 *
 * Implementations search from *firstTestPos towards scan position 0 for a
 * coefficient with abs(srcCoeff[scan[p]]) > m_thresLast / (4 * scale), where
 * scale is q_coeff[scan[p]] with scaling lists and m_QScale otherwise.
 *
 * \param srcCoeff            Coefficients, indexed through scan.
 * \param enableScalingLists  Selects q_coeff instead of m_QScale as the scale.
 * \param dep_quant_context   Source of m_quant->m_QScale and m_quant->m_thresLast.
 * \param scan                Maps scan position to coefficient index.
 * \param q_coeff             Per-coefficient scales, used with scaling lists.
 * \param firstTestPos        In: start position. Out: position where the
 *                            search stopped; negative when nothing was found.
 * \param width               Block width (ignored by the generic version).
 * \param height              Block height (ignored by the generic version).
 */
typedef void(find_first_non_zero_coeff_func)(
|
||||||
|
const coeff_t* srcCoeff,
|
||||||
|
const bool enableScalingLists,
|
||||||
|
context_store dep_quant_context,
|
||||||
|
const uint32_t* const scan,
|
||||||
|
const int32_t* q_coeff,
|
||||||
|
int* firstTestPos,
|
||||||
|
int width,
|
||||||
|
int height);
|
||||||
|
|
||||||
|
|
||||||
// Declare function pointers.
|
// Declare function pointers.
|
||||||
extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
|
extern dep_quant_decide_and_update_func* uvg_dep_quant_decide_and_update;
|
||||||
|
extern find_first_non_zero_coeff_func* uvg_find_first_non_zero_coeff;
|
||||||
|
|
||||||
int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth);
|
int uvg_strategy_register_depquant(void* opaque, uint8_t bitdepth);
|
||||||
|
|
||||||
|
|
||||||
#define STRATEGIES_DEPQUANT_EXPORTS \
|
#define STRATEGIES_DEPQUANT_EXPORTS \
|
||||||
{"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \
|
{"dep_quant_decide_and_update", (void**)&uvg_dep_quant_decide_and_update}, \
|
||||||
|
{"find_first_non_zero_coeff", (void**)&uvg_find_first_non_zero_coeff}, \
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue