From 19829da152e62d0c996ddf550c2f3ef313d4d09e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Fri, 21 Jul 2023 14:23:37 +0300 Subject: [PATCH] Disable all avx2 optimizations that cannot be used with mtt/isp --- src/strategies/avx2/dct-avx2.c | 24 ++++++------- src/strategies/avx2/intra-avx2.c | 8 ++--- src/strategies/avx2/picture-avx2.c | 58 +++++++++++++----------------- src/strategies/avx2/quant-avx2.c | 2 +- 4 files changed, 42 insertions(+), 50 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 04e92a7f..bb8a92bc 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -1656,22 +1656,22 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8){ - success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); - success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); - success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_4x4", "avx2", 40, &matrix_dct_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_8x8", "avx2", 40, &matrix_dct_8x8_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_16x16", "avx2", 40, &matrix_dct_16x16_avx2); + //success &= uvg_strategyselector_register(opaque, "dct_32x32", "avx2", 40, &matrix_dct_32x32_avx2); - success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "fast_inverse_dst_4x4", "avx2", 40, &matrix_idst_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); - success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); - success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); - success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); + //success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); - success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); - success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); + //success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2); + //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2); } #endif // UVG_BIT_DEPTH == 8 diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 838bad91..30bbe7f2 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -1075,10 +1075,10 @@ int uvg_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8) { - success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); - success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); - success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); - success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); + //success &= uvg_strategyselector_register(opaque, "angular_pred", "avx2", 40, &uvg_angular_pred_avx2); + //success &= uvg_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &uvg_intra_pred_planar_avx2); + //success &= uvg_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &uvg_intra_pred_filtered_dc_avx2); + //success &= uvg_strategyselector_register(opaque, "pdpc_planar_dc", "avx2", 40, &uvg_pdpc_planar_dc_avx2); } #endif //UVG_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 && defined X86_64 diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index 5d0b203c..f8be4987 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -1749,35 +1749,27 @@ static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in __m128i diff = _mm_setzero_si128(); switch (width) { case 4: - diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[0]), diff); - diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[4]), diff); - diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[8]), diff); - diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride); - _mm_storel_epi64((__m128i*) & (residual[12]), diff); + for (int y = 0; y < height; y+=4) { + diff = get_residual_4x1_avx2(ref_in + y * ref_stride, pred_in + y * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 1) * ref_stride, pred_in + (y + 1) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 4]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 2) * ref_stride, pred_in + (y + 2) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 8]), diff); + diff = get_residual_4x1_avx2(ref_in + (y + 3) * ref_stride, pred_in + (y + 3) * pred_stride); + _mm_storel_epi64((__m128i*) & (residual[y * 4 + 12]), diff); + } break; case 8: - diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[0]), diff); - diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[8]), diff); - diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[16]), diff); - diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[24]), diff); - diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[32]), diff); - diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[40]), diff); - diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[48]), diff); - diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]); - _mm_storeu_si128((__m128i*) & (residual[56]), diff); + for (int y = 0; y < height; y += 2) { + diff = get_residual_8x1_avx2(&ref_in[y * ref_stride], &pred_in[y * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y * 8]), diff); + diff = get_residual_8x1_avx2(&ref_in[(y + 1) * ref_stride], &pred_in[(y + 1) * pred_stride]); + _mm_storeu_si128((__m128i*) & (residual[y*8 + 8]), diff); + } break; default: - for (int y = 0; y < width; ++y) { + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 16) { diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]); _mm_storeu_si128((__m128i*) & residual[x + y * width], diff); @@ -1816,15 +1808,15 @@ int uvg_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) success &= uvg_strategyselector_register(opaque, "satd_32x32", "avx2", 40, &satd_32x32_8bit_avx2); success &= uvg_strategyselector_register(opaque, "satd_64x64", "avx2", 40, &satd_64x64_8bit_avx2); - success &= uvg_strategyselector_register(opaque, "satd_4x4_dual", "avx2", 40, &satd_8bit_4x4_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_8x8_dual", "avx2", 40, &satd_8bit_8x8_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_16x16_dual", "avx2", 40, &satd_8bit_16x16_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, &satd_8bit_32x32_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); - success &= uvg_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); - success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_4x4_dual", "avx2", 40, &satd_8bit_4x4_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_8x8_dual", "avx2", 40, &satd_8bit_8x8_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_16x16_dual", "avx2", 40, &satd_8bit_16x16_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, &satd_8bit_32x32_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); + //success &= uvg_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); - success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); + //success &= uvg_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); success &= uvg_strategyselector_register(opaque, "bipred_average", "avx2", 40, &bipred_average_avx2); success &= uvg_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2); success &= uvg_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2); diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 7729d272..bd857fa2 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -960,7 +960,7 @@ int uvg_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) #if COMPILE_INTEL_AVX2 && defined X86_64 #if UVG_BIT_DEPTH == 8 if (bitdepth == 8) { - success &= uvg_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &uvg_quantize_residual_avx2); + //success &= uvg_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &uvg_quantize_residual_avx2); success &= uvg_strategyselector_register(opaque, "dequant", "avx2", 40, &uvg_dequant_avx2); } #endif // UVG_BIT_DEPTH == 8