diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index ccf60c6d..dbd8e3b8 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -25,6 +25,8 @@ #include "strategies/avx2/dct-avx2.h" #if COMPILE_INTEL_AVX2 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 #include #include "strategyselector.h" @@ -924,12 +926,14 @@ static void matrix_i ## type ## _## n ## x ## n ## _avx2(int8_t bitdepth, const TRANSFORM(dct, 32); ITRANSFORM(dct, 32); +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "fast_forward_dst_4x4", "avx2", 40, &matrix_dst_4x4_avx2); @@ -945,6 +949,7 @@ int kvz_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2); success &= kvz_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2); } +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 return success; } diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 683ddd4b..f7b3f1af 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -21,10 +21,12 @@ #include "strategies/avx2/intra-avx2.h" #if COMPILE_INTEL_AVX2 && defined X86_64 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 + #include #include -#include "kvazaar.h" #include "strategyselector.h" #include "strategies/missing-intel-intrinsics.h" @@ -35,7 +37,7 @@ * \param delta_pos Fractional pixel precise position of sample displacement * \param x Sample offset in direction x in ref_main array */ -static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){ +static INLINE __m128i filter_4x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){ int8_t delta_int = delta_pos >> 5; int8_t delta_fract = delta_pos & (32-1); @@ -58,7 +60,7 @@ static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_p * \param sample_disp Sample displacement per row * \param vertical_mode Mode direction, true if vertical */ -static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){ +static void filter_4x4_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){ __m128i row0 = filter_4x1_avx2(ref_main, 1 * sample_disp, 0); __m128i row1 = filter_4x1_avx2(ref_main, 2 * sample_disp, 0); @@ -86,7 +88,7 @@ static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl * \param delta_pos Fractional pixel precise position of sample displacement * \param x Sample offset in direction x in ref_main array */ -static INLINE __m128i filter_8x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){ +static INLINE __m128i filter_8x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){ int8_t delta_int = delta_pos >> 5; int8_t delta_fract = delta_pos & (32-1); @@ -110,7 +112,7 @@ static INLINE __m128i filter_8x1_avx2(const kvz_pixel *ref_main, int16_t delta_p * \param sample_disp Sample displacement per row * \param vertical_mode Mode direction, true if vertical */ -static void filter_8x8_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){ +static void filter_8x8_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){ __m128i row0 = 
filter_8x1_avx2(ref_main, 1 * sample_disp, 0); __m128i row1 = filter_8x1_avx2(ref_main, 2 * sample_disp, 0); __m128i row2 = filter_8x1_avx2(ref_main, 3 * sample_disp, 0); @@ -163,7 +165,7 @@ static void filter_8x8_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl * \param delta_pos Fractional pixel precise position of sample displacement * \param x Sample offset in direction x in ref_main array */ -static INLINE __m256i filter_16x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){ +static INLINE __m256i filter_16x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){ int8_t delta_int = delta_pos >> 5; int8_t delta_fract = delta_pos & (32-1); @@ -189,7 +191,7 @@ static INLINE __m256i filter_16x1_avx2(const kvz_pixel *ref_main, int16_t delta_ * \param sample_disp Sample displacement per row * \param vertical_mode Mode direction, true if vertical */ -static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){ +static void filter_16x16_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){ for (int y = 0; y < 16; y += 8) { __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, 0); __m256i row1 = filter_16x1_avx2(ref_main, (y + 2) * sample_disp, 0); @@ -281,7 +283,7 @@ static void filter_16x16_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sam * \param vertical_mode Mode direction, true if vertical * \param width Block width */ -static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode, int width){ +static void filter_NxN_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode, int width){ for (int y = 0; y < width; y += 8) { for (int x = 0; x < width; x += 16) { __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, x); @@ -376,9 +378,9 @@ static void filter_NxN_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sampl static void kvz_angular_pred_avx2( const int_fast8_t log2_width, const int_fast8_t intra_mode, - const kvz_pixel *const in_ref_above, - const kvz_pixel *const in_ref_left, - kvz_pixel *const dst) + const uint8_t *const in_ref_above, + const uint8_t *const in_ref_left, + uint8_t *const dst) { assert(log2_width >= 2 && log2_width <= 5); assert(intra_mode >= 2 && intra_mode <= 34); @@ -388,7 +390,7 @@ static void kvz_angular_pred_avx2( // Temporary buffer for modes 11-25. // It only needs to be big enough to hold indices from -width to width-1. - kvz_pixel tmp_ref[2 * 32]; + uint8_t tmp_ref[2 * 32]; const int_fast8_t width = 1 << log2_width; // Whether to swap references to always project on the left reference row. @@ -399,9 +401,9 @@ static void kvz_angular_pred_avx2( const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)]; // Pointer for the reference we are interpolating from. - const kvz_pixel *ref_main; + const uint8_t *ref_main; // Pointer for the other reference. - const kvz_pixel *ref_side; + const uint8_t *ref_side; // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. 
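Note on the filter helpers touched above: filter_4x1/8x1/16x1_avx2 all vectorize the same two-tap fractional interpolation. delta_pos is split into a whole-sample offset (delta_pos >> 5) and a 1/32-sample fraction (delta_pos & 31), and each output pixel blends two neighbouring reference samples. A minimal scalar sketch of one row, assuming the HEVC-style indexing and weighting used by the generic C path ((32 - frac) * a + frac * b + 16) >> 5; the helper name is illustrative and not part of this patch:

#include <stdint.h>

/* Scalar model of what one filter_Nx1_avx2 call produces: `width` output
 * samples interpolated from an 8-bit reference row at a 1/32-pel offset. */
static void angular_filter_row_scalar(uint8_t *dst, const uint8_t *ref_main,
                                      int16_t delta_pos, int x, int width)
{
  int delta_int   = delta_pos >> 5;        /* whole-sample displacement */
  int delta_fract = delta_pos & (32 - 1);  /* 1/32-sample remainder     */

  for (int i = 0; i < width; ++i) {
    int a = ref_main[x + i + delta_int + 1];
    int b = ref_main[x + i + delta_int + 2];
    dst[i] = (uint8_t)(((32 - delta_fract) * a + delta_fract * b + 16) >> 5);
  }
}

The AVX2 helpers compute the same result 4, 8 or 16 pixels at a time, so only the kvz_pixel -> uint8_t change is needed here; the arithmetic itself is unchanged.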
@@ -463,15 +465,15 @@ static void kvz_angular_pred_avx2( */ static void kvz_intra_pred_planar_avx2( const int_fast8_t log2_width, - const kvz_pixel *const ref_top, - const kvz_pixel *const ref_left, - kvz_pixel *const dst) + const uint8_t *const ref_top, + const uint8_t *const ref_left, + uint8_t *const dst) { assert(log2_width >= 2 && log2_width <= 5); const int_fast8_t width = 1 << log2_width; - const kvz_pixel top_right = ref_top[width + 1]; - const kvz_pixel bottom_left = ref_left[width + 1]; + const uint8_t top_right = ref_top[width + 1]; + const uint8_t bottom_left = ref_left[width + 1]; if (log2_width > 2) { @@ -888,12 +890,11 @@ static void pred_filtered_dc_32x32(const uint8_t *ref_top, */ static void kvz_intra_pred_filtered_dc_avx2( const int_fast8_t log2_width, - const kvz_pixel *ref_top, - const kvz_pixel *ref_left, - kvz_pixel *out_block) + const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) { assert(log2_width >= 2 && log2_width <= 5); - assert(sizeof(kvz_pixel) == sizeof(uint8_t)); if (log2_width == 2) { pred_filtered_dc_4x4(ref_top, ref_left, out_block); @@ -906,17 +907,20 @@ static void kvz_intra_pred_filtered_dc_avx2( } } +#endif //KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 && defined X86_64 +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8) { success &= kvz_strategyselector_register(opaque, "angular_pred", "avx2", 40, &kvz_angular_pred_avx2); success &= kvz_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &kvz_intra_pred_planar_avx2); success &= kvz_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &kvz_intra_pred_filtered_dc_avx2); } +#endif //KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; } diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index d1cd75ce..c6fd0d8a 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -25,23 +25,25 @@ #include "strategies/avx2/ipol-avx2.h" #if COMPILE_INTEL_AVX2 +#include "kvazaar.h" + #include #include #include #include "encoder.h" -#include "kvazaar.h" #include "search_inter.h" #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "strategyselector.h" #include "strategies/generic/ipol-generic.h" +#if KVZ_BIT_DEPTH == 8 extern int8_t kvz_g_luma_filter[4][8]; extern int8_t kvz_g_chroma_filter[8][4]; -static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) +static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, uint8_t *data) { __m128i fir = _mm_loadl_epi64((__m128i*)filter); __m128i row = _mm_loadl_epi64((__m128i*)data); @@ -100,7 +102,7 @@ static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 } -static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, +static void kvz_eight_tap_filter_hor_8x1_avx2(uint8_t *data, int16_t * out, __m256i *shuf_01_23, __m256i *shuf_45_67, __m256i *taps_01_23, __m256i *taps_45_67) { @@ -117,7 +119,7 @@ static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, _mm_storeu_si128((__m128i*)out, filtered); } -static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, +static void kvz_four_tap_filter_hor_4x4_avx2(uint8_t *data, int stride, int16_t * out, int 
out_stride, __m256i *shuf_01, __m256i *shuf_23, __m256i *taps_01, __m256i *taps_23) { @@ -143,7 +145,7 @@ static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_ _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); } -static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, +static void kvz_four_tap_filter_hor_4xN_avx2(uint8_t *data, int stride, int16_t * out, int out_stride, __m256i *shuf_01_23, __m256i *taps_01_23, int rows) { @@ -177,7 +179,7 @@ static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data return filtered; } -static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out) +static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out) { // Interpolation filter shifts int32_t shift2 = 6; @@ -243,7 +245,7 @@ static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *dat _mm_storel_epi64((__m128i*)out, filtered); } -static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) +static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out, int16_t out_stride) { // Interpolation filter shifts int32_t shift2 = 6; @@ -366,7 +368,7 @@ static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int1 _mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); } -INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) +INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, uint8_t * out, int64_t out_stride) { // Interpolation filter shifts int32_t shift2 = 6; @@ -589,7 +591,7 @@ INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t s _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); } -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, uint8_t *out, int out_stride) { // Filter even rows filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 @@ -610,11 +612,11 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -695,10 +697,10 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // VERTICAL STEP - kvz_pixel *out_l = filtered[0]; - kvz_pixel *out_r = filtered[1]; - kvz_pixel *out_t = filtered[2]; - kvz_pixel *out_b = filtered[3]; + uint8_t *out_l = filtered[0]; + uint8_t *out_r = filtered[1]; + uint8_t *out_t = filtered[2]; + uint8_t *out_b = filtered[3]; __m256i taps[4]; kvz_init_ver_filter_taps(fir0, taps); @@ -746,11 +748,11 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } static void kvz_filter_hpel_blocks_diag_luma_avx2(const 
encoder_control_t * encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -774,10 +776,10 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t *col_pos2 = hor_first_cols[2]; // VERTICAL STEP - kvz_pixel *out_tl = filtered[0]; - kvz_pixel *out_tr = filtered[1]; - kvz_pixel *out_bl = filtered[2]; - kvz_pixel *out_br = filtered[3]; + uint8_t *out_tl = filtered[0]; + uint8_t *out_tr = filtered[1]; + uint8_t *out_bl = filtered[2]; + uint8_t *out_br = filtered[3]; __m256i taps[4]; kvz_init_ver_filter_taps(fir2, taps); @@ -829,11 +831,11 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco } static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -928,10 +930,10 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // VERTICAL STEP - kvz_pixel *out_l = filtered[0]; - kvz_pixel *out_r = filtered[1]; - kvz_pixel *out_t = filtered[2]; - kvz_pixel *out_b = filtered[3]; + uint8_t *out_l = filtered[0]; + uint8_t *out_r = filtered[1]; + uint8_t *out_t = filtered[2]; + uint8_t *out_b = filtered[3]; int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; @@ -1056,11 +1058,11 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -1088,10 +1090,10 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t hor_stride = LCU_WIDTH; // VERTICAL STEP - kvz_pixel *out_tl = filtered[0]; - kvz_pixel *out_tr = filtered[1]; - kvz_pixel *out_bl = filtered[2]; - kvz_pixel *out_br = filtered[3]; + uint8_t *out_tl = filtered[0]; + uint8_t *out_tr = filtered[1]; + uint8_t *out_bl = filtered[2]; + uint8_t *out_br = filtered[3]; int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; int8_t *ver_fir_b = hpel_off_y != 0 ? 
fir3 : fir1; @@ -1214,11 +1216,11 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, - kvz_pixel *dst, + uint8_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, @@ -1268,7 +1270,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, @@ -1323,11 +1325,11 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, - kvz_pixel *dst, + uint8_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, @@ -1385,7 +1387,7 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encode } static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, + uint8_t *src, int16_t src_stride, int width, int height, @@ -1447,6 +1449,8 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const } } +#endif //KVZ_BIT_DEPTH == 8 + void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out) { @@ -1510,6 +1514,8 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 +#if KVZ_BIT_DEPTH == 8 + if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); @@ -1520,7 +1526,10 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } +#endif //KVZ_BIT_DEPTH == 8 + success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2); + #endif //COMPILE_INTEL_AVX2 return success; } diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index 9f001260..499b0625 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -25,6 +25,8 @@ #include "global.h" #if COMPILE_INTEL_AVX2 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 #include "strategies/avx2/picture-avx2.h" #include "strategies/avx2/reg_sad_pow2_widths-avx2.h" @@ -33,7 +35,6 @@ #include #include #include -#include "kvazaar.h" #include "strategies/strategies-picture.h" #include "strategyselector.h" #include "strategies/generic/picture-generic.h" @@ -52,7 +53,7 @@ * * \returns Sum of Absolute Differences */ -uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2, +uint32_t kvz_reg_sad_avx2(const uint8_t * const data1, const uint8_t * const data2, const int width, const int height, const unsigned stride1, const unsigned stride2) { if (width == 0) @@ 
-123,7 +124,7 @@ static INLINE uint32_t m256i_horizontal_sum(const __m256i sum) } -static unsigned sad_8bit_8x8_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2) +static unsigned sad_8bit_8x8_avx2(const uint8_t *buf1, const uint8_t *buf2) { const __m256i *const a = (const __m256i *)buf1; const __m256i *const b = (const __m256i *)buf2; @@ -133,7 +134,7 @@ static unsigned sad_8bit_8x8_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2) } -static unsigned sad_8bit_16x16_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2) +static unsigned sad_8bit_16x16_avx2(const uint8_t *buf1, const uint8_t *buf2) { const __m256i *const a = (const __m256i *)buf1; const __m256i *const b = (const __m256i *)buf2; @@ -143,7 +144,7 @@ static unsigned sad_8bit_16x16_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2 } -static unsigned sad_8bit_32x32_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2) +static unsigned sad_8bit_32x32_avx2(const uint8_t *buf1, const uint8_t *buf2) { const __m256i *const a = (const __m256i *)buf1; const __m256i *const b = (const __m256i *)buf2; @@ -163,7 +164,7 @@ static unsigned sad_8bit_32x32_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2 } -static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * buf2) +static unsigned sad_8bit_64x64_avx2(const uint8_t * buf1, const uint8_t * buf2) { const __m256i *const a = (const __m256i *)buf1; const __m256i *const b = (const __m256i *)buf2; @@ -182,7 +183,7 @@ static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * bu return m256i_horizontal_sum(sum0); } -static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur) +static unsigned satd_4x4_8bit_avx2(const uint8_t *org, const uint8_t *cur) { __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org)); @@ -228,7 +229,7 @@ static unsigned satd_4x4_8bit_avx2(const kvz_pixel *org, const kvz_pixel *cur) static void satd_8bit_4x4_dual_avx2( - const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out) + const pred_buffer preds, const uint8_t * const orig, unsigned num_modes, unsigned *satds_out) { __m256i original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)orig))); @@ -423,14 +424,14 @@ INLINE static void sum_block_dual_avx2(__m256i *ver_row, unsigned *sum0, unsigne *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1)); } -INLINE static __m128i diff_row_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2) +INLINE static __m128i diff_row_avx2(const uint8_t *buf1, const uint8_t *buf2) { __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf1)); __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)buf2)); return _mm_sub_epi16(buf1_row, buf2_row); } -INLINE static __m256i diff_row_dual_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2, const kvz_pixel *orig) +INLINE static __m256i diff_row_dual_avx2(const uint8_t *buf1, const uint8_t *buf2, const uint8_t *orig) { __m128i temp1 = _mm_loadl_epi64((__m128i*)buf1); __m128i temp2 = _mm_loadl_epi64((__m128i*)buf2); @@ -442,8 +443,8 @@ INLINE static __m256i diff_row_dual_avx2(const kvz_pixel *buf1, const kvz_pixel } INLINE static void diff_blocks_avx2(__m128i (*row_diff)[8], - const kvz_pixel * buf1, unsigned stride1, - const kvz_pixel * orig, unsigned stride_orig) + const uint8_t * buf1, unsigned stride1, + const uint8_t * orig, unsigned stride_orig) { (*row_diff)[0] = diff_row_avx2(buf1 + 0 * stride1, orig + 0 * stride_orig); (*row_diff)[1] = diff_row_avx2(buf1 + 1 * stride1, orig + 1 * 
stride_orig); @@ -457,9 +458,9 @@ INLINE static void diff_blocks_avx2(__m128i (*row_diff)[8], } INLINE static void diff_blocks_dual_avx2(__m256i (*row_diff)[8], - const kvz_pixel * buf1, unsigned stride1, - const kvz_pixel * buf2, unsigned stride2, - const kvz_pixel * orig, unsigned stride_orig) + const uint8_t * buf1, unsigned stride1, + const uint8_t * buf2, unsigned stride2, + const uint8_t * orig, unsigned stride_orig) { (*row_diff)[0] = diff_row_dual_avx2(buf1 + 0 * stride1, buf2 + 0 * stride2, orig + 0 * stride_orig); (*row_diff)[1] = diff_row_dual_avx2(buf1 + 1 * stride1, buf2 + 1 * stride2, orig + 1 * stride_orig); @@ -496,9 +497,9 @@ INLINE static void hor_transform_block_dual_avx2(__m256i (*row_diff)[8]) hor_transform_row_dual_avx2((*row_diff) + 7); } -static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned stride1, - const kvz_pixel * buf2, unsigned stride2, - const kvz_pixel * orig, unsigned stride_orig, +static void kvz_satd_8bit_8x8_general_dual_avx2(const uint8_t * buf1, unsigned stride1, + const uint8_t * buf2, unsigned stride2, + const uint8_t * orig, unsigned stride_orig, unsigned *sum0, unsigned *sum1) { __m256i temp[8]; @@ -516,18 +517,18 @@ static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned /** * \brief Calculate SATD between two 4x4 blocks inside bigger arrays. */ -static unsigned kvz_satd_4x4_subblock_8bit_avx2(const kvz_pixel * buf1, +static unsigned kvz_satd_4x4_subblock_8bit_avx2(const uint8_t * buf1, const int32_t stride1, - const kvz_pixel * buf2, + const uint8_t * buf2, const int32_t stride2) { // TODO: AVX2 implementation return kvz_satd_4x4_subblock_generic(buf1, stride1, buf2, stride2); } -static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4], +static void kvz_satd_4x4_subblock_quad_avx2(const uint8_t *preds[4], const int stride, - const kvz_pixel *orig, + const uint8_t *orig, const int orig_stride, unsigned costs[4]) { @@ -535,7 +536,7 @@ static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4], kvz_satd_4x4_subblock_quad_generic(preds, stride, orig, orig_stride, costs); } -static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2) +static unsigned satd_8x8_subblock_8bit_avx2(const uint8_t * buf1, unsigned stride1, const uint8_t * buf2, unsigned stride2) { __m128i temp[8]; @@ -549,9 +550,9 @@ static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned str return result; } -static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds, +static void satd_8x8_subblock_quad_avx2(const uint8_t **preds, const int stride, - const kvz_pixel *orig, + const uint8_t *orig, const int orig_stride, unsigned *costs) { @@ -570,7 +571,7 @@ SATD_ANY_SIZE(8bit_avx2) // multiples of 8x8 with the 8x8 hadamard function. 
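The SATD_NXN_DUAL_AVX2 macro defined just below builds every larger SATD out of 8x8 Hadamard sub-blocks, the same tiling the generic code uses; the "dual" variants simply evaluate two candidate predictions against the same original in one pass. A condensed scalar sketch of the tiling under those assumptions, with a hypothetical stand-in declaration for the 8x8 kernel and the dual-prediction bookkeeping omitted:

#include <stdint.h>

/* Stand-in for the 8x8 sub-block SATD kernel in this file
 * (satd_8x8_subblock_8bit_avx2); declared only so the sketch is
 * self-contained. */
unsigned satd_8x8_kernel(const uint8_t *buf1, unsigned stride1,
                         const uint8_t *buf2, unsigned stride2);

/* n x n SATD (n a multiple of 8) as a sum of 8x8 sub-block SATDs --
 * the loop structure SATD_NXN_DUAL_AVX2 expands to, minus the
 * two-prediction handling. */
static unsigned satd_nxn_tiled_sketch(int n, const uint8_t *buf1, const uint8_t *buf2)
{
  unsigned sum = 0;
  for (int y = 0; y < n; y += 8) {
    for (int x = 0; x < n; x += 8) {
      sum += satd_8x8_kernel(buf1 + y * n + x, n,
                             buf2 + y * n + x, n);
    }
  }
  return sum;
}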
#define SATD_NXN_DUAL_AVX2(n) \ static void satd_8bit_ ## n ## x ## n ## _dual_avx2( \ - const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out) \ + const pred_buffer preds, const uint8_t * const orig, unsigned num_modes, unsigned *satds_out) \ { \ unsigned x, y; \ satds_out[0] = 0; \ @@ -590,7 +591,7 @@ static void satd_8bit_ ## n ## x ## n ## _dual_avx2( \ } static void satd_8bit_8x8_dual_avx2( - const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out) + const pred_buffer preds, const uint8_t * const orig, unsigned num_modes, unsigned *satds_out) { unsigned x, y; satds_out[0] = 0; @@ -618,17 +619,17 @@ SATD_NXN_DUAL_AVX2(64) static cost_pixel_any_size_multi_func satd_any_size_## suffix; \ static void satd_any_size_ ## suffix ( \ int width, int height, \ - const kvz_pixel **preds, \ + const uint8_t **preds, \ const int stride, \ - const kvz_pixel *orig, \ + const uint8_t *orig, \ const int orig_stride, \ unsigned num_modes, \ unsigned *costs_out, \ int8_t *valid) \ { \ unsigned sums[num_parallel_blocks] = { 0 }; \ - const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\ - const kvz_pixel *orig_ptr = orig; \ + const uint8_t *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\ + const uint8_t *orig_ptr = orig; \ costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \ if (width % 8 != 0) { \ /* Process the first column using 4x4 blocks. */ \ @@ -681,7 +682,7 @@ SATD_NXN_DUAL_AVX2(64) SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 4) -static unsigned pixels_calc_ssd_avx2(const kvz_pixel *const ref, const kvz_pixel *const rec, +static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *const rec, const int ref_stride, const int rec_stride, const int width) { @@ -767,9 +768,9 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0, const hi_prec_buf_t*high_precision_rec0, const hi_prec_buf_t*high_precision_rec1, lcu_t* lcu, - kvz_pixel* temp_lcu_y, - kvz_pixel* temp_lcu_u, - kvz_pixel* temp_lcu_v, + uint8_t* temp_lcu_y, + uint8_t* temp_lcu_u, + uint8_t* temp_lcu_v, bool predict_luma, bool predict_chroma) { @@ -799,7 +800,7 @@ bool predict_chroma) int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); } } @@ -834,7 +835,7 @@ bool predict_chroma) int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); int16_t sample1_y = (hi_prec_luma_rec1 ? 
high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); } } @@ -877,11 +878,11 @@ bool predict_chroma) int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); } } @@ -940,11 +941,11 @@ bool predict_chroma) int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); int16_t sample1_v = (hi_prec_chroma_rec1 ? 
high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); } } else { // Load 8 pixels to vector @@ -1013,7 +1014,7 @@ static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width) return NULL; } -static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_avx2(const uint8_t *pic_data, const uint8_t *ref_data, int32_t width, int32_t height, uint32_t stride) { if (width == 0) @@ -1030,7 +1031,7 @@ static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); } -static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_avx2(const uint8_t *pic_data, const uint8_t *ref_data, int32_t width, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { @@ -1051,7 +1052,7 @@ static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat pic_stride, ref_stride, left, right); } -static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len) +static double pixel_var_avx2_largebuf(const uint8_t *buf, const uint32_t len) { const float len_f = (float)len; const __m256i zero = _mm256_setzero_si256(); @@ -1154,7 +1155,7 @@ static __m256i hsum_epi32_to_epi64(const __m256i v) return sums_64; } -static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len) +static double pixel_var_avx2(const uint8_t *buf, const uint32_t len) { assert(sizeof(*buf) == 1); assert((len & 31) == 0); @@ -1223,19 +1224,21 @@ static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len) #else // INACCURATE_VARIANCE_CALCULATION -static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len) +static double pixel_var_avx2(const uint8_t *buf, const uint32_t len) { return pixel_var_avx2_largebuf(buf, len); } #endif // !INACCURATE_VARIANCE_CALCULATION +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 +#if KVZ_BIT_DEPTH == 8 // We don't actually use SAD for intra right now, other than 4x4 for // transform skip, but we might again one day and this is some of the // simplest code to look at for anyone interested in doing more @@ -1271,6 +1274,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "pixel_var", "avx2", 40, &pixel_var_avx2); } +#endif // KVZ_BIT_DEPTH == 8 #endif return success; } diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index 77759cb2..e8e42a30 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -506,35 +506,37 @@ void kvz_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr #undef LOG2_SCAN_SET_SIZE } -static INLINE __m128i get_residual_4x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){ +#if KVZ_BIT_DEPTH == 8 + +static INLINE __m128i get_residual_4x1_avx2(const uint8_t *a_in, const uint8_t *b_in){ __m128i a = _mm_cvtsi32_si128(*(int32_t*)a_in); __m128i b = _mm_cvtsi32_si128(*(int32_t*)b_in); __m128i 
diff = _mm_sub_epi16(_mm_cvtepu8_epi16(a), _mm_cvtepu8_epi16(b) ); return diff; } -static INLINE __m128i get_residual_8x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){ +static INLINE __m128i get_residual_8x1_avx2(const uint8_t *a_in, const uint8_t *b_in){ __m128i a = _mm_cvtsi64_si128(*(int64_t*)a_in); __m128i b = _mm_cvtsi64_si128(*(int64_t*)b_in); __m128i diff = _mm_sub_epi16(_mm_cvtepu8_epi16(a), _mm_cvtepu8_epi16(b) ); return diff; } -static INLINE int32_t get_quantized_recon_4x1_avx2(int16_t *residual, const kvz_pixel *pred_in){ +static INLINE int32_t get_quantized_recon_4x1_avx2(int16_t *residual, const uint8_t *pred_in){ __m128i res = _mm_loadl_epi64((__m128i*)residual); __m128i pred = _mm_cvtsi32_si128(*(int32_t*)pred_in); __m128i rec = _mm_add_epi16(res, _mm_cvtepu8_epi16(pred)); return _mm_cvtsi128_si32(_mm_packus_epi16(rec, rec)); } -static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const kvz_pixel *pred_in){ +static INLINE int64_t get_quantized_recon_8x1_avx2(int16_t *residual, const uint8_t *pred_in){ __m128i res = _mm_loadu_si128((__m128i*)residual); __m128i pred = _mm_cvtsi64_si128(*(int64_t*)pred_in); __m128i rec = _mm_add_epi16(res, _mm_cvtepu8_epi16(pred)); return _mm_cvtsi128_si64(_mm_packus_epi16(rec, rec)); } -static void get_residual_avx2(const kvz_pixel *ref_in, const kvz_pixel *pred_in, int16_t *residual, int width, int in_stride){ +static void get_residual_avx2(const uint8_t *ref_in, const uint8_t *pred_in, int16_t *residual, int width, int in_stride){ __m128i diff = _mm_setzero_si128(); switch (width) { @@ -579,7 +581,7 @@ static void get_residual_avx2(const kvz_pixel *ref_in, const kvz_pixel *pred_in, } } -static void get_quantized_recon_avx2(int16_t *residual, const kvz_pixel *pred_in, int in_stride, kvz_pixel *rec_out, int out_stride, int width){ +static void get_quantized_recon_avx2(int16_t *residual, const uint8_t *pred_in, int in_stride, uint8_t *rec_out, int out_stride, int width){ switch (width) { case 4: @@ -629,8 +631,8 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state, const cu_info_t *const cur_cu, const int width, const color_t color, const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, - const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out, + const uint8_t *const ref_in, const uint8_t *const pred_in, + uint8_t *rec_out, coeff_t *coeff_out, bool early_skip) { // Temporary arrays to pass data to and from kvz_quant and transform functions. 
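For context, the two helper families changed above reduce to simple per-pixel arithmetic once the SIMD packing is stripped away: get_residual_*_avx2 computes the widened difference ref - pred, and get_quantized_recon_*_avx2 adds the (transform/quantization round-tripped) residual back onto the prediction, with the final _mm_packus_epi16 providing the clamp to 0..255. A scalar sketch mirroring the AVX2 signatures, assuming the 8-bit range guaranteed by the new KVZ_BIT_DEPTH == 8 guard:

#include <stdint.h>

/* residual[y][x] = ref[y][x] - pred[y][x], widened to int16_t
 * (the scalar equivalent of get_residual_avx2). */
static void get_residual_scalar(const uint8_t *ref_in, const uint8_t *pred_in,
                                int16_t *residual, int width, int in_stride)
{
  for (int y = 0; y < width; ++y) {
    for (int x = 0; x < width; ++x) {
      residual[y * width + x] =
          (int16_t)(ref_in[y * in_stride + x] - pred_in[y * in_stride + x]);
    }
  }
}

/* rec[y][x] = clip(pred[y][x] + residual[y][x], 0, 255); the unsigned
 * saturating pack in the AVX2 path performs this clipping implicitly. */
static void get_quantized_recon_scalar(const int16_t *residual, const uint8_t *pred_in,
                                       int in_stride, uint8_t *rec_out,
                                       int out_stride, int width)
{
  for (int y = 0; y < width; ++y) {
    for (int x = 0; x < width; ++x) {
      int val = pred_in[y * in_stride + x] + residual[y * width + x];
      rec_out[y * out_stride + x] = (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }
  }
}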
@@ -766,6 +768,8 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef } } +#endif // KVZ_BIT_DEPTH == 8 + static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length) { assert(length % 8 == 0); @@ -870,11 +874,13 @@ int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) bool success = true; #if COMPILE_INTEL_AVX2 && defined X86_64 - success &= kvz_strategyselector_register(opaque, "quant", "avx2", 40, &kvz_quant_avx2); +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8) { success &= kvz_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &kvz_quantize_residual_avx2); success &= kvz_strategyselector_register(opaque, "dequant", "avx2", 40, &kvz_dequant_avx2); } +#endif // KVZ_BIT_DEPTH == 8 + success &= kvz_strategyselector_register(opaque, "quant", "avx2", 40, &kvz_quant_avx2); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2); success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "avx2", 40, &fast_coeff_cost_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 diff --git a/src/strategies/avx2/reg_sad_pow2_widths-avx2.h b/src/strategies/avx2/reg_sad_pow2_widths-avx2.h index 182e5bb9..ed2cbb54 100644 --- a/src/strategies/avx2/reg_sad_pow2_widths-avx2.h +++ b/src/strategies/avx2/reg_sad_pow2_widths-avx2.h @@ -21,10 +21,13 @@ #ifndef REG_SAD_POW2_WIDTHS_AVX2_H_ #define REG_SAD_POW2_WIDTHS_AVX2_H_ -#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" #include "kvazaar.h" -static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixel * const data2, +#if KVZ_BIT_DEPTH == 8 + +#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" + +static INLINE uint32_t reg_sad_w32(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -74,7 +77,7 @@ static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixe return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w64(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -129,7 +132,7 @@ static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixe return _mm_cvtsi128_si32(sad); } -static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_avx2_w32(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t pic_stride, uint32_t ref_stride, const uint32_t left, const uint32_t right) { @@ -206,4 +209,6 @@ static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref return _mm_cvtsi128_si32(sad); } +#endif // KVZ_BIT_DEPTH == 8 + #endif diff --git a/src/strategies/avx2/sao-avx2.c b/src/strategies/avx2/sao-avx2.c index e42f911a..ce0e3ba3 100644 --- a/src/strategies/avx2/sao-avx2.c +++ b/src/strategies/avx2/sao-avx2.c @@ -21,6 +21,8 @@ #include "strategies/avx2/sao-avx2.h" #if COMPILE_INTEL_AVX2 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 #include #include @@ -31,7 +33,6 @@ #include "cu.h" #include "encoder.h" #include "encoderstate.h" -#include "kvazaar.h" #include "sao.h" #include "strategyselector.h" @@ -271,12 +272,12 @@ static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a, return calc_diff_off_delta(diff_lo, diff_hi, offset, orig); } -static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, - const 
kvz_pixel *rec_data, - int32_t block_width, - int32_t block_height, - int32_t eo_class, - const int32_t offsets[NUM_SAO_EDGE_CATEGORIES]) +static int32_t sao_edge_ddistortion_avx2(const uint8_t *orig_data, + const uint8_t *rec_data, + int32_t block_width, + int32_t block_height, + int32_t eo_class, + const int32_t offsets[NUM_SAO_EDGE_CATEGORIES]) { int32_t y, x; vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; @@ -407,12 +408,12 @@ static void FIX_W32 calc_edge_dir_one_ymm(const __m256i a, } } -static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int32_t eo_class, - int32_t block_width, - int32_t block_height, - int32_t cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) +static void calc_sao_edge_dir_avx2(const uint8_t *orig_data, + const uint8_t *rec_data, + int32_t eo_class, + int32_t block_width, + int32_t block_height, + int32_t cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) { vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; @@ -608,8 +609,8 @@ static __m256i lookup_color_band_ymm(const __m256i curr_row, } static INLINE void reconstruct_color_band(const encoder_control_t *encoder, - const kvz_pixel *rec_data, - kvz_pixel *new_rec_data, + const uint8_t *rec_data, + uint8_t *new_rec_data, const sao_info_t *sao, int32_t stride, int32_t new_stride, @@ -695,8 +696,8 @@ static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a, } static INLINE void reconstruct_color_other(const encoder_control_t *encoder, - const kvz_pixel *rec_data, - kvz_pixel *new_rec_data, + const uint8_t *rec_data, + uint8_t *new_rec_data, const sao_info_t *sao, int32_t stride, int32_t new_stride, @@ -784,8 +785,8 @@ static INLINE void reconstruct_color_other(const encoder_control_t *encoder, } static void sao_reconstruct_color_avx2(const encoder_control_t *encoder, - const kvz_pixel *rec_data, - kvz_pixel *new_rec_data, + const uint8_t *rec_data, + uint8_t *new_rec_data, const sao_info_t *sao, int32_t stride, int32_t new_stride, @@ -884,18 +885,21 @@ use_generic: block_height, band_pos, sao_bands); } +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_sao_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8) { success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "avx2", 40, &sao_edge_ddistortion_avx2); success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "avx2", 40, &calc_sao_edge_dir_avx2); success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "avx2", 40, &sao_reconstruct_color_avx2); success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "avx2", 40, &sao_band_ddistortion_avx2); } +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_AVX2 return success; } diff --git a/src/strategies/sse2/picture-sse2.c b/src/strategies/sse2/picture-sse2.c index 9e6413b3..421c979f 100644 --- a/src/strategies/sse2/picture-sse2.c +++ b/src/strategies/sse2/picture-sse2.c @@ -21,14 +21,15 @@ #include "strategies/sse2/picture-sse2.h" #if COMPILE_INTEL_SSE2 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 #include #include -#include "kvazaar.h" #include "strategyselector.h" -static unsigned reg_sad_sse2(const kvz_pixel * const data1, const kvz_pixel * const data2, +static unsigned reg_sad_sse2(const uint8_t * const data1, const uint8_t * const data2, const int width, const int height, const unsigned stride1, const unsigned stride2) { int y, x; @@ -53,7 +54,7 @@ static unsigned 
reg_sad_sse2(const kvz_pixel * const data1, const kvz_pixel * co return sad; } -static unsigned sad_8bit_4x4_sse2(const kvz_pixel *buf1, const kvz_pixel *buf2) +static unsigned sad_8bit_4x4_sse2(const uint8_t *buf1, const uint8_t *buf2) { const __m128i *const mbuf1 = (const __m128i *)buf1; const __m128i *const mbuf2 = (const __m128i *)buf2; @@ -65,15 +66,18 @@ static unsigned sad_8bit_4x4_sse2(const kvz_pixel *buf1, const kvz_pixel *buf2) return result[0] + result[2]; } +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_SSE2 int kvz_strategy_register_picture_sse2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_SSE2 +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "reg_sad", "sse2", 10, ®_sad_sse2); success &= kvz_strategyselector_register(opaque, "sad_4x4", "sse2", 10, &sad_8bit_4x4_sse2); } +#endif // KVZ_BIT_DEPTH == 8 #endif return success; } diff --git a/src/strategies/sse41/picture-sse41.c b/src/strategies/sse41/picture-sse41.c index dd3aa585..2f1e4f18 100644 --- a/src/strategies/sse41/picture-sse41.c +++ b/src/strategies/sse41/picture-sse41.c @@ -21,16 +21,17 @@ #include "global.h" #if COMPILE_INTEL_SSE41 +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 #include "strategies/sse41/picture-sse41.h" #include "strategies/sse41/reg_sad_pow2_widths-sse41.h" #include #include -#include "kvazaar.h" #include "strategyselector.h" -uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, +uint32_t kvz_reg_sad_sse41(const uint8_t * const data1, const uint8_t * const data2, const int32_t width, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -68,7 +69,7 @@ static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width) return NULL; } -static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_sse41(const uint8_t *pic_data, const uint8_t *ref_data, int32_t width, int32_t height, uint32_t stride) { if (width == 0) @@ -85,7 +86,7 @@ static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_da return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); } -static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_sse41_w32(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { @@ -194,7 +195,7 @@ static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *re return _mm_cvtsi128_si32(sad); } -static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_sse41(const uint8_t *pic_data, const uint8_t *ref_data, int32_t width, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { @@ -215,18 +216,21 @@ static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_da pic_stride, ref_stride, left, right); } +#endif // KVZ_BIT_DEPTH == 8 #endif //COMPILE_INTEL_SSE41 int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_SSE41 +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41); success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41); success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41); success &= 
kvz_strategyselector_register(opaque, "hor_sad", "sse41", 20, &hor_sad_sse41); } +#endif // KVZ_BIT_DEPTH == 8 #endif return success; } diff --git a/src/strategies/sse41/reg_sad_pow2_widths-sse41.h b/src/strategies/sse41/reg_sad_pow2_widths-sse41.h index 433265ec..ec401695 100644 --- a/src/strategies/sse41/reg_sad_pow2_widths-sse41.h +++ b/src/strategies/sse41/reg_sad_pow2_widths-sse41.h @@ -22,17 +22,20 @@ #define REG_SAD_POW2_WIDTHS_SSE41_H_ #include "kvazaar.h" + +#if KVZ_BIT_DEPTH == 8 + #include "strategies/missing-intel-intrinsics.h" #include -static INLINE uint32_t reg_sad_w0(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w0(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { return 0; } -static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w4(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -71,7 +74,7 @@ static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w8(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -122,7 +125,7 @@ static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w12(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -141,7 +144,7 @@ static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixe return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w16(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -186,7 +189,7 @@ static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixe return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_w24(const uint8_t * const data1, const uint8_t * const data2, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -238,7 +241,7 @@ static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixe return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kvz_pixel * const data2, +static INLINE uint32_t reg_sad_arbitrary(const uint8_t * const data1, const uint8_t * const data2, const int32_t width, const int32_t height, const uint32_t stride1, const uint32_t stride2) { @@ -334,7 +337,7 @@ static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kv return _mm_cvtsi128_si32(sad); } -static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_w4(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t stride) { __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data); @@ -371,7 +374,7 @@ static uint32_t ver_sad_w4(const 
kvz_pixel *pic_data, const kvz_pixel *ref_data, return _mm_cvtsi128_si32(sad); } -static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_w8(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t stride) { const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data); @@ -415,7 +418,7 @@ static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, return _mm_cvtsi128_si32(sad); } -static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_w12(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t stride) { const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); @@ -434,7 +437,7 @@ static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data return _mm_cvtsi128_si32(sad); } -static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_w16(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t stride) { const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); @@ -474,7 +477,7 @@ static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data return _mm_cvtsi128_si32(sad); } -static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t ver_sad_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data, int32_t width, int32_t height, uint32_t stride) { int32_t y, x; @@ -561,7 +564,7 @@ static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *re return _mm_cvtsi128_si32(sad); } -static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_sse41_w4(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { @@ -625,7 +628,7 @@ static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref return _mm_cvtsi128_si32(sad); } -static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_sse41_w8(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { @@ -738,7 +741,7 @@ static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref * border pixel, and use a suitable mask to fill all the other pixels with * that value. 
*/ -static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static uint32_t hor_sad_sse41_w16(const uint8_t *pic_data, const uint8_t *ref_data, int32_t height, uint32_t pic_stride, uint32_t ref_stride, const uint32_t left, const uint32_t right) { @@ -821,7 +824,7 @@ static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *re return _mm_cvtsi128_si32(sad); } -static INLINE uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, +static INLINE uint32_t hor_sad_sse41_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data, int32_t width, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { @@ -1024,4 +1027,6 @@ static INLINE uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const return _mm_cvtsi128_si32(sad); } +#endif // KVZ_BIT_DEPTH == 8 + #endif diff --git a/src/strategies/x86_asm/picture-x86-asm-sad.h b/src/strategies/x86_asm/picture-x86-asm-sad.h index 6f108038..25ecba2b 100644 --- a/src/strategies/x86_asm/picture-x86-asm-sad.h +++ b/src/strategies/x86_asm/picture-x86-asm-sad.h @@ -27,17 +27,18 @@ */ #include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 +unsigned kvz_sad_4x4_avx(const uint8_t*, const uint8_t*); +unsigned kvz_sad_8x8_avx(const uint8_t*, const uint8_t*); +unsigned kvz_sad_16x16_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_4x4_avx(const kvz_pixel*, const kvz_pixel*); -unsigned kvz_sad_8x8_avx(const kvz_pixel*, const kvz_pixel*); -unsigned kvz_sad_16x16_avx(const kvz_pixel*, const kvz_pixel*); - -unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); -unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); -unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); -unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); -unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride); - +unsigned kvz_sad_4x4_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); +unsigned kvz_sad_8x8_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); +unsigned kvz_sad_16x16_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); +unsigned kvz_sad_32x32_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); +unsigned kvz_sad_64x64_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); +#endif // KVZ_BIT_DEPTH == 8 #endif diff --git a/src/strategies/x86_asm/picture-x86-asm.c b/src/strategies/x86_asm/picture-x86-asm.c index 12b4b93c..b1ab63fd 100644 --- a/src/strategies/x86_asm/picture-x86-asm.c +++ b/src/strategies/x86_asm/picture-x86-asm.c @@ -21,16 +21,17 @@ #include "strategies/x86_asm/picture-x86-asm.h" #if defined(KVZ_COMPILE_ASM) +#include "kvazaar.h" +#if KVZ_BIT_DEPTH == 8 #include -#include "kvazaar.h" #include "strategies/x86_asm/picture-x86-asm-sad.h" #include "strategies/x86_asm/picture-x86-asm-satd.h" #include "strategies/sse41/picture-sse41.h" #include "strategyselector.h" -static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2) +static unsigned kvz_sad_32x32_avx(const uint8_t *data1, const uint8_t *data2) { unsigned sad = 0; sad += kvz_sad_16x16_avx(data1, data2); @@ -40,7 +41,7 @@ static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2 return sad; } -static 
unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2) +static unsigned kvz_sad_64x64_avx(const uint8_t *data1, const uint8_t *data2) { unsigned sad = 0; sad += kvz_sad_32x32_avx(data1, data2); @@ -50,7 +51,7 @@ static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2 return sad; } -static unsigned kvz_sad_other_avx(const kvz_pixel *data1, const kvz_pixel *data2, +static unsigned kvz_sad_other_avx(const uint8_t *data1, const uint8_t *data2, int width, int height, unsigned stride) { @@ -65,7 +66,7 @@ static unsigned kvz_sad_other_avx(const kvz_pixel *data1, const kvz_pixel *data2 return sad; } -static unsigned reg_sad_x86_asm(const kvz_pixel *data1, const kvz_pixel * data2, +static unsigned reg_sad_x86_asm(const uint8_t *data1, const uint8_t * data2, const int width, const int height, const unsigned stride1, const unsigned stride2) { @@ -90,12 +91,14 @@ static unsigned reg_sad_x86_asm(const kvz_pixel *data1, const kvz_pixel * data2, } } +#endif // KVZ_BIT_DEPTH == 8 #endif //defined(KVZ_COMPILE_ASM) int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) { bool success = true; #if defined(KVZ_COMPILE_ASM) +#if KVZ_BIT_DEPTH == 8 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, ®_sad_x86_asm); @@ -111,6 +114,7 @@ int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx); success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx); } +#endif // KVZ_BIT_DEPTH == 8 #endif //!defined(KVZ_COMPILE_ASM) return success; }
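Taken together, every file touched here follows the same template: include kvazaar.h early so KVZ_BIT_DEPTH is visible, move the byte-oriented kernels (now written against uint8_t rather than kvz_pixel, since kvz_pixel widens to uint16_t in high-bit-depth builds) inside a KVZ_BIT_DEPTH == 8 block, and register them only under that guard in addition to the existing runtime bitdepth check, while bit-depth-independent strategies such as quant, coeff_abs_sum and get_extended_block remain registered unconditionally. A condensed sketch of that template for a hypothetical strategy file; the kernel bodies and function names are illustrative, not part of this patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "kvazaar.h"            /* provides KVZ_BIT_DEPTH and kvz_pixel */
#include "strategyselector.h"   /* kvz_strategyselector_register()      */

#if COMPILE_INTEL_AVX2
#if KVZ_BIT_DEPTH == 8
/* 8-bit-only kernel: under this guard it is safe to take uint8_t* directly. */
static unsigned example_sad_8x8_avx2(const uint8_t *a, const uint8_t *b)
{
  unsigned sad = 0;
  for (int i = 0; i < 64; ++i) {
    sad += (unsigned)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
  }
  return sad;
}
#endif // KVZ_BIT_DEPTH == 8

/* Bit-depth-independent kernel stays outside the guard (cf. quant,
 * coeff_abs_sum and get_extended_block in this patch). */
static uint32_t example_coeff_abs_sum_avx2(const int16_t *coeffs, size_t length)
{
  uint32_t sum = 0;
  for (size_t i = 0; i < length; ++i) {
    sum += (uint32_t)(coeffs[i] < 0 ? -coeffs[i] : coeffs[i]);
  }
  return sum;
}
#endif // COMPILE_INTEL_AVX2

int example_strategy_register_avx2(void *opaque, uint8_t bitdepth)
{
  bool success = true;
#if COMPILE_INTEL_AVX2
#if KVZ_BIT_DEPTH == 8
  if (bitdepth == 8) {
    success &= kvz_strategyselector_register(opaque, "sad_8x8", "avx2", 40, &example_sad_8x8_avx2);
  }
#endif // KVZ_BIT_DEPTH == 8
  success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &example_coeff_abs_sum_avx2);
#endif // COMPILE_INTEL_AVX2
  return success;
}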