From 84222cf3e7f3c3d20e68b4d0f3dba6eac8b7fe65 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 19:48:57 +0200 Subject: [PATCH 01/19] Replace old block extrapolation with more capable one. Separate paddings for different directions can be now specified. --- src/image.c | 55 ++-- src/inter.c | 386 ++++++++++++++------------ src/search_inter.c | 44 ++- src/strategies/avx2/ipol-avx2.c | 145 +++------- src/strategies/generic/ipol-generic.c | 77 +++-- src/strategies/strategies-ipol.h | 30 +- 6 files changed, 364 insertions(+), 373 deletions(-) diff --git a/src/image.c b/src/image.c index 404bc48c..9fa47c64 100644 --- a/src/image.c +++ b/src/image.c @@ -477,33 +477,42 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic, ref->stride) >> (KVZ_BIT_DEPTH - 8); } else { // Extrapolate pixels from outside the frame. - kvz_extended_block block; - kvz_get_extended_block(pic_x, - pic_y, - ref_x - pic_x, - ref_y - pic_y, - 0, - 0, - ref->y, - ref->width, - ref->height, - 0, - block_width, - block_height, - &block); + + // Space for extrapolated pixels and the part from the picture + // The extrapolation function will set the pointers and stride. + kvz_pixel ext_buffer[LCU_LUMA_SIZE]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = ref_x, + .blk_y = ref_y, + .blk_w = block_width, + .blk_h = block_height, + .pad_l = 0, + .pad_r = 0, + .pad_t = 0, + .pad_b = 0, + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; + + kvz_get_extended_block(&epol_args); const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x]; unsigned satd = kvz_satd_any_size(block_width, - block_height, - pic_data, - pic->stride, - block.buffer, - block.stride) >> (KVZ_BIT_DEPTH - 8); - - if (block.malloc_used) { - FREE_POINTER(block.buffer); - } + block_height, + pic_data, + pic->stride, + ext_origin, + ext_s) >> (KVZ_BIT_DEPTH - 8); return satd; } diff --git a/src/inter.c b/src/inter.c index 63718483..f34a2a2a 100644 --- a/src/inter.c +++ b/src/inter.c @@ -40,224 +40,242 @@ typedef struct { } merge_candidates_t; -static void inter_recon_frac_luma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - lcu_t *lcu) +static void inter_recon_frac_luma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + lcu_t *lcu) { int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - // Fractional luma 1/4-pel - kvz_extended_block src = {0, 0, 0, 0}; + // Space for extrapolated pixels and the part from the picture. + // One extra row for AVX2. + // The extrapolation function will set the pointers and stride. 
+ kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2), + .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2), + .blk_w = block_width, + .blk_h = block_height, + .pad_l = KVZ_LUMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_t = KVZ_LUMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - // Fractional luma - kvz_get_extended_block(xpos, - ypos, - mv_param[0] >> 2, - mv_param[1] >> 2, - state->tile->offset_x, - state->tile->offset_y, - ref->y, - ref->width, - ref->height, - KVZ_LUMA_FILTER_TAPS, - block_width, - block_height, - &src); + kvz_get_extended_block(&epol_args); kvz_sample_quarterpel_luma(state->encoder_control, - src.orig_topleft, - src.stride, - block_width, - block_height, - lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), - LCU_WIDTH, - mv_frac_x, - mv_frac_y, - mv_param); - - if (src.malloc_used) free(src.buffer); + ext_origin, + ext_s, + block_width, + block_height, + lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), + LCU_WIDTH, + mv_frac_x, + mv_frac_y, + mv_param); } -static void inter_recon_14bit_frac_luma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - hi_prec_buf_t *hi_prec_out) +static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + hi_prec_buf_t *hi_prec_out) { int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - // Fractional luma 1/4-pel - kvz_extended_block src = { 0, 0, 0, 0 }; + // Space for extrapolated pixels and the part from the picture. + // One extra row for AVX2. + // The extrapolation function will set the pointers and stride. 
+ kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2), + .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2), + .blk_w = block_width, + .blk_h = block_height, + .pad_l = KVZ_LUMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_t = KVZ_LUMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - // Fractional luma - kvz_get_extended_block(xpos, - ypos, - mv_param[0] >> 2, - mv_param[1] >> 2, - state->tile->offset_x, - state->tile->offset_y, - ref->y, - ref->width, - ref->height, - KVZ_LUMA_FILTER_TAPS, - block_width, - block_height, - &src); + kvz_get_extended_block(&epol_args); kvz_sample_14bit_quarterpel_luma(state->encoder_control, - src.orig_topleft, - src.stride, - block_width, - block_height, - hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), - LCU_WIDTH, - mv_frac_x, - mv_frac_y, - mv_param); - - if (src.malloc_used) free(src.buffer); + ext_origin, + ext_s, + block_width, + block_height, + hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), + LCU_WIDTH, + mv_frac_x, + mv_frac_y, + mv_param); } -static void inter_recon_frac_chroma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - lcu_t *lcu) +static void inter_recon_frac_chroma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + lcu_t *lcu) { int mv_frac_x = (mv_param[0] & 7); int mv_frac_y = (mv_param[1] & 7); - // Translate to chroma - xpos >>= 1; - ypos >>= 1; - block_width >>= 1; - block_height >>= 1; + // Space for extrapolated pixels and the part from the picture. + // Three extra rows for AVX2. + // The extrapolation function will set the pointers and stride. 
+ kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; - // Fractional chroma 1/8-pel - kvz_extended_block src_u = { 0, 0, 0, 0 }; - kvz_extended_block src_v = { 0, 0, 0, 0 }; + // Chroma U + // Divisions by 2 due to 4:2:0 chroma subsampling + kvz_epol_args epol_args = { + .src = ref->u, + .src_w = ref->width / 2, + .src_h = ref->height / 2, + .src_s = ref->stride / 2, + .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3), + .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3), + .blk_w = block_width / 2, + .blk_h = block_height / 2, + .pad_l = KVZ_CHROMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_t = KVZ_CHROMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - //Fractional chroma U - kvz_get_extended_block(xpos, ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->u, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_u); - kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width, - block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + kvz_get_extended_block(&epol_args); + kvz_sample_octpel_chroma(state->encoder_control, + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); - //Fractional chroma V - kvz_get_extended_block(xpos, ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->v, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_v); - kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, - block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); - - if (src_u.malloc_used) free(src_u.buffer); - if (src_v.malloc_used) free(src_v.buffer); + // Chroma V + epol_args.src = ref->v; + kvz_get_extended_block(&epol_args); + kvz_sample_octpel_chroma(state->encoder_control, + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); } -static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - hi_prec_buf_t *hi_prec_out) +static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + hi_prec_buf_t *hi_prec_out) { int mv_frac_x = (mv_param[0] & 7); int mv_frac_y = (mv_param[1] & 7); - // Translate to chroma - xpos >>= 1; - ypos >>= 1; - block_width >>= 1; - block_height >>= 1; + // Space for extrapolated pixels and the part from the picture. + // Three extra rows for AVX2. 
+ // The extrapolation function will set the pointers and stride. + kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; - // Fractional chroma 1/8-pel - kvz_extended_block src_u = { 0, 0, 0, 0 }; - kvz_extended_block src_v = { 0, 0, 0, 0 }; + // Chroma U + // Divisions by 2 due to 4:2:0 chroma subsampling + kvz_epol_args epol_args = { + .src = ref->u, + .src_w = ref->width / 2, + .src_h = ref->height / 2, + .src_s = ref->stride / 2, + .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3), + .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3), + .blk_w = block_width / 2, + .blk_h = block_height / 2, + .pad_l = KVZ_CHROMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_t = KVZ_CHROMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - //Fractional chroma U - kvz_get_extended_block(xpos, - ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->u, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_u); + kvz_get_extended_block(&epol_args); kvz_sample_14bit_octpel_chroma(state->encoder_control, - src_u.orig_topleft, - src_u.stride, - block_width, - block_height, - hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), - LCU_WIDTH_C, - mv_frac_x, - mv_frac_y, - mv_param); + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); - //Fractional chroma V - kvz_get_extended_block(xpos, - ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->v, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_v); + // Chroma V + epol_args.src = ref->v; + kvz_get_extended_block(&epol_args); kvz_sample_14bit_octpel_chroma(state->encoder_control, - src_v.orig_topleft, - src_v.stride, - block_width, - block_height, - hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), - LCU_WIDTH_C, - mv_frac_x, - mv_frac_y, - mv_param); - - if (src_u.malloc_used) free(src_u.buffer); - if (src_v.malloc_used) free(src_v.buffer); + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); } diff --git a/src/search_inter.c b/src/search_inter.c index e13a491e..0fcd70c6 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -992,7 +992,6 @@ static void search_frac(inter_search_info_t *info) unsigned costs[4] = { 0 }; - kvz_extended_block src = { 0, 0, 0, 0 }; ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH]; // Storage buffers for intermediate horizontally filtered results. 
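/* A minimal sketch (not part of the patch) of the size requirement the new
 * kvz_get_extended_block() interface places on the caller-provided buffer.
 * The helper name below is hypothetical; the fields and the stride formula
 * (pad_l + blk_w + pad_r) are taken from kvz_get_extended_block_generic()
 * and the kvz_epol_args definition added later in this patch
 * (src/strategies/strategies-ipol.h). */
static int epol_buffer_is_large_enough(const kvz_epol_args *args, size_t buf_len)
{
  /* When the requested area crosses a picture border, the extrapolated block
   * written into args->buf is (pad_l + blk_w + pad_r) x (pad_t + blk_h + pad_b)
   * pixels, and *args->ext_s is set to its width. Otherwise args->buf is not
   * touched and the pointers refer directly into the reference picture. */
  size_t ext_w = (size_t)(args->pad_l + args->blk_w + args->pad_r);
  size_t ext_h = (size_t)(args->pad_t + args->blk_h + args->pad_b);
  return ext_w * ext_h <= buf_len;
}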
@@ -1013,20 +1012,41 @@ static void search_frac(inter_search_info_t *info) int8_t sample_off_x = 0; int8_t sample_off_y = 0; - kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1, - state->tile->offset_x, - state->tile->offset_y, - ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS, - internal_width+1, internal_height+1, - &src); + // Space for (possibly) extrapolated pixels and the part from the picture + // One extra column for ME and two extra columns for ME and AVX2 + // The extrapolation function will set the pointers and stride. + kvz_pixel ext_buffer[(KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 2)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = state->tile->offset_x + orig.x + mv.x - 1, + .blk_y = state->tile->offset_y + orig.y + mv.y - 1, + .blk_w = internal_width + 1, // TODO: real width + .blk_h = internal_height + 1, // TODO: real height + .pad_l = KVZ_LUMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_t = KVZ_LUMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; + + kvz_get_extended_block(&epol_args); kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x; int tmp_stride = pic->stride; // Search integer position costs[0] = kvz_satd_any_size(width, height, - tmp_pic, tmp_stride, - src.orig_topleft + src.stride + 1, src.stride); + tmp_pic, tmp_stride, + ext_origin + ext_s + 1, ext_s); costs[0] += info->mvd_cost_func(state, mv.x, mv.y, 2, @@ -1056,8 +1076,8 @@ static void search_frac(inter_search_info_t *info) const int mv_shift = (step < 2) ? 
1 : 0; filter_steps[step](state->encoder_control, - src.orig_topleft, - src.stride, + ext_origin, + ext_s, internal_width, internal_height, filtered, @@ -1131,8 +1151,6 @@ static void search_frac(inter_search_info_t *info) info->best_mv = mv; info->best_cost = best_cost; info->best_bitcost = best_bitcost; - - if (src.malloc_used) free(src.buffer); } /** diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index c6fd0d8a..8ba28e66 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -25,25 +25,23 @@ #include "strategies/avx2/ipol-avx2.h" #if COMPILE_INTEL_AVX2 -#include "kvazaar.h" - #include #include #include #include "encoder.h" +#include "kvazaar.h" #include "search_inter.h" #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "strategyselector.h" #include "strategies/generic/ipol-generic.h" -#if KVZ_BIT_DEPTH == 8 extern int8_t kvz_g_luma_filter[4][8]; extern int8_t kvz_g_chroma_filter[8][4]; -static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, uint8_t *data) +static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) { __m128i fir = _mm_loadl_epi64((__m128i*)filter); __m128i row = _mm_loadl_epi64((__m128i*)data); @@ -102,7 +100,7 @@ static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 } -static void kvz_eight_tap_filter_hor_8x1_avx2(uint8_t *data, int16_t * out, +static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, __m256i *shuf_01_23, __m256i *shuf_45_67, __m256i *taps_01_23, __m256i *taps_45_67) { @@ -119,7 +117,7 @@ static void kvz_eight_tap_filter_hor_8x1_avx2(uint8_t *data, int16_t * out, _mm_storeu_si128((__m128i*)out, filtered); } -static void kvz_four_tap_filter_hor_4x4_avx2(uint8_t *data, int stride, int16_t * out, int out_stride, +static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, __m256i *shuf_01, __m256i *shuf_23, __m256i *taps_01, __m256i *taps_23) { @@ -145,7 +143,7 @@ static void kvz_four_tap_filter_hor_4x4_avx2(uint8_t *data, int stride, int16_t _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); } -static void kvz_four_tap_filter_hor_4xN_avx2(uint8_t *data, int stride, int16_t * out, int out_stride, +static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, __m256i *shuf_01_23, __m256i *taps_01_23, int rows) { @@ -179,7 +177,7 @@ static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data return filtered; } -static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out) +static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out) { // Interpolation filter shifts int32_t shift2 = 6; @@ -245,7 +243,7 @@ static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *dat _mm_storel_epi64((__m128i*)out, filtered); } -static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out, int16_t out_stride) +static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) { // Interpolation filter shifts int32_t shift2 = 6; @@ -368,7 +366,7 @@ static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int1 
_mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); } -INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, uint8_t * out, int64_t out_stride) +INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) { // Interpolation filter shifts int32_t shift2 = 6; @@ -591,7 +589,7 @@ INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t s _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); } -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, uint8_t *out, int out_stride) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) { // Filter even rows filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 @@ -612,11 +610,11 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -697,10 +695,10 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // VERTICAL STEP - uint8_t *out_l = filtered[0]; - uint8_t *out_r = filtered[1]; - uint8_t *out_t = filtered[2]; - uint8_t *out_b = filtered[3]; + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; __m256i taps[4]; kvz_init_ver_filter_taps(fir0, taps); @@ -748,11 +746,11 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -776,10 +774,10 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t *col_pos2 = hor_first_cols[2]; // VERTICAL STEP - uint8_t *out_tl = filtered[0]; - uint8_t *out_tr = filtered[1]; - uint8_t *out_bl = filtered[2]; - uint8_t *out_br = filtered[3]; + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; __m256i taps[4]; kvz_init_ver_filter_taps(fir2, taps); @@ -831,11 +829,11 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco } static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -930,10 +928,10 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // VERTICAL STEP - uint8_t *out_l = filtered[0]; - uint8_t *out_r = 
filtered[1]; - uint8_t *out_t = filtered[2]; - uint8_t *out_b = filtered[3]; + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; @@ -1058,11 +1056,11 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -1090,10 +1088,10 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t hor_stride = LCU_WIDTH; // VERTICAL STEP - uint8_t *out_tl = filtered[0]; - uint8_t *out_tr = filtered[1]; - uint8_t *out_bl = filtered[2]; - uint8_t *out_br = filtered[3]; + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; @@ -1216,11 +1214,11 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t *dst, + kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, @@ -1270,7 +1268,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, @@ -1325,11 +1323,11 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t *dst, + kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, @@ -1387,7 +1385,7 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encode } static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, @@ -1449,73 +1447,12 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const } } -#endif //KVZ_BIT_DEPTH == 8 - -void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filter_size, int width, int height, kvz_extended_block *out) { - - int half_filter_size = filter_size >> 1; - - out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x); - out->stride = ref_width; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 0; - - int min_y = ypos - half_filter_size + off_y + mv_y; - int max_y = min_y + height + filter_size; - int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height); - - int min_x = xpos - half_filter_size + off_x + mv_x; - int max_x = min_x + width + filter_size; - int out_of_bounds_x = (min_x < 0) || 
(max_x >= ref_width); - - int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x; - - if (sample_out_of_bounds){ - // Alloc 5 pixels more than we actually use because AVX2 filter - // functions read up to 5 pixels past the last pixel. - out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size) + 5); - if (!out->buffer){ - fprintf(stderr, "Memory allocation failed!\n"); - assert(0); - } - out->stride = width + filter_size; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 1; - - int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - - for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) { - - // calculate y-pixel offset - coord_y = y + off_y + mv_y; - coord_y = CLIP(0, (ref_height)-1, coord_y); - coord_y *= ref_width; - - if (!out_of_bounds_x){ - memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel)); - } else { - for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) { - - coord_x = x + off_x + mv_x; - coord_x = CLIP(0, (ref_width)-1, coord_x); - - // Store source block data (with extended borders) - out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x]; - } - } - } - } -} - #endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 -#if KVZ_BIT_DEPTH == 8 - if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); @@ -1526,10 +1463,6 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } -#endif //KVZ_BIT_DEPTH == 8 - - success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2); - #endif //COMPILE_INTEL_AVX2 return success; } diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index f71615c8..75c0eee9 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -728,59 +728,46 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco } -void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filter_size, int width, int height, kvz_extended_block *out) { +void kvz_get_extended_block_generic(kvz_epol_args args) { - int half_filter_size = filter_size >> 1; + int min_y = args.blk_y - args.pad_t; + int max_y = args.blk_y + args.blk_h + args.pad_b - 1; + bool out_of_bounds_y = (min_y < 0) || (max_y >= args.src_h); - out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x); - out->stride = ref_width; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 0; + int min_x = args.blk_x - args.pad_l; + int max_x = args.blk_x + args.blk_w + args.pad_r - 1; + bool out_of_bounds_x = (min_x < 0) || (max_x >= 
args.src_w); - int min_y = ypos - half_filter_size + off_y + mv_y; - int max_y = min_y + height + filter_size; - int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height); + if (out_of_bounds_y || out_of_bounds_x) { - int min_x = xpos - half_filter_size + off_x + mv_x; - int max_x = min_x + width + filter_size; - int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width); + *args.ext = args.buf; + *args.ext_s = args.pad_l + args.blk_w + args.pad_r; + *args.ext_origin = args.buf + args.pad_t * (*args.ext_s) + args.pad_l; - int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x; + int cnt_l = CLIP(0, *args.ext_s, -min_x); + int cnt_r = CLIP(0, *args.ext_s, max_x - (args.src_w - 1)); + int cnt_m = CLIP(0, *args.ext_s, *args.ext_s - cnt_l - cnt_r); - if (sample_out_of_bounds){ - out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size)); - if (!out->buffer){ - fprintf(stderr, "Memory allocation failed!\n"); - assert(0); + // For each row including padding + for (int y = -args.pad_t; y < args.blk_h + args.pad_b; ++y) { + + int clipped_y = CLIP(0, args.src_h - 1, args.blk_y + y); + kvz_pixel sample_l = *(args.src + clipped_y * args.src_s); + kvz_pixel sample_r = *(args.src + clipped_y * args.src_s + args.src_w - 1); + kvz_pixel *src_m = args.src + clipped_y * args.src_s + MAX(min_x, 0); + kvz_pixel *dst_l = args.buf + (y + args.pad_t) * (*args.ext_s); + kvz_pixel *dst_m = dst_l + cnt_l; + kvz_pixel *dst_r = dst_m + cnt_m; + for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = sample_l; + for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i); + for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = sample_r; } - out->stride = width + filter_size; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 1; + } else { - int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - - for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) { - - // calculate y-pixel offset - coord_y = y + off_y + mv_y; - coord_y = CLIP(0, (ref_height)-1, coord_y); - coord_y *= ref_width; - - if (!out_of_bounds_x){ - memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel)); - } else { - for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) { - - coord_x = x + off_x + mv_x; - coord_x = CLIP(0, (ref_width)-1, coord_x); - - // Store source block data (with extended borders) - out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x]; - } - } - } - } + *args.ext = args.src + (args.blk_y - args.pad_t) * args.src_s + (args.blk_x - args.pad_l); + *args.ext_origin = args.src + args.blk_y * args.src_s + args.blk_x; + *args.ext_s = args.src_s; + } } int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth) diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index ce6608f7..0566507d 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -38,8 +38,34 @@ typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t sample_off_x, int8_t sample_off_y); -typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filter_size, int width, int height, 
kvz_extended_block *out); +typedef struct { + // Source samples + kvz_pixel *src; // Top-left sample + int src_w; // Width + int src_h; // Height + int src_s; // Stride + + // Requested sampling position, base dimensions, and padding + int blk_x; + int blk_y; + int blk_w; // Width + int blk_h; // Height + int pad_l; // Left + int pad_r; // Right + int pad_t; // Top + int pad_b; // Bottom + + // Buffer for possible extrapolation. Free memory provided by the caller. + kvz_pixel *buf; + + // Extended block data. These are set by the function. + kvz_pixel **ext; // Top-left sample with padding + kvz_pixel **ext_origin; // Top-left sample without padding + int *ext_s; // Stride +} kvz_epol_args; + +typedef unsigned(epol_func)(kvz_epol_args *args); + typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); From d9a3225ae5c7cedc55ed59d290cb89daab1808e1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 24 Aug 2020 01:51:24 +0300 Subject: [PATCH 02/19] Add new AVX2 vertical ip filter for high-precision --- src/strategies/avx2/ipol-avx2.c | 97 +++++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 8ba28e66..1a491407 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1267,6 +1267,90 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } } +static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, +int width, +int height, +int16_t *src, +int16_t src_stride, +int16_t *dst, +int16_t dst_stride) +{ + const int shift2 = 6; + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); + __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + __m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); + __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); + __m256i 
r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + __m256i r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); + __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r23_r34 = r45_r56; + r45_r56 = r67_r78; + r6 = r8; + r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); + r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); + r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); + __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); + + __m256i sum0123 = _mm256_add_epi32(dot01, dot23); + __m256i sum4567 = _mm256_add_epi32(dot45, dot67); + __m256i sum = _mm256_add_epi32(sum0123, sum4567); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_packs_epi32(sum, sum); + + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); + _mm_storel_epi64((__m128i *)dst_addr1, _mm256_extracti128_si256(sum, 1)); + } + } +} + static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, @@ -1289,8 +1373,8 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - int16_t hor_stride = LCU_WIDTH; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; + int16_t hor_stride = width; + ALIGNED(64) int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; // HORIZONTAL STEP __m256i shuf_01_23, shuf_45_67; @@ -1311,14 +1395,7 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons } // VERTICAL STEP - __m256i taps[4]; - kvz_init_ver_filter_taps(ver_fir, taps); - - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } From f5b0e3c52b394d6d72eb0a67decdc855c443dd84 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:01:40 +0200 Subject: [PATCH 03/19] Add new AVX2 horizontal ip filter capable of every luma PB --- src/strategies/avx2/ipol-avx2.c | 130 +++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 28 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 1a491407..6332f652 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1267,6 +1267,101 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } } +static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, + int width, + int height, + kvz_pixel *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) { + __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + 
__m256i shuf23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + __m256i shuf45 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + __m256i shuf67 = _mm256_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); + + __m256i all_w01 = _mm256_set1_epi16(*(uint16_t *)(filter + 0)); + __m256i all_w23 = _mm256_set1_epi16(*(uint16_t *)(filter + 2)); + __m256i all_w45 = _mm256_set1_epi16(*(uint16_t *)(filter + 4)); + __m256i all_w67 = _mm256_set1_epi16(*(uint16_t *)(filter + 6)); + + int y_offset = -KVZ_LUMA_FILTER_OFFSET; + int x_offset = -KVZ_LUMA_FILTER_OFFSET; + + kvz_pixel *top_left = src + src_stride * y_offset + x_offset; + + int y = 0; + int x = 0; + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + for (x = 0; x + 7 < width; x += 8) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i*)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i*)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + + __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storeu_si128(dst_r0, sum_r0); + _mm_storeu_si128(dst_r1, sum_r1); + } + } + + if (x < width) { + for (int y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i *)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i *)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + + __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64(dst_r0, sum_r0); + _mm_storel_epi64(dst_r1, sum_r1); + } + } +} + static void 
kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, int width, int height, @@ -1362,40 +1457,19 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { - kvz_sample_14bit_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } // TODO: horizontal and vertical only filtering int x, y; int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; + + // Buffer for intermediate values with one extra row + // because the loop writes two rows each iteration. + ALIGNED(64) int16_t hor_filtered[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - int16_t hor_stride = width; - ALIGNED(64) int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - - // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } - - // VERTICAL STEP - kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); + kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_filtered, hor_stride); + kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_filtered, hor_stride, dst, dst_stride); } From 2175023843faa8b2a0e75c434fb965614c60e62b Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:12:39 +0200 Subject: [PATCH 04/19] Relocate function --- src/strategies/avx2/ipol-avx2.c | 109 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 6332f652..fd7182b6 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1213,60 +1213,6 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } } -static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, - int16_t src_stride, - int width, - int height, - kvz_pixel *dst, - int16_t dst_stride, - int8_t hor_flag, - int8_t ver_flag, - const int16_t mv[2]) -{ - // TODO: Optimize SMP and AMP - if (width != height) { - kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } - - int x, y; - - int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; - int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - - int16_t hor_stride = LCU_WIDTH; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - - // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, 
- &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } - - // VERTICAL STEP - __m256i taps[4]; - kvz_init_ver_filter_taps(ver_fir, taps); - - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } -} - static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, int width, int height, @@ -1446,6 +1392,61 @@ int16_t dst_stride) } } +static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) +{ + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + + int x, y; + + int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; + + int16_t hor_stride = LCU_WIDTH; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; + + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; + + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { + + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } + } + + // VERTICAL STEP + __m256i taps[4]; + kvz_init_ver_filter_taps(ver_fir, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); + } + } +} + + static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, From 9e4b62a891bbaa3db78cb9af6286ac68b6577b3c Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:14:28 +0200 Subject: [PATCH 05/19] Use the new horizontal filter for pixel precision as well --- src/strategies/avx2/ipol-avx2.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index fd7182b6..8431cdb1 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1414,26 +1414,12 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; + // Buffer for intermediate values with one extra row + // because the loop writes two rows each iteration. 
+ ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; int16_t hor_stride = LCU_WIDTH; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); // VERTICAL STEP __m256i taps[4]; From e572066e46d4ef7900fd8106ecc3caf0c5199f50 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:53:15 +0200 Subject: [PATCH 06/19] Add new AVX2 vertical ip filter for pixel precision --- src/strategies/avx2/ipol-avx2.c | 593 +++++++++++++++++--------------- 1 file changed, 308 insertions(+), 285 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 8431cdb1..44ffdac4 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -609,6 +609,277 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } +static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, + int width, + int height, + kvz_pixel *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) { + __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i shuf23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + __m256i shuf45 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + __m256i shuf67 = _mm256_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); + + __m256i all_w01 = _mm256_set1_epi16(*(uint16_t *)(filter + 0)); + __m256i all_w23 = _mm256_set1_epi16(*(uint16_t *)(filter + 2)); + __m256i all_w45 = _mm256_set1_epi16(*(uint16_t *)(filter + 4)); + __m256i all_w67 = _mm256_set1_epi16(*(uint16_t *)(filter + 6)); + + int y_offset = -KVZ_LUMA_FILTER_OFFSET; + int x_offset = -KVZ_LUMA_FILTER_OFFSET; + + kvz_pixel *top_left = src + src_stride * y_offset + x_offset; + + int y = 0; + int x = 0; + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + for (x = 0; x + 7 < width; x += 8) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i*)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i*)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + 
+ __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storeu_si128(dst_r0, sum_r0); + _mm_storeu_si128(dst_r1, sum_r1); + } + } + + if (x < width) { + for (int y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i *)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i *)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + + __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64(dst_r0, sum_r0); + _mm_storel_epi64(dst_r1, sum_r1); + } + } +} + +static void kvz_ipol_8tap_ver_im_px_avx2(uint8_t *filter, + int width, + int height, + int16_t *src, + int16_t src_stride, + kvz_pixel *dst, + int16_t dst_stride) +{ + // Interpolation filter shifts + int32_t shift2 = 6; + + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); + __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + 
__m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); + __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); + __m256i r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + __m256i r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); + __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r23_r34 = r45_r56; + r45_r56 = r67_r78; + r6 = r8; + r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); + r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); + r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); + __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); + + __m256i sum0123 = _mm256_add_epi32(dot01, dot23); + __m256i sum4567 = _mm256_add_epi32(dot45, dot67); + __m256i sum = _mm256_add_epi32(sum0123, sum4567); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_add_epi32(sum, _mm256_set1_epi32(wp_offset1)); + sum = _mm256_srai_epi32(sum, wp_shift1); + sum = _mm256_packs_epi32(sum, sum); + sum = _mm256_packus_epi16(sum, sum); + + kvz_pixel *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + kvz_pixel *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + *(uint32_t*)dst_addr0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); + *(uint32_t*)dst_addr1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sum, 1)); + } + } +} + +static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, +int width, +int height, +int16_t *src, +int16_t src_stride, +int16_t *dst, +int16_t dst_stride) +{ + const int shift2 = 6; + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); + __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + __m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); + __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); + __m256i r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + __m256i r7_r8 = 
_mm256_blend_epi32(r7, r8, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); + __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r23_r34 = r45_r56; + r45_r56 = r67_r78; + r6 = r8; + r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); + r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); + r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); + __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); + + __m256i sum0123 = _mm256_add_epi32(dot01, dot23); + __m256i sum4567 = _mm256_add_epi32(dot45, dot67); + __m256i sum = _mm256_add_epi32(sum0123, sum4567); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_packs_epi32(sum, sum); + + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); + _mm_storel_epi64((__m128i *)dst_addr1, _mm256_extracti128_si256(sum, 1)); + } + } +} + static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, @@ -675,16 +946,7 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); kvz_init_filter_taps(fir2, &taps_01_23, &taps_45_67); - for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos2[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(fir2, width, height + 1, src + 1, src_stride, hor_pos2, hor_stride); // Write the first column in contiguous memory x = 0; @@ -704,12 +966,9 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_ver_filter_taps(fir0, taps); // Right - for (y = 0; y + 7 < height; y+=8) { - - for (x = 0; x + 7 < width ; x+=8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[(y + 1) * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); - } - } + int16_t *im = &hor_pos2[hor_stride]; + kvz_pixel *dst = out_r; + kvz_ipol_8tap_ver_im_px_avx2(fir0, width, height, im, hor_stride, dst, dst_stride); // Left // Copy from the right filtered block and filter the extra column @@ -725,11 +984,9 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_ver_filter_taps(fir2, taps); // Top - for (y = 0; y + 7 < height; y+=8) { - for (x = 0; x + 7 < width; x+=8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos0[y * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); - } - } + im = hor_pos0; + dst = out_t; + kvz_ipol_8tap_ver_im_px_avx2(fir2, width, height, im, hor_stride, dst, dst_stride); // Bottom // Copy what can be copied from the top filtered values. 
@@ -782,11 +1039,9 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco __m256i taps[4]; kvz_init_ver_filter_taps(fir2, taps); // Top-Right - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[y * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); - } - } + int16_t *im = hor_pos2; + kvz_pixel *dst = out_tr; + kvz_ipol_8tap_ver_im_px_avx2(fir2, width, height, im, hor_stride, dst, dst_stride); // Top-left // Copy from the top-right filtered block and filter the extra column @@ -885,17 +1140,7 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_filter_taps(hor_fir_l, &taps_01_23, &taps_45_67); int sample_off_y = hpel_off_y < 0 ? 0 : 1; - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_l[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(hor_fir_l, width, height + 1, src + 1, src_stride, hor_pos_l, hor_stride); // Write the first column in contiguous memory x = 0; @@ -907,17 +1152,7 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Right QPEL kvz_init_filter_taps(hor_fir_r, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_r[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(hor_fir_r, width, height + 1, src + 1, src_stride, hor_pos_r, hor_stride); // Write the first column in contiguous memory x = 0; @@ -944,12 +1179,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_l, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + sample_off_y; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_l[y * dst_stride + x], dst_stride); - } - } + int16_t *im = &hor_pos_l[sample_off_y * hor_stride]; + kvz_pixel *dst = out_l; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_l, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_l) { for (y = 0; y < height; ++y) { @@ -972,12 +1204,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_r, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + sample_off_y; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_r[sample_off_y * hor_stride]; + dst = out_r; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_r, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_r) { for (y = 0; y < height; ++y) { @@ -1002,12 +1231,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); kvz_init_ver_filter_taps(ver_fir_t, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_t; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); - } - } + im = &hor_hpel_pos[off_y_fir_t * hor_stride]; + dst = out_t; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); if (!sample_off_x) { for (y = 0; y < height; ++y) { @@ -1030,12 +1256,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_b, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_b; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_b[y * dst_stride + x], dst_stride); - } - } + im = &hor_hpel_pos[off_y_fir_b * hor_stride]; + dst = out_b; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); if (!sample_off_x) { for (y = 0; y < height; ++y) { @@ -1107,12 +1330,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_t, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_t; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_tl[y * dst_stride + x], dst_stride); - } - } + int16_t *im = &hor_pos_l[off_y_fir_t * hor_stride]; + kvz_pixel *dst = out_tl; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_l) { for (y = 0; y < height; ++y) { @@ -1134,12 +1354,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Top-right QPEL // Filter block and then filter column and align if neccessary - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_t; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_r[off_y_fir_t * hor_stride]; + dst = out_tr; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_r) { for (y = 0; y < height; ++y) { @@ -1162,12 +1379,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_b, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_b; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_bl[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_l[off_y_fir_b * hor_stride]; + dst = out_bl; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_l) { for (y = 0; y < height; ++y) { @@ -1188,12 +1402,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Bottom-right QPEL // Filter block and then filter column and align if neccessary - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_b; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride 
+ x], hor_stride, &out_br[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_r[off_y_fir_b * hor_stride]; + dst = out_br; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_r) { for (y = 0; y < height; ++y) { @@ -1213,185 +1424,6 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } } -static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, - int width, - int height, - kvz_pixel *src, - int16_t src_stride, - int16_t *dst, - int16_t dst_stride) { - __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); - __m256i shuf23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); - __m256i shuf45 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); - __m256i shuf67 = _mm256_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); - - __m256i all_w01 = _mm256_set1_epi16(*(uint16_t *)(filter + 0)); - __m256i all_w23 = _mm256_set1_epi16(*(uint16_t *)(filter + 2)); - __m256i all_w45 = _mm256_set1_epi16(*(uint16_t *)(filter + 4)); - __m256i all_w67 = _mm256_set1_epi16(*(uint16_t *)(filter + 6)); - - int y_offset = -KVZ_LUMA_FILTER_OFFSET; - int x_offset = -KVZ_LUMA_FILTER_OFFSET; - - kvz_pixel *top_left = src + src_stride * y_offset + x_offset; - - int y = 0; - int x = 0; - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { - - for (x = 0; x + 7 < width; x += 8) { - - kvz_pixel *chunk_ptr = top_left + src_stride * y + x; - __m128i r0 = _mm_loadu_si128((__m128i*)(chunk_ptr + 0 * src_stride)); - __m128i r1 = _mm_loadu_si128((__m128i*)(chunk_ptr + 1 * src_stride)); - __m256i r0_r1 = _mm256_castsi128_si256(r0); - r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); - - __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); - __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); - __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); - __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); - - __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); - __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); - __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); - __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); - - __m256i sum0123 = _mm256_add_epi16(dot01, dot23); - __m256i sum4567 = _mm256_add_epi16(dot45, dot67); - __m256i sum = _mm256_add_epi16(sum0123, sum4567); - - __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); - __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); - __m128i sum_r0 = _mm256_castsi256_si128(sum); - __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); - _mm_storeu_si128(dst_r0, sum_r0); - _mm_storeu_si128(dst_r1, sum_r1); - } - } - - if (x < width) { - for (int y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { - - kvz_pixel *chunk_ptr = top_left + src_stride * y + x; - __m128i r0 = _mm_loadu_si128((__m128i *)(chunk_ptr + 0 * src_stride)); - __m128i r1 = _mm_loadu_si128((__m128i *)(chunk_ptr + 1 * src_stride)); - __m256i r0_r1 = _mm256_castsi128_si256(r0); - r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); - - __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); - __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); - __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); - __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); - - __m256i dot01 = 
_mm256_maddubs_epi16(r0_r1_01, all_w01); - __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); - __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); - __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); - - __m256i sum0123 = _mm256_add_epi16(dot01, dot23); - __m256i sum4567 = _mm256_add_epi16(dot45, dot67); - __m256i sum = _mm256_add_epi16(sum0123, sum4567); - - __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); - __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); - __m128i sum_r0 = _mm256_castsi256_si128(sum); - __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); - _mm_storel_epi64(dst_r0, sum_r0); - _mm_storel_epi64(dst_r1, sum_r1); - } - } -} - -static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, -int width, -int height, -int16_t *src, -int16_t src_stride, -int16_t *dst, -int16_t dst_stride) -{ - const int shift2 = 6; - - __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); - __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); - __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); - __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); - __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); - __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); - - for (int x = 0; x + 3 < width; x += 4) { - - int16_t *strip_ptr = src + 0 * src_stride + x; - - // Initial values - // Broadcasted rows in both lanes - // __m256i r0; // Unused - // __m256i r1; // Unused - __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); - __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); - __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); - __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); - __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); - __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); - __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); - - // Consecutive rows in low and high lanes - // __m256i r0_r1; // Unused - // __m256i r1_r2; // Unused - __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); - __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); - __m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); - __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); - __m256i r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); - __m256i r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); - - // Paired samples of consecutive rows - __m256i r01_r12; - __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); - __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); - __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); - - for (int y = 0; y < height; y += 2) { - - strip_ptr = src + y * src_stride + x; - - // Slide window - r01_r12 = r23_r34; - r23_r34 = r45_r56; - r45_r56 = r67_r78; - r6 = r8; - r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); - r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); - r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); - r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); - - r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); - - __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); - __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); - __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); - __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); - - __m256i sum0123 = _mm256_add_epi32(dot01, dot23); - __m256i sum4567 = _mm256_add_epi32(dot45, dot67); - __m256i sum = _mm256_add_epi32(sum0123, 
sum4567); - sum = _mm256_srai_epi32(sum, shift2); - sum = _mm256_packs_epi32(sum, sum); - - int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; - int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; - _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); - _mm_storel_epi64((__m128i *)dst_addr1, _mm256_extracti128_si256(sum, 1)); - } - } -} - static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, @@ -1420,16 +1452,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco int16_t hor_stride = LCU_WIDTH; kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); - - // VERTICAL STEP - __m256i taps[4]; - kvz_init_ver_filter_taps(ver_fir, taps); - - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_8tap_ver_im_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } From 3476fc62c7ddfa06139d3bce8c9f7dcce5b569fa Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:59:05 +0200 Subject: [PATCH 07/19] Fix parameter to signed --- src/strategies/avx2/ipol-avx2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 44ffdac4..27fc1fde 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -609,7 +609,7 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } -static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, +static void kvz_ipol_8tap_hor_px_im_avx2(int8_t *filter, int width, int height, kvz_pixel *src, @@ -704,7 +704,7 @@ static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, } } -static void kvz_ipol_8tap_ver_im_px_avx2(uint8_t *filter, +static void kvz_ipol_8tap_ver_im_px_avx2(int8_t *filter, int width, int height, int16_t *src, @@ -796,7 +796,7 @@ static void kvz_ipol_8tap_ver_im_px_avx2(uint8_t *filter, } } -static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, +static void kvz_ipol_8tap_ver_im_hi_avx2(int8_t *filter, int width, int height, int16_t *src, From 7e6ba9750f89ed444550b50f8839c5ea8962edf1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 22:17:50 +0200 Subject: [PATCH 08/19] Add new AVX2 ip filters for chroma --- src/strategies/avx2/ipol-avx2.c | 331 ++++++++++++++++++++++---------- 1 file changed, 231 insertions(+), 100 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 27fc1fde..22ea3641 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -872,6 +872,210 @@ int16_t dst_stride) sum = _mm256_srai_epi32(sum, shift2); sum = _mm256_packs_epi32(sum, sum); + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + _mm_storel_epi64((__m128i*)dst_addr0, _mm256_castsi256_si128(sum)); + _mm_storel_epi64((__m128i*)dst_addr1, _mm256_extracti128_si256(sum, 1)); + } + } +} + +static void kvz_ipol_4tap_hor_px_hi_avx2(int8_t *filter, + int width, + int height, + kvz_pixel *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) { + + __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, + 8, 9, 9, 10, 10, 11, 11, 12, + 0, 1, 1, 2, 2, 3, 3, 4, + 8, 9, 9, 10, 10, 11, 11, 12); + + __m256i shuf23 = 
_mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, + 10, 11, 11, 12, 12, 13, 13, 14, + 2, 3, 3, 4, 4, 5, 5, 6, + 10, 11, 11, 12, 12, 13, 13, 14); + + __m256i all_w01 = _mm256_set1_epi16(*(uint16_t*)(filter + 0)); + __m256i all_w23 = _mm256_set1_epi16(*(uint16_t*)(filter + 2)); + + int y_offset = -KVZ_CHROMA_FILTER_OFFSET; + int x_offset = -KVZ_CHROMA_FILTER_OFFSET; + + kvz_pixel *top_left = src + src_stride * y_offset + x_offset; + + int y = 0; + int x = 0; + + for (y = 0; y < height + KVZ_EXT_PADDING_CHROMA; y += 4) { + + for (x = 0; x + 3 < width; x += 4) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0r1 = _mm_loadl_epi64((__m128i*)(chunk_ptr + 0 * src_stride)); + __m128i r2r3 = _mm_loadl_epi64((__m128i*)(chunk_ptr + 2 * src_stride)); + r0r1 = _mm_insert_epi64(r0r1, *(uint64_t*)(chunk_ptr + 1 * src_stride), 1); + r2r3 = _mm_insert_epi64(r2r3, *(uint64_t*)(chunk_ptr + 3 * src_stride), 1); + + __m256i r0r1_r2r3 = _mm256_castsi128_si256(r0r1); + r0r1_r2r3 = _mm256_inserti128_si256(r0r1_r2r3, r2r3, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0r1_r2r3, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0r1_r2r3, shuf23); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + + __m256i sum = _mm256_add_epi16(dot01, dot23); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i *dst_r2 = (__m128i*)(dst + (y + 2) * dst_stride + x); + __m128i *dst_r3 = (__m128i*)(dst + (y + 3) * dst_stride + x); + __m128i sum_r0r1 = _mm256_castsi256_si128(sum); + __m128i sum_r2r3 = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64(dst_r0, sum_r0r1); + _mm_storeh_pd((double*)dst_r1, _mm_castsi128_pd(sum_r0r1)); + _mm_storel_epi64(dst_r2, sum_r2r3); + _mm_storeh_pd((double*)dst_r3, _mm_castsi128_pd(sum_r2r3)); + } + } +} + +static void kvz_ipol_4tap_ver_hi_px_avx2(int8_t *filter, + int width, + int height, + int16_t *src, + int16_t src_stride, + kvz_pixel *dst, + int16_t dst_stride) +{ + // Interpolation filter shifts + int32_t shift2 = 6; + + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t*)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 2 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r2 = r4; + r3 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 3 * src_stride)); + r4 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr 
+ 4 * src_stride)); + r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + + __m256i sum = _mm256_add_epi32(dot01, dot23); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_add_epi32(sum, _mm256_set1_epi32(wp_offset1)); + sum = _mm256_srai_epi32(sum, wp_shift1); + sum = _mm256_packs_epi32(sum, sum); + sum = _mm256_packus_epi16(sum, sum); + + kvz_pixel *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + kvz_pixel *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + *(uint32_t*)dst_addr0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); + *(uint32_t*)dst_addr1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sum, 1)); + } + } +} + +static void kvz_ipol_4tap_ver_hi_hi_avx2(int8_t *filter, + int width, + int height, + int16_t *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) +{ + const int shift2 = 6; + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r2 = r4; + r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + + __m256i sum = _mm256_add_epi32(dot01, dot23); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_packs_epi32(sum, sum); + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); @@ -1441,8 +1645,6 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco return; } - int x, y; - int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; @@ -1468,22 +1670,20 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons const int16_t mv[2]) { // TODO: horizontal and vertical only filtering - int x, y; - int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; // Buffer for intermediate values with one extra row // because the loop writes two rows each iteration. 
- ALIGNED(64) int16_t hor_filtered[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; int16_t hor_stride = LCU_WIDTH; - kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_filtered, hor_stride); - kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_filtered, hor_stride, dst, dst_stride); + kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, +static void kvz_sample_octpel_chroma_avx2(const encoder_control_t *const encoder, kvz_pixel *src, int16_t src_stride, int width, @@ -1494,118 +1694,49 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encode int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { + // TODO: Optimizations for rest of the blocks (for example 2x8). + if (width % 4 != 0) { kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } - int x, y; - int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; + // Buffer for intermediate values with 3 extra rows + // because the loop writes four rows each iteration. + ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - // HORIZONTAL STEP - __m256i shuf_01, shuf_23; - __m256i taps_01, taps_23; - - kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); - kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); - - for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01, &shuf_23, - &taps_01, &taps_23); //TODO: >> shift1 - } - } - - __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); - - int rows = 3; - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01_23, &taps_01_23, - rows); //TODO: >> shift1 - } - - // VERTICAL STEP - for (y = 0; y + 3 < height; y += 4) { - for (x = 0; x + 3 < width; x += 4) { - kvz_four_tap_filter_ver_16bit_4x4_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_hi_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, - int16_t src_stride, - int width, - int height, - int16_t *dst, - int16_t dst_stride, - int8_t hor_flag, - int8_t ver_flag, +static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const encoder, + kvz_pixel *src, + int16_t 
src_stride, + int width, + int height, + int16_t *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { + // TODO: Optimizations for rest of the blocks (for example 2x8). + if (width % 4 != 0) { kvz_sample_14bit_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } - // TODO: horizontal and vertical only filtering - int x, y; - int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; + // Buffer for intermediate values with 3 extra rows + // because the loop writes four rows each iteration. + ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - // HORIZONTAL STEP - __m256i shuf_01, shuf_23; - __m256i taps_01, taps_23; - - kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); - kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); - - for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01, &shuf_23, - &taps_01, &taps_23); //TODO: >> shift1 - } - } - - __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); - - int rows = 3; - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01_23, &taps_01_23, - rows); //TODO: >> shift1 - } - - // VERTICAL STEP - for (y = 0; y + 3 < height; y += 4) { - for (x = 0; x + 3 < width; x += 4) { - kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_hi_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } #endif //COMPILE_INTEL_AVX2 From e38219e489d490535b0e2b01a2d93bd1594637bc Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 5 Mar 2021 18:14:27 +0200 Subject: [PATCH 09/19] Fix epol_func signature and function definition --- src/strategies/generic/ipol-generic.c | 44 +++++++++++++-------------- src/strategies/strategies-ipol.h | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 75c0eee9..b9024c98 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -728,34 +728,34 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco } -void kvz_get_extended_block_generic(kvz_epol_args args) { +void kvz_get_extended_block_generic(kvz_epol_args *args) { - int min_y = args.blk_y - args.pad_t; - int max_y = args.blk_y + args.blk_h + args.pad_b - 1; - bool out_of_bounds_y = (min_y < 0) || (max_y >= args.src_h); + int min_y = args->blk_y - args->pad_t; + int max_y = args->blk_y + args->blk_h + args->pad_b - 1; + bool out_of_bounds_y = 
(min_y < 0) || (max_y >= args->src_h); - int min_x = args.blk_x - args.pad_l; - int max_x = args.blk_x + args.blk_w + args.pad_r - 1; - bool out_of_bounds_x = (min_x < 0) || (max_x >= args.src_w); + int min_x = args->blk_x - args->pad_l; + int max_x = args->blk_x + args->blk_w + args->pad_r - 1; + bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w); if (out_of_bounds_y || out_of_bounds_x) { - *args.ext = args.buf; - *args.ext_s = args.pad_l + args.blk_w + args.pad_r; - *args.ext_origin = args.buf + args.pad_t * (*args.ext_s) + args.pad_l; + *args->ext = args->buf; + *args->ext_s = args->pad_l + args->blk_w + args->pad_r; + *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l; - int cnt_l = CLIP(0, *args.ext_s, -min_x); - int cnt_r = CLIP(0, *args.ext_s, max_x - (args.src_w - 1)); - int cnt_m = CLIP(0, *args.ext_s, *args.ext_s - cnt_l - cnt_r); + int cnt_l = CLIP(0, *args->ext_s, -min_x); + int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1)); + int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r); // For each row including padding - for (int y = -args.pad_t; y < args.blk_h + args.pad_b; ++y) { + for (int y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { - int clipped_y = CLIP(0, args.src_h - 1, args.blk_y + y); - kvz_pixel sample_l = *(args.src + clipped_y * args.src_s); - kvz_pixel sample_r = *(args.src + clipped_y * args.src_s + args.src_w - 1); - kvz_pixel *src_m = args.src + clipped_y * args.src_s + MAX(min_x, 0); - kvz_pixel *dst_l = args.buf + (y + args.pad_t) * (*args.ext_s); + int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y); + kvz_pixel sample_l = *(args->src + clipped_y * args->src_s); + kvz_pixel sample_r = *(args->src + clipped_y * args->src_s + args->src_w - 1); + kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0); + kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s); kvz_pixel *dst_m = dst_l + cnt_l; kvz_pixel *dst_r = dst_m + cnt_m; for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = sample_l; @@ -764,9 +764,9 @@ void kvz_get_extended_block_generic(kvz_epol_args args) { } } else { - *args.ext = args.src + (args.blk_y - args.pad_t) * args.src_s + (args.blk_x - args.pad_l); - *args.ext_origin = args.src + args.blk_y * args.src_s + args.blk_x; - *args.ext_s = args.src_s; + *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l); + *args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x; + *args->ext_s = args->src_s; } } diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index 0566507d..31d15bc4 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -64,7 +64,7 @@ typedef struct { int *ext_s; // Stride } kvz_epol_args; -typedef unsigned(epol_func)(kvz_epol_args *args); +typedef void(epol_func)(kvz_epol_args *args); typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); From d8e7aac380c584557157eb6147106a52fb6223a3 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 5 Mar 2021 18:50:00 +0200 Subject: [PATCH 10/19] Do not use nonstandard extension for struct initialization. 
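The warning being silenced is presumably the one some compilers (MSVC in
particular) emit when a designated initializer sets a struct member to the
address of an automatic variable; assigning those members after the
initializer avoids it without changing behavior. A minimal, self-contained
sketch of the pattern, using a stand-in struct rather than the real
kvz_epol_args:

  typedef struct {
    int blk_w;    /* plain runtime values can stay in the initializer */
    int *ext_s;   /* pointer to a local: assigned separately below */
  } example_args;

  void example(int block_width)
  {
    int ext_s = 0;

    example_args args = {
      .blk_w = block_width,
    };

    /* Initialize separately to avoid the nonstandard-extension warning. */
    args.ext_s = &ext_s;
    (void)args;
  }

Only the members that would otherwise take the address of a local need to
move out of the initializer; everything else stays where it was, as the
diff below does.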
--- src/image.c | 11 +++++++---- src/inter.c | 44 ++++++++++++++++++++++++++++---------------- src/search_inter.c | 11 +++++++---- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/image.c b/src/image.c index 9fa47c64..71e791dd 100644 --- a/src/image.c +++ b/src/image.c @@ -497,12 +497,15 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic, .pad_r = 0, .pad_t = 0, .pad_b = 0, - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x]; diff --git a/src/inter.c b/src/inter.c index f34a2a2a..11ed73ac 100644 --- a/src/inter.c +++ b/src/inter.c @@ -72,12 +72,15 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_quarterpel_luma(state->encoder_control, ext_origin, @@ -123,12 +126,15 @@ static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_14bit_quarterpel_luma(state->encoder_control, ext_origin, @@ -177,12 +183,15 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_octpel_chroma(state->encoder_control, ext_origin, @@ -245,12 +254,15 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. 
+ epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_14bit_octpel_chroma(state->encoder_control, ext_origin, diff --git a/src/search_inter.c b/src/search_inter.c index 0fcd70c6..81e6fb92 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1032,12 +1032,15 @@ static void search_frac(inter_search_info_t *info) .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x; From 563165146985c302fdd8d0ae74af696050cfc186 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 5 Mar 2021 18:31:32 +0200 Subject: [PATCH 11/19] Remove unused functions and variables --- src/strategies/avx2/ipol-avx2.c | 503 -------------------------------- 1 file changed, 503 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 22ea3641..987461c6 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -56,111 +56,6 @@ static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) return filtered; } -static void kvz_init_shuffle_masks(__m256i *shuf_01_23, __m256i *shuf_45_67) { - // Shuffle pairs - *shuf_01_23 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); - *shuf_45_67 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); -} - -static void kvz_init_shuffle_masks_chroma(__m256i *shuf_01, __m256i *shuf_23) { - // Shuffle pairs - *shuf_01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12, - 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12); - *shuf_23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14, - 2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14); -} - -static void kvz_init_filter_taps(int8_t *filter, - __m256i *taps_01_23, __m256i *taps_45_67) { - // Filter weights - __m256i all_taps = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter)); - __m256i perm_01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); - __m256i perm_23 = _mm256_setr_epi32(2, 2, 2, 2, 3, 3, 3, 3); - all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); - *taps_01_23 = _mm256_permutevar8x32_epi32(all_taps, perm_01); - *taps_45_67 = _mm256_permutevar8x32_epi32(all_taps, perm_23); -} - -static void kvz_init_filter_taps_chroma(int8_t *filter, - __m256i *taps_01, __m256i *taps_23) { - // Filter weights - __m256i all_taps = _mm256_set1_epi32(*(int32_t*)filter); - all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); - *taps_01 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); - *taps_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); -} - -static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { - for (int i = 0; i < 4; ++i) filters[i] = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&filter[2 * i])); - filters[0] = _mm256_inserti128_si256(filters[0], _mm256_castsi256_si128(filters[3]), 1); // Pairs 01 67 - 
filters[1] = _mm256_inserti128_si256(filters[1], _mm256_castsi256_si128(filters[0]), 1); // Pairs 23 01 - filters[2] = _mm256_inserti128_si256(filters[2], _mm256_castsi256_si128(filters[1]), 1); // Pairs 45 23 - filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 -} - -static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, - __m256i *shuf_01_23, __m256i *shuf_45_67, - __m256i *taps_01_23, __m256i *taps_45_67) { - - __m256i row = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)data)); - - __m256i pairs_01_23 = _mm256_shuffle_epi8(row, *shuf_01_23); - __m256i pairs_45_67 = _mm256_shuffle_epi8(row, *shuf_45_67); - - __m256i temp0 = _mm256_maddubs_epi16(pairs_01_23, *taps_01_23); - __m256i temp1 = _mm256_maddubs_epi16(pairs_45_67, *taps_45_67); - - __m256i sum = _mm256_add_epi16(temp0, temp1); - __m128i filtered = _mm_add_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - _mm_storeu_si128((__m128i*)out, filtered); -} - -static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, - __m256i *shuf_01, __m256i *shuf_23, - __m256i *taps_01, __m256i *taps_23) { - - __m256i four_rows = _mm256_setr_epi64x( - *(int64_t*)&data[0 * stride], - *(int64_t*)&data[1 * stride], - *(int64_t*)&data[2 * stride], - *(int64_t*)&data[3 * stride]); - - __m256i pairs_l = _mm256_shuffle_epi8(four_rows, *shuf_01); - __m256i pairs_r = _mm256_shuffle_epi8(four_rows, *shuf_23); - - __m256i temp_l = _mm256_maddubs_epi16(pairs_l, *taps_01); - __m256i temp_r = _mm256_maddubs_epi16(pairs_r, *taps_23); - - __m256i sum = _mm256_add_epi16(temp_l, temp_r); - - __m128i lower = _mm256_castsi256_si128(sum); - __m128i upper = _mm256_extracti128_si256(sum, 1); - _mm_storel_epi64((__m128i*)(out + 0 * out_stride), lower); - _mm_storeh_pd((double*)(out + 1 * out_stride), _mm_castsi128_pd(lower)); - _mm_storel_epi64((__m128i*)(out + 2 * out_stride), upper); - _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); -} - -static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, - __m256i *shuf_01_23, __m256i *taps_01_23, - int rows) { - - for (int i = 0; i < rows; ++i) { - __m256i row = _mm256_set1_epi64x(*(int64_t*)&data[i * stride]); - - __m256i pairs_l_r = _mm256_shuffle_epi8(row, *shuf_01_23); - __m256i temp_l_r = _mm256_maddubs_epi16(pairs_l_r, *taps_01_23); - - __m128i temp_l = _mm256_castsi256_si128(temp_l_r); - __m128i temp_r = _mm256_extracti128_si256(temp_l_r, 1); - __m128i sum = _mm_add_epi16(temp_l, temp_r); - - _mm_storel_epi64((__m128i*)(out + i * out_stride), sum); - } -} - static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) { __m128i fir = _mm_loadl_epi64((__m128i*)filter); @@ -243,372 +138,6 @@ static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *dat _mm_storel_epi64((__m128i*)out, filtered); } -static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) -{ - // Interpolation filter shifts - int32_t shift2 = 6; - - // Weighted prediction offset and shift - int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; - int32_t wp_offset1 = 1 << (wp_shift1 - 1); - - // Filter weights - __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); - __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); - - 
__m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); - __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); - __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); - __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); - __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); - __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); - __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); - - __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); - __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); - __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); - __m128i temp23 = _mm_madd_epi16(pairs23, taps_23); - __m128i sum0123 = _mm_add_epi32(temp01, temp23); - - __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); - __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); - __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); - __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); - __m128i sum1234 = _mm_add_epi32(temp12, temp34); - - __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); - __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); - __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); - __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); - - __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); - __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); - __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); - __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); - - sum0123 = _mm_srai_epi32(sum0123, shift2); - sum1234 = _mm_srai_epi32(sum1234, shift2); - sum2345 = _mm_srai_epi32(sum2345, shift2); - sum3456 = _mm_srai_epi32(sum3456, shift2); - - __m128i offset = _mm_set1_epi32(wp_offset1); - sum0123 = _mm_add_epi32(sum0123, offset); - sum1234 = _mm_add_epi32(sum1234, offset); - sum2345 = _mm_add_epi32(sum2345, offset); - sum3456 = _mm_add_epi32(sum3456, offset); - - sum0123 = _mm_srai_epi32(sum0123, wp_shift1); - sum1234 = _mm_srai_epi32(sum1234, wp_shift1); - sum2345 = _mm_srai_epi32(sum2345, wp_shift1); - sum3456 = _mm_srai_epi32(sum3456, wp_shift1); - - __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); - __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); - __m128i filtered = _mm_packus_epi16(filtered01, filtered23); - - *(int32_t*)&out[0 * out_stride] = _mm_cvtsi128_si32(filtered); - *(int32_t*)&out[1 * out_stride] = _mm_extract_epi32(filtered, 1); - *(int32_t*)&out[2 * out_stride] = _mm_extract_epi32(filtered, 2); - *(int32_t*)&out[3 * out_stride] = _mm_extract_epi32(filtered, 3); -} - -static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int16_t *data, int16_t stride, int16_t *out, int16_t out_stride) -{ - int32_t shift2 = 6; - - // Filter weights - __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); - __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); - - __m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); - __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); - __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); - __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); - __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); - __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); - __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); - - __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); - __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); - __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); - __m128i temp23 = _mm_madd_epi16(pairs23, taps_23); - __m128i sum0123 = 
_mm_add_epi32(temp01, temp23); - - __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); - __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); - __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); - __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); - __m128i sum1234 = _mm_add_epi32(temp12, temp34); - - __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); - __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); - __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); - __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); - - __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); - __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); - __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); - __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); - - sum0123 = _mm_srai_epi32(sum0123, shift2); - sum1234 = _mm_srai_epi32(sum1234, shift2); - sum2345 = _mm_srai_epi32(sum2345, shift2); - sum3456 = _mm_srai_epi32(sum3456, shift2); - - __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); - __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); - - _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered01)); - _mm_storeh_pi((__m64*)&out[1 * out_stride], _mm_castsi128_ps(filtered01)); - _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered23)); - _mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); -} - -INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) -{ - // Interpolation filter shifts - int32_t shift2 = 6; - - // Weighted prediction offset and shift - int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; - int32_t wp_offset1 = 1 << (wp_shift1 - 1); - - __m256i pairs_lo, pairs_hi; - - // Filter 01 later with 67 - __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); - __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); - - __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); - __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br2, br3); - pairs_hi = _mm256_unpackhi_epi16(br2, br3); - __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); - __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br4, br5); - pairs_hi = _mm256_unpackhi_epi16(br4, br5); - __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); - __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br6, br7); - pairs_hi = _mm256_unpackhi_epi16(br6, br7); - __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br8 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); - __m256i br9 = 
_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br8, br9); - pairs_hi = _mm256_unpackhi_epi16(br8, br9); - // Filter rows02 later - __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); - __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br10, br11); - pairs_hi = _mm256_unpackhi_epi16(br10, br11); - __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - - // Deferred - __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r08, r19); - pairs_hi = _mm256_unpackhi_epi16(r08, r19); - __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 - __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); - __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); - - __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r412, r513); - pairs_hi = _mm256_unpackhi_epi16(r412, r513); - __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 - __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i accu02_lo, accu02_hi; - __m256i accu46_lo, accu46_hi; - - accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); - - accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); - - accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); - - accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); - - accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); - accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); - accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); - accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); - - __m256i offset = _mm256_set1_epi32(wp_offset1); - accu02_lo = _mm256_add_epi32(accu02_lo, offset); - accu02_hi = _mm256_add_epi32(accu02_hi, offset); - accu46_lo = _mm256_add_epi32(accu46_lo, offset); - accu46_hi = _mm256_add_epi32(accu46_hi, offset); - - accu02_lo = _mm256_srai_epi32(accu02_lo, wp_shift1); - accu02_hi = _mm256_srai_epi32(accu02_hi, wp_shift1); - accu46_lo = _mm256_srai_epi32(accu46_lo, wp_shift1); - accu46_hi = _mm256_srai_epi32(accu46_hi, wp_shift1); - - __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); - __m256i rows46 = _mm256_packs_epi32(accu46_lo, accu46_hi); - - __m256i filtered04_26 = _mm256_packus_epi16(rows02, rows46); - __m128i filtered04 = 
_mm256_castsi256_si128(filtered04_26); - __m128i filtered26 = _mm256_extracti128_si256(filtered04_26, 1); - - _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered04)); - _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered26)); - _mm_storeh_pi((__m64*)&out[4 * out_stride], _mm_castsi128_ps(filtered04)); - _mm_storeh_pi((__m64*)&out[6 * out_stride], _mm_castsi128_ps(filtered26)); -} - -INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t stride, __m256i *taps, int16_t *out, int64_t out_stride) { - - int32_t shift2 = 6; - - __m256i pairs_lo, pairs_hi; - - // Filter 01 later with 67 - __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); - __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); - - __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); - __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br2, br3); - pairs_hi = _mm256_unpackhi_epi16(br2, br3); - __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); - __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br4, br5); - pairs_hi = _mm256_unpackhi_epi16(br4, br5); - __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); - __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br6, br7); - pairs_hi = _mm256_unpackhi_epi16(br6, br7); - __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br8 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); - __m256i br9 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br8, br9); - pairs_hi = _mm256_unpackhi_epi16(br8, br9); - // Filter rows02 later - __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); - __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br10, br11); - pairs_hi = _mm256_unpackhi_epi16(br10, br11); - __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - - // Deferred - __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r08, r19); - pairs_hi = _mm256_unpackhi_epi16(r08, r19); - __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, 
taps[0]); // Firs 01 67 - __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); - __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); - - __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r412, r513); - pairs_hi = _mm256_unpackhi_epi16(r412, r513); - __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 - __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i accu02_lo, accu02_hi; - __m256i accu46_lo, accu46_hi; - - accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); - - accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); - - accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); - - accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); - - accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); - accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); - accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); - accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); - - __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); - __m256i rows46 = _mm256_packs_epi32(accu46_lo, accu46_hi); - - __m128i filtered0 = _mm256_castsi256_si128(rows02); - __m128i filtered2 = _mm256_extracti128_si256(rows02, 1); - __m128i filtered4 = _mm256_castsi256_si128(rows46); - __m128i filtered6 = _mm256_extracti128_si256(rows46, 1); - - _mm_storeu_si128((__m128i*)(out + 0 * out_stride), filtered0); - _mm_storeu_si128((__m128i*)(out + 2 * out_stride), filtered2); - _mm_storeu_si128((__m128i*)(out + 4 * out_stride), filtered4); - _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); -} - -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) -{ - // Filter even rows - filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 - - // Filter odd rows - filter_row_ver_16b_8x1_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 - -} - -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *filters, int16_t *data, int16_t stride, int16_t *out, int out_stride) -{ - // Filter even rows - filter_row_ver_16b_8x1_no_round_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 - - // Filter odd rows - filter_row_ver_16b_8x1_no_round_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 - -} - static void kvz_ipol_8tap_hor_px_im_avx2(int8_t *filter, int width, int height, @@ -1122,9 +651,6 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // HORIZONTAL STEP // Integer pixels - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { for (x = 0; x + 7 < width; x += 8) { @@ -1147,9 +673,6 @@ 
static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // Half pixels - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(fir2, &taps_01_23, &taps_45_67); - kvz_ipol_8tap_hor_px_im_avx2(fir2, width, height + 1, src + 1, src_stride, hor_pos2, hor_stride); // Write the first column in contiguous memory @@ -1166,9 +689,6 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_pixel *out_t = filtered[2]; kvz_pixel *out_b = filtered[3]; - __m256i taps[4]; - kvz_init_ver_filter_taps(fir0, taps); - // Right int16_t *im = &hor_pos2[hor_stride]; kvz_pixel *dst = out_r; @@ -1186,7 +706,6 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e out_l[y * dst_stride + x] = sample; } - kvz_init_ver_filter_taps(fir2, taps); // Top im = hor_pos0; dst = out_t; @@ -1240,8 +759,6 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco kvz_pixel *out_bl = filtered[2]; kvz_pixel *out_br = filtered[3]; - __m256i taps[4]; - kvz_init_ver_filter_taps(fir2, taps); // Top-Right int16_t *im = hor_pos2; kvz_pixel *dst = out_tr; @@ -1336,13 +853,7 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - // Left QPEL - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir_l, &taps_01_23, &taps_45_67); - int sample_off_y = hpel_off_y < 0 ? 0 : 1; kvz_ipol_8tap_hor_px_im_avx2(hor_fir_l, width, height + 1, src + 1, src_stride, hor_pos_l, hor_stride); @@ -1355,7 +866,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // Right QPEL - kvz_init_filter_taps(hor_fir_r, &taps_01_23, &taps_45_67); kvz_ipol_8tap_hor_px_im_avx2(hor_fir_r, width, height + 1, src + 1, src_stride, hor_pos_r, hor_stride); // Write the first column in contiguous memory @@ -1377,12 +887,8 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; - __m256i taps[4]; - // Left QPEL (1/4 or 3/4 x positions) // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_l, taps); - int16_t *im = &hor_pos_l[sample_off_y * hor_stride]; kvz_pixel *dst = out_l; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_l, width, height, im, hor_stride, dst, dst_stride); @@ -1406,8 +912,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Right QPEL (3/4 or 1/4 x positions) // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_r, taps); - im = &hor_pos_r[sample_off_y * hor_stride]; dst = out_r; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_r, width, height, im, hor_stride, dst, dst_stride); @@ -1433,7 +937,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Top QPEL (1/4 or 3/4 y positions) // Filter block and then filter column and align if neccessary int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); - kvz_init_ver_filter_taps(ver_fir_t, taps); im = &hor_hpel_pos[off_y_fir_t * hor_stride]; dst = out_t; @@ -1458,7 +961,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Bottom QPEL (3/4 or 1/4 y positions) // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_b, taps); im = &hor_hpel_pos[off_y_fir_b * hor_stride]; dst = out_b; @@ -1529,11 +1031,8 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; - __m256i taps[4]; // Top-left QPEL // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_t, taps); - int16_t *im = &hor_pos_l[off_y_fir_t * hor_stride]; kvz_pixel *dst = out_tl; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); @@ -1581,8 +1080,6 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Bottom-left QPEL // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_b, taps); - im = &hor_pos_l[off_y_fir_b * hor_stride]; dst = out_bl; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); From 5a70b49f690713cdae2604933a12428e402a7958 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sat, 6 Mar 2021 18:09:57 +0200 Subject: [PATCH 12/19] Require 64-bit build for AVX2 interpolation filter functions --- src/strategies/avx2/ipol-avx2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 987461c6..8db217aa 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -24,7 +24,7 @@ #include "strategies/avx2/ipol-avx2.h" -#if COMPILE_INTEL_AVX2 +#if COMPILE_INTEL_AVX2 && defined X86_64 #include #include #include @@ -1236,12 +1236,12 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const e kvz_ipol_4tap_ver_hi_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -#endif //COMPILE_INTEL_AVX2 +#endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) { bool success = true; -#if COMPILE_INTEL_AVX2 +#if COMPILE_INTEL_AVX2 && defined X86_64 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); @@ -1252,6 +1252,6 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } -#endif //COMPILE_INTEL_AVX2 +#endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; } From 4314f3a9a7980657f15ad012c53afa04b8af3fe5 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sat, 6 Mar 2021 19:46:36 +0200 Subject: [PATCH 13/19] Rename some interpolation functions and strategies for consistency --- src/inter.c | 14 +++++++------- src/strategies/avx2/ipol-avx2.c | 24 ++++++++++++------------ src/strategies/generic/ipol-generic.c | 8 ++++---- src/strategies/generic/ipol-generic.h | 4 ++-- 
src/strategies/strategies-ipol.c | 4 ++-- src/strategies/strategies-ipol.h | 12 ++++++------ 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/inter.c b/src/inter.c index 11ed73ac..95351a5e 100644 --- a/src/inter.c +++ b/src/inter.c @@ -94,7 +94,7 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, mv_param); } -static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, +static void inter_recon_frac_luma_hi(const encoder_state_t *const state, const kvz_picture *const ref, int32_t xpos, int32_t ypos, @@ -136,7 +136,7 @@ static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, epol_args.ext_s = &ext_s; kvz_get_extended_block(&epol_args); - kvz_sample_14bit_quarterpel_luma(state->encoder_control, + kvz_sample_quarterpel_luma_hi(state->encoder_control, ext_origin, ext_s, block_width, @@ -219,7 +219,7 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, mv_param); } -static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, +static void inter_recon_frac_chroma_hi(const encoder_state_t *const state, const kvz_picture *const ref, int32_t xpos, int32_t ypos, @@ -264,7 +264,7 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, epol_args.ext_s = &ext_s; kvz_get_extended_block(&epol_args); - kvz_sample_14bit_octpel_chroma(state->encoder_control, + kvz_sample_octpel_chroma_hi(state->encoder_control, ext_origin, ext_s, block_width / 2, @@ -278,7 +278,7 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, // Chroma V epol_args.src = ref->v; kvz_get_extended_block(&epol_args); - kvz_sample_14bit_octpel_chroma(state->encoder_control, + kvz_sample_octpel_chroma_hi(state->encoder_control, ext_origin, ext_s, block_width / 2, @@ -378,7 +378,7 @@ static void inter_recon_unipred(const encoder_state_t * const state, if (fractional_luma) { // With a fractional MV, do interpolation. if (state->encoder_control->cfg.bipred && hi_prec_out) { - inter_recon_14bit_frac_luma(state, ref, + inter_recon_frac_luma_hi(state, ref, pu_in_tile.x, pu_in_tile.y, width, height, mv_param, hi_prec_out); @@ -418,7 +418,7 @@ static void inter_recon_unipred(const encoder_state_t * const state, if (fractional_luma || fractional_chroma) { // With a fractional MV, do interpolation. 
if (state->encoder_control->cfg.bipred && hi_prec_out) { - inter_recon_14bit_frac_chroma(state, ref, + inter_recon_frac_chroma_hi(state, ref, pu_in_tile.x, pu_in_tile.y, width, height, mv_param, hi_prec_out); diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 8db217aa..c3ea92e3 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -409,7 +409,7 @@ int16_t dst_stride) } } -static void kvz_ipol_4tap_hor_px_hi_avx2(int8_t *filter, +static void kvz_ipol_4tap_hor_px_im_avx2(int8_t *filter, int width, int height, kvz_pixel *src, @@ -473,7 +473,7 @@ static void kvz_ipol_4tap_hor_px_hi_avx2(int8_t *filter, } } -static void kvz_ipol_4tap_ver_hi_px_avx2(int8_t *filter, +static void kvz_ipol_4tap_ver_im_px_avx2(int8_t *filter, int width, int height, int16_t *src, @@ -547,7 +547,7 @@ static void kvz_ipol_4tap_ver_hi_px_avx2(int8_t *filter, } } -static void kvz_ipol_4tap_ver_hi_hi_avx2(int8_t *filter, +static void kvz_ipol_4tap_ver_im_hi_avx2(int8_t *filter, int width, int height, int16_t *src, @@ -1155,7 +1155,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } -static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, +static void kvz_sample_quarterpel_luma_hi_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, @@ -1204,11 +1204,11 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t *const encoder ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); - kvz_ipol_4tap_ver_hi_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); + kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_im_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const encoder, +static void kvz_sample_octpel_chroma_hi_avx2(const encoder_control_t *const encoder, kvz_pixel *src, int16_t src_stride, int width, @@ -1221,7 +1221,7 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const e { // TODO: Optimizations for rest of the blocks (for example 2x8). 
if (width % 4 != 0) { - kvz_sample_14bit_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + kvz_sample_octpel_chroma_hi_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; @@ -1232,8 +1232,8 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const e ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); - kvz_ipol_4tap_ver_hi_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); + kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } #endif //COMPILE_INTEL_AVX2 && defined X86_64 @@ -1249,8 +1249,8 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "avx2", 40, &kvz_filter_qpel_blocks_diag_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "avx2", 40, &kvz_sample_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "avx2", 40, &kvz_sample_octpel_chroma_avx2); - success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); - success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "avx2", 40, &kvz_sample_quarterpel_luma_hi_avx2); + success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "avx2", 40, &kvz_sample_octpel_chroma_hi_avx2); } #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index b9024c98..4b777c86 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, } } -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; @@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k } } -void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; @@ -780,8 
+780,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic); - success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic); - success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic); + success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic); + success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic); success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic); return success; diff --git a/src/strategies/generic/ipol-generic.h b/src/strategies/generic/ipol-generic.h index f176b4cd..85eb2931 100644 --- a/src/strategies/generic/ipol-generic.h +++ b/src/strategies/generic/ipol-generic.h @@ -32,9 +32,9 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth); void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); #endif //STRATEGIES_IPOL_GENERIC_H_ diff --git a/src/strategies/strategies-ipol.c b/src/strategies/strategies-ipol.c index f889b716..0a8990f1 100644 --- a/src/strategies/strategies-ipol.c +++ b/src/strategies/strategies-ipol.c @@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; epol_func *kvz_get_extended_block; kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; -kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma; -kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma; +kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi; +kvz_sample_octpel_chroma_hi_func * 
kvz_sample_octpel_chroma_hi; int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) { diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index 31d15bc4..b47da64f 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -70,8 +70,8 @@ typedef void(epol_func)(kvz_epol_args *args); typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); // Declare function pointers. 
extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; @@ -81,8 +81,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; extern epol_func * kvz_get_extended_block; extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; -extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma; -extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma; +extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi; +extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi; int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth); @@ -95,8 +95,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth); {"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \ {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \ {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \ - {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \ - {"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \ + {"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \ + {"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \ {"get_extended_block", (void**) &kvz_get_extended_block}, \ From 475f1d79d5175970aed03f8ba96aa9e62010ad96 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 7 Mar 2021 01:27:55 +0200 Subject: [PATCH 14/19] Add some defines for important interpolation related sizes --- src/inter.c | 16 ++++++++-------- src/search_inter.c | 10 +++++----- src/strategies/avx2/ipol-avx2.c | 24 ++++++++++++------------ src/strategies/generic/ipol-generic.c | 16 ++++++++-------- src/strategies/strategies-ipol.h | 17 ++++++++++++++++- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/src/inter.c b/src/inter.c index 95351a5e..ff95741f 100644 --- a/src/inter.c +++ b/src/inter.c @@ -53,9 +53,9 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 3); // Space for extrapolated pixels and the part from the picture. - // One extra row for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -107,9 +107,9 @@ static void inter_recon_frac_luma_hi(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 3); // Space for extrapolated pixels and the part from the picture. - // One extra row for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -161,9 +161,9 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 7); // Space for extrapolated pixels and the part from the picture. - // Three extra rows for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. 
- kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -232,9 +232,9 @@ static void inter_recon_frac_chroma_hi(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 7); // Space for extrapolated pixels and the part from the picture. - // Three extra rows for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; diff --git a/src/search_inter.c b/src/search_inter.c index 81e6fb92..a7611248 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -992,11 +992,11 @@ static void search_frac(inter_search_info_t *info) unsigned costs[4] = { 0 }; - ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH]; + ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE]; // Storage buffers for intermediate horizontally filtered results. // Have the first columns in contiguous memory for vectorization. - ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD]; int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1]; const kvz_picture *ref = info->ref; @@ -1013,9 +1013,9 @@ static void search_frac(inter_search_info_t *info) int8_t sample_off_y = 0; // Space for (possibly) extrapolated pixels and the part from the picture - // One extra column for ME and two extra columns for ME and AVX2 + // One extra row and column compared to normal interpolation and some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[(KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 2)]; + kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -1031,7 +1031,7 @@ static void search_frac(inter_search_info_t *info) .pad_l = KVZ_LUMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, }; // Initialize separately. 
Gets rid of warning diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index c3ea92e3..46173a53 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -618,8 +618,8 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -730,8 +730,8 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -809,8 +809,8 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -989,8 +989,8 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -1147,7 +1147,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco // Buffer for intermediate values with one extra row // because the loop writes two rows each iteration. - ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD]; int16_t hor_stride = LCU_WIDTH; kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); @@ -1172,7 +1172,7 @@ static void kvz_sample_quarterpel_luma_hi_avx2(const encoder_control_t * const e // Buffer for intermediate values with one extra row // because the loop writes two rows each iteration. - ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD]; int16_t hor_stride = LCU_WIDTH; kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); @@ -1201,7 +1201,7 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t *const encoder // Buffer for intermediate values with 3 extra rows // because the loop writes four rows each iteration. 
- ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD]; int16_t hor_stride = LCU_WIDTH_C; kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); @@ -1229,7 +1229,7 @@ static void kvz_sample_octpel_chroma_hi_avx2(const encoder_control_t *const enco // Buffer for intermediate values with 3 extra rows // because the loop writes four rows each iteration. - ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD]; int16_t hor_stride = LCU_WIDTH_C; kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 4b777c86..8b5d76c3 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index b47da64f..31680ec7 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -31,11 +31,26 @@ #include "kvazaar.h" #include "search_inter.h" +// AVX2 implementation of horizontal filter reads and +// writes two rows for luma and four for chroma at a time. +// Extra vertical padding is added to prevent segfaults. +// Horizontal padding is not needed even if one extra byte +// is read because kvz_image_alloc adds enough padding. 
+#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA) +#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA) +#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH) +#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C) + +// On top of basic interpolation, FME needs one extra +// column and row for ME (left and up). Adding the +// extra row happens to satisfy AVX2 requirements for +// row count. No other extra rows are needed. +#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1)) typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t sample_off_x, int8_t sample_off_y); typedef struct { From 7ce68761c289bb6876ac9cd7825a0bea6f41c594 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 7 Mar 2021 17:32:17 +0200 Subject: [PATCH 15/19] Add a reminder to fix a rare case for bipred --- src/strategies/avx2/ipol-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 46173a53..fa72cf27 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -398,7 +398,7 @@ int16_t dst_stride) __m256i sum0123 = _mm256_add_epi32(dot01, dot23); __m256i sum4567 = _mm256_add_epi32(dot45, dot67); __m256i sum = _mm256_add_epi32(sum0123, sum4567); - sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_srai_epi32(sum, shift2); // TODO: -8192 offsetting for extreme values sum = _mm256_packs_epi32(sum, sum); int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; From 33295bf350f521b1122591d57469996575438822 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 7 Mar 2021 19:36:20 +0200 Subject: [PATCH 16/19] Use AVX2 luma interpolation for SMP and AMP as well --- src/strategies/avx2/ipol-avx2.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index fa72cf27..e63cfb8a 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1136,12 +1136,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { - kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } - + // TODO: horizontal and vertical only filtering int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; From b72ab583b4d3fcc6abdf023eaf82eaa19c9cd076 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 8 Mar 2021 16:05:34 +0200 Subject: [PATCH 17/19] Handle "don't care" rows in the end separately --- src/image.c | 1 + src/inter.c | 12 ++++++++---- src/search_inter.c | 1 + src/strategies/generic/ipol-generic.c | 15 ++++++++++++--- src/strategies/strategies-ipol.h | 1 + 5 
files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/image.c b/src/image.c index 71e791dd..7572b1f3 100644 --- a/src/image.c +++ b/src/image.c @@ -497,6 +497,7 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic, .pad_r = 0, .pad_t = 0, .pad_b = 0, + .pad_b_simd = 0, }; // Initialize separately. Gets rid of warning diff --git a/src/inter.c b/src/inter.c index ff95741f..b96b082c 100644 --- a/src/inter.c +++ b/src/inter.c @@ -71,7 +71,8 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, .pad_l = KVZ_LUMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_b_simd = 1 // One row for AVX2 }; // Initialize separately. Gets rid of warning @@ -125,7 +126,8 @@ static void inter_recon_frac_luma_hi(const encoder_state_t *const state, .pad_l = KVZ_LUMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_b_simd = 1 // One row for AVX2 }; // Initialize separately. Gets rid of warning @@ -182,7 +184,8 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, .pad_l = KVZ_CHROMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_b_simd = 3 // Three rows for AVX2 }; // Initialize separately. Gets rid of warning @@ -253,7 +256,8 @@ static void inter_recon_frac_chroma_hi(const encoder_state_t *const state, .pad_l = KVZ_CHROMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_b_simd = 3 // Three rows for AVX2 }; // Initialize separately. Gets rid of warning diff --git a/src/search_inter.c b/src/search_inter.c index a7611248..029291f7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1032,6 +1032,7 @@ static void search_frac(inter_search_info_t *info) .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h }; // Initialize separately. 
Gets rid of warning diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 8b5d76c3..461cc5c2 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -731,7 +731,7 @@ void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder void kvz_get_extended_block_generic(kvz_epol_args *args) { int min_y = args->blk_y - args->pad_t; - int max_y = args->blk_y + args->blk_h + args->pad_b - 1; + int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1; bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h); int min_x = args->blk_x - args->pad_l; @@ -744,12 +744,15 @@ void kvz_get_extended_block_generic(kvz_epol_args *args) { *args->ext_s = args->pad_l + args->blk_w + args->pad_r; *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l; + // Note that stride equals width here. int cnt_l = CLIP(0, *args->ext_s, -min_x); int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1)); int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r); - // For each row including padding - for (int y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { + // For each row including real padding. + // Don't read "don't care" values (SIMD padding). Zero them out. + int y; + for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y); kvz_pixel sample_l = *(args->src + clipped_y * args->src_s); @@ -762,6 +765,12 @@ void kvz_get_extended_block_generic(kvz_epol_args *args) { for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i); for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = sample_r; } + + for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) { + kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s); + FILL_ARRAY(dst, 0, *args->ext_s); + } + } else { *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l); diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index 31680ec7..79d4dd4c 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -69,6 +69,7 @@ typedef struct { int pad_r; // Right int pad_t; // Top int pad_b; // Bottom + int pad_b_simd; // "Don't care" rows in the end. Zeroed out. // Buffer for possible extrapolation. Free memory provided by the caller. 
kvz_pixel *buf; From dad3d6818ef18cef7183cc079a0ffab6fb678e44 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 8 Mar 2021 16:49:37 +0200 Subject: [PATCH 18/19] Only read left and right border pixels if necessary --- src/strategies/generic/ipol-generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 461cc5c2..e5c5cef6 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -755,15 +755,15 @@ void kvz_get_extended_block_generic(kvz_epol_args *args) { for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y); - kvz_pixel sample_l = *(args->src + clipped_y * args->src_s); - kvz_pixel sample_r = *(args->src + clipped_y * args->src_s + args->src_w - 1); + kvz_pixel *sample_l = args->src + clipped_y * args->src_s; + kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1; kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0); kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s); kvz_pixel *dst_m = dst_l + cnt_l; kvz_pixel *dst_r = dst_m + cnt_m; - for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = sample_l; + for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l; for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i); - for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = sample_r; + for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r; } for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) { From 5bc4cdf401114e0abedd5fde292b6111113dfbf8 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 8 Mar 2021 22:01:27 +0200 Subject: [PATCH 19/19] Update TSAN suppressions --- tests/tsan_suppressions.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/tsan_suppressions.txt b/tests/tsan_suppressions.txt index d2f565b6..9f43b6d5 100644 --- a/tests/tsan_suppressions.txt +++ b/tests/tsan_suppressions.txt @@ -1,3 +1,4 @@ -race:kvz_eight_tap_filter_hor_8x1_avx2 +# AVX2 interpolation reads some extra pixels +race:kvz_ipol_8tap_hor_px_im_avx2 race:kvz_filter_hpel_blocks_hor_ver_luma_avx2 -race:kvz_eight_tap_filter_hor_avx2 \ No newline at end of file +race:kvz_eight_tap_filter_hor_avx2