From 84222cf3e7f3c3d20e68b4d0f3dba6eac8b7fe65 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 19:48:57 +0200 Subject: [PATCH 01/19] Replace old block extrapolation with more capable one. Separate paddings for different directions can be now specified. --- src/image.c | 55 ++-- src/inter.c | 386 ++++++++++++++------------ src/search_inter.c | 44 ++- src/strategies/avx2/ipol-avx2.c | 145 +++------- src/strategies/generic/ipol-generic.c | 77 +++-- src/strategies/strategies-ipol.h | 30 +- 6 files changed, 364 insertions(+), 373 deletions(-) diff --git a/src/image.c b/src/image.c index 404bc48c..9fa47c64 100644 --- a/src/image.c +++ b/src/image.c @@ -477,33 +477,42 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic, ref->stride) >> (KVZ_BIT_DEPTH - 8); } else { // Extrapolate pixels from outside the frame. - kvz_extended_block block; - kvz_get_extended_block(pic_x, - pic_y, - ref_x - pic_x, - ref_y - pic_y, - 0, - 0, - ref->y, - ref->width, - ref->height, - 0, - block_width, - block_height, - &block); + + // Space for extrapolated pixels and the part from the picture + // The extrapolation function will set the pointers and stride. + kvz_pixel ext_buffer[LCU_LUMA_SIZE]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = ref_x, + .blk_y = ref_y, + .blk_w = block_width, + .blk_h = block_height, + .pad_l = 0, + .pad_r = 0, + .pad_t = 0, + .pad_b = 0, + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; + + kvz_get_extended_block(&epol_args); const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x]; unsigned satd = kvz_satd_any_size(block_width, - block_height, - pic_data, - pic->stride, - block.buffer, - block.stride) >> (KVZ_BIT_DEPTH - 8); - - if (block.malloc_used) { - FREE_POINTER(block.buffer); - } + block_height, + pic_data, + pic->stride, + ext_origin, + ext_s) >> (KVZ_BIT_DEPTH - 8); return satd; } diff --git a/src/inter.c b/src/inter.c index 63718483..f34a2a2a 100644 --- a/src/inter.c +++ b/src/inter.c @@ -40,224 +40,242 @@ typedef struct { } merge_candidates_t; -static void inter_recon_frac_luma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - lcu_t *lcu) +static void inter_recon_frac_luma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + lcu_t *lcu) { int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - // Fractional luma 1/4-pel - kvz_extended_block src = {0, 0, 0, 0}; + // Space for extrapolated pixels and the part from the picture. + // One extra row for AVX2. + // The extrapolation function will set the pointers and stride. 
+ kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2), + .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2), + .blk_w = block_width, + .blk_h = block_height, + .pad_l = KVZ_LUMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_t = KVZ_LUMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - // Fractional luma - kvz_get_extended_block(xpos, - ypos, - mv_param[0] >> 2, - mv_param[1] >> 2, - state->tile->offset_x, - state->tile->offset_y, - ref->y, - ref->width, - ref->height, - KVZ_LUMA_FILTER_TAPS, - block_width, - block_height, - &src); + kvz_get_extended_block(&epol_args); kvz_sample_quarterpel_luma(state->encoder_control, - src.orig_topleft, - src.stride, - block_width, - block_height, - lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), - LCU_WIDTH, - mv_frac_x, - mv_frac_y, - mv_param); - - if (src.malloc_used) free(src.buffer); + ext_origin, + ext_s, + block_width, + block_height, + lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), + LCU_WIDTH, + mv_frac_x, + mv_frac_y, + mv_param); } -static void inter_recon_14bit_frac_luma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - hi_prec_buf_t *hi_prec_out) +static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + hi_prec_buf_t *hi_prec_out) { int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - // Fractional luma 1/4-pel - kvz_extended_block src = { 0, 0, 0, 0 }; + // Space for extrapolated pixels and the part from the picture. + // One extra row for AVX2. + // The extrapolation function will set the pointers and stride. 
+ kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2), + .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2), + .blk_w = block_width, + .blk_h = block_height, + .pad_l = KVZ_LUMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_t = KVZ_LUMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - // Fractional luma - kvz_get_extended_block(xpos, - ypos, - mv_param[0] >> 2, - mv_param[1] >> 2, - state->tile->offset_x, - state->tile->offset_y, - ref->y, - ref->width, - ref->height, - KVZ_LUMA_FILTER_TAPS, - block_width, - block_height, - &src); + kvz_get_extended_block(&epol_args); kvz_sample_14bit_quarterpel_luma(state->encoder_control, - src.orig_topleft, - src.stride, - block_width, - block_height, - hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), - LCU_WIDTH, - mv_frac_x, - mv_frac_y, - mv_param); - - if (src.malloc_used) free(src.buffer); + ext_origin, + ext_s, + block_width, + block_height, + hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), + LCU_WIDTH, + mv_frac_x, + mv_frac_y, + mv_param); } -static void inter_recon_frac_chroma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - lcu_t *lcu) +static void inter_recon_frac_chroma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + lcu_t *lcu) { int mv_frac_x = (mv_param[0] & 7); int mv_frac_y = (mv_param[1] & 7); - // Translate to chroma - xpos >>= 1; - ypos >>= 1; - block_width >>= 1; - block_height >>= 1; + // Space for extrapolated pixels and the part from the picture. + // Three extra rows for AVX2. + // The extrapolation function will set the pointers and stride. 
+ kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; - // Fractional chroma 1/8-pel - kvz_extended_block src_u = { 0, 0, 0, 0 }; - kvz_extended_block src_v = { 0, 0, 0, 0 }; + // Chroma U + // Divisions by 2 due to 4:2:0 chroma subsampling + kvz_epol_args epol_args = { + .src = ref->u, + .src_w = ref->width / 2, + .src_h = ref->height / 2, + .src_s = ref->stride / 2, + .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3), + .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3), + .blk_w = block_width / 2, + .blk_h = block_height / 2, + .pad_l = KVZ_CHROMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_t = KVZ_CHROMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - //Fractional chroma U - kvz_get_extended_block(xpos, ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->u, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_u); - kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width, - block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + kvz_get_extended_block(&epol_args); + kvz_sample_octpel_chroma(state->encoder_control, + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); - //Fractional chroma V - kvz_get_extended_block(xpos, ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->v, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_v); - kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, - block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); - - if (src_u.malloc_used) free(src_u.buffer); - if (src_v.malloc_used) free(src_v.buffer); + // Chroma V + epol_args.src = ref->v; + kvz_get_extended_block(&epol_args); + kvz_sample_octpel_chroma(state->encoder_control, + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); } -static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t block_width, - int32_t block_height, - const int16_t mv_param[2], - hi_prec_buf_t *hi_prec_out) +static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, + const kvz_picture *const ref, + int32_t xpos, + int32_t ypos, + int32_t block_width, + int32_t block_height, + const int16_t mv_param[2], + hi_prec_buf_t *hi_prec_out) { int mv_frac_x = (mv_param[0] & 7); int mv_frac_y = (mv_param[1] & 7); - // Translate to chroma - xpos >>= 1; - ypos >>= 1; - block_width >>= 1; - block_height >>= 1; + // Space for extrapolated pixels and the part from the picture. + // Three extra rows for AVX2. 
+ // The extrapolation function will set the pointers and stride. + kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; - // Fractional chroma 1/8-pel - kvz_extended_block src_u = { 0, 0, 0, 0 }; - kvz_extended_block src_v = { 0, 0, 0, 0 }; + // Chroma U + // Divisions by 2 due to 4:2:0 chroma subsampling + kvz_epol_args epol_args = { + .src = ref->u, + .src_w = ref->width / 2, + .src_h = ref->height / 2, + .src_s = ref->stride / 2, + .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3), + .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3), + .blk_w = block_width / 2, + .blk_h = block_height / 2, + .pad_l = KVZ_CHROMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_t = KVZ_CHROMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; - //Fractional chroma U - kvz_get_extended_block(xpos, - ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->u, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_u); + kvz_get_extended_block(&epol_args); kvz_sample_14bit_octpel_chroma(state->encoder_control, - src_u.orig_topleft, - src_u.stride, - block_width, - block_height, - hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), - LCU_WIDTH_C, - mv_frac_x, - mv_frac_y, - mv_param); + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); - //Fractional chroma V - kvz_get_extended_block(xpos, - ypos, - (mv_param[0] >> 2) >> 1, - (mv_param[1] >> 2) >> 1, - state->tile->offset_x >> 1, - state->tile->offset_y >> 1, - ref->v, - ref->width >> 1, - ref->height >> 1, - KVZ_CHROMA_FILTER_TAPS, - block_width, - block_height, - &src_v); + // Chroma V + epol_args.src = ref->v; + kvz_get_extended_block(&epol_args); kvz_sample_14bit_octpel_chroma(state->encoder_control, - src_v.orig_topleft, - src_v.stride, - block_width, - block_height, - hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), - LCU_WIDTH_C, - mv_frac_x, - mv_frac_y, - mv_param); - - if (src_u.malloc_used) free(src_u.buffer); - if (src_v.malloc_used) free(src_v.buffer); + ext_origin, + ext_s, + block_width / 2, + block_height / 2, + hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C), + LCU_WIDTH_C, + mv_frac_x, + mv_frac_y, + mv_param); } diff --git a/src/search_inter.c b/src/search_inter.c index e13a491e..0fcd70c6 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -992,7 +992,6 @@ static void search_frac(inter_search_info_t *info) unsigned costs[4] = { 0 }; - kvz_extended_block src = { 0, 0, 0, 0 }; ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH]; // Storage buffers for intermediate horizontally filtered results. 
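/* A minimal sketch (not part of the patch) of the size requirement the new
 * kvz_get_extended_block() interface places on the caller-provided buffer.
 * The helper name below is hypothetical; the fields and the stride formula
 * (pad_l + blk_w + pad_r) are taken from kvz_get_extended_block_generic()
 * and the kvz_epol_args definition added later in this patch
 * (src/strategies/strategies-ipol.h). */
static int epol_buffer_is_large_enough(const kvz_epol_args *args, size_t buf_len)
{
  /* When the requested area crosses a picture border, the extrapolated block
   * written into args->buf is (pad_l + blk_w + pad_r) x (pad_t + blk_h + pad_b)
   * pixels, and *args->ext_s is set to its width. Otherwise args->buf is not
   * touched and the pointers refer directly into the reference picture. */
  size_t ext_w = (size_t)(args->pad_l + args->blk_w + args->pad_r);
  size_t ext_h = (size_t)(args->pad_t + args->blk_h + args->pad_b);
  return ext_w * ext_h <= buf_len;
}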
@@ -1013,20 +1012,41 @@ static void search_frac(inter_search_info_t *info) int8_t sample_off_x = 0; int8_t sample_off_y = 0; - kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1, - state->tile->offset_x, - state->tile->offset_y, - ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS, - internal_width+1, internal_height+1, - &src); + // Space for (possibly) extrapolated pixels and the part from the picture + // One extra column for ME and two extra columns for ME and AVX2 + // The extrapolation function will set the pointers and stride. + kvz_pixel ext_buffer[(KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 2)]; + kvz_pixel *ext = NULL; + kvz_pixel *ext_origin = NULL; + int ext_s = 0; + kvz_epol_args epol_args = { + .src = ref->y, + .src_w = ref->width, + .src_h = ref->height, + .src_s = ref->stride, + .blk_x = state->tile->offset_x + orig.x + mv.x - 1, + .blk_y = state->tile->offset_y + orig.y + mv.y - 1, + .blk_w = internal_width + 1, // TODO: real width + .blk_h = internal_height + 1, // TODO: real height + .pad_l = KVZ_LUMA_FILTER_OFFSET, + .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_t = KVZ_LUMA_FILTER_OFFSET, + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .buf = ext_buffer, + .ext = &ext, + .ext_origin = &ext_origin, + .ext_s = &ext_s + }; + + kvz_get_extended_block(&epol_args); kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x; int tmp_stride = pic->stride; // Search integer position costs[0] = kvz_satd_any_size(width, height, - tmp_pic, tmp_stride, - src.orig_topleft + src.stride + 1, src.stride); + tmp_pic, tmp_stride, + ext_origin + ext_s + 1, ext_s); costs[0] += info->mvd_cost_func(state, mv.x, mv.y, 2, @@ -1056,8 +1076,8 @@ static void search_frac(inter_search_info_t *info) const int mv_shift = (step < 2) ? 
1 : 0; filter_steps[step](state->encoder_control, - src.orig_topleft, - src.stride, + ext_origin, + ext_s, internal_width, internal_height, filtered, @@ -1131,8 +1151,6 @@ static void search_frac(inter_search_info_t *info) info->best_mv = mv; info->best_cost = best_cost; info->best_bitcost = best_bitcost; - - if (src.malloc_used) free(src.buffer); } /** diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index c6fd0d8a..8ba28e66 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -25,25 +25,23 @@ #include "strategies/avx2/ipol-avx2.h" #if COMPILE_INTEL_AVX2 -#include "kvazaar.h" - #include #include #include #include "encoder.h" +#include "kvazaar.h" #include "search_inter.h" #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "strategyselector.h" #include "strategies/generic/ipol-generic.h" -#if KVZ_BIT_DEPTH == 8 extern int8_t kvz_g_luma_filter[4][8]; extern int8_t kvz_g_chroma_filter[8][4]; -static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, uint8_t *data) +static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) { __m128i fir = _mm_loadl_epi64((__m128i*)filter); __m128i row = _mm_loadl_epi64((__m128i*)data); @@ -102,7 +100,7 @@ static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 } -static void kvz_eight_tap_filter_hor_8x1_avx2(uint8_t *data, int16_t * out, +static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, __m256i *shuf_01_23, __m256i *shuf_45_67, __m256i *taps_01_23, __m256i *taps_45_67) { @@ -119,7 +117,7 @@ static void kvz_eight_tap_filter_hor_8x1_avx2(uint8_t *data, int16_t * out, _mm_storeu_si128((__m128i*)out, filtered); } -static void kvz_four_tap_filter_hor_4x4_avx2(uint8_t *data, int stride, int16_t * out, int out_stride, +static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, __m256i *shuf_01, __m256i *shuf_23, __m256i *taps_01, __m256i *taps_23) { @@ -145,7 +143,7 @@ static void kvz_four_tap_filter_hor_4x4_avx2(uint8_t *data, int stride, int16_t _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); } -static void kvz_four_tap_filter_hor_4xN_avx2(uint8_t *data, int stride, int16_t * out, int out_stride, +static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, __m256i *shuf_01_23, __m256i *taps_01_23, int rows) { @@ -179,7 +177,7 @@ static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data return filtered; } -static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out) +static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out) { // Interpolation filter shifts int32_t shift2 = 6; @@ -245,7 +243,7 @@ static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *dat _mm_storel_epi64((__m128i*)out, filtered); } -static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, uint8_t *out, int16_t out_stride) +static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) { // Interpolation filter shifts int32_t shift2 = 6; @@ -368,7 +366,7 @@ static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int1 
_mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); } -INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, uint8_t * out, int64_t out_stride) +INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) { // Interpolation filter shifts int32_t shift2 = 6; @@ -591,7 +589,7 @@ INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t s _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); } -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, uint8_t *out, int out_stride) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) { // Filter even rows filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 @@ -612,11 +610,11 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -697,10 +695,10 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // VERTICAL STEP - uint8_t *out_l = filtered[0]; - uint8_t *out_r = filtered[1]; - uint8_t *out_t = filtered[2]; - uint8_t *out_b = filtered[3]; + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; __m256i taps[4]; kvz_init_ver_filter_taps(fir0, taps); @@ -748,11 +746,11 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -776,10 +774,10 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t *col_pos2 = hor_first_cols[2]; // VERTICAL STEP - uint8_t *out_tl = filtered[0]; - uint8_t *out_tr = filtered[1]; - uint8_t *out_bl = filtered[2]; - uint8_t *out_br = filtered[3]; + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; __m256i taps[4]; kvz_init_ver_filter_taps(fir2, taps); @@ -831,11 +829,11 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco } static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -930,10 +928,10 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // VERTICAL STEP - uint8_t *out_l = filtered[0]; - uint8_t *out_r = 
filtered[1]; - uint8_t *out_t = filtered[2]; - uint8_t *out_b = filtered[3]; + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; @@ -1058,11 +1056,11 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t filtered[4][LCU_WIDTH * LCU_WIDTH], + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], @@ -1090,10 +1088,10 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t hor_stride = LCU_WIDTH; // VERTICAL STEP - uint8_t *out_tl = filtered[0]; - uint8_t *out_tr = filtered[1]; - uint8_t *out_bl = filtered[2]; - uint8_t *out_br = filtered[3]; + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; @@ -1216,11 +1214,11 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t *dst, + kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, @@ -1270,7 +1268,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, @@ -1325,11 +1323,11 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, - uint8_t *dst, + kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, @@ -1387,7 +1385,7 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encode } static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, - uint8_t *src, + kvz_pixel *src, int16_t src_stride, int width, int height, @@ -1449,73 +1447,12 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const } } -#endif //KVZ_BIT_DEPTH == 8 - -void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filter_size, int width, int height, kvz_extended_block *out) { - - int half_filter_size = filter_size >> 1; - - out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x); - out->stride = ref_width; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 0; - - int min_y = ypos - half_filter_size + off_y + mv_y; - int max_y = min_y + height + filter_size; - int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height); - - int min_x = xpos - half_filter_size + off_x + mv_x; - int max_x = min_x + width + filter_size; - int out_of_bounds_x = (min_x < 0) || 
(max_x >= ref_width); - - int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x; - - if (sample_out_of_bounds){ - // Alloc 5 pixels more than we actually use because AVX2 filter - // functions read up to 5 pixels past the last pixel. - out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size) + 5); - if (!out->buffer){ - fprintf(stderr, "Memory allocation failed!\n"); - assert(0); - } - out->stride = width + filter_size; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 1; - - int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - - for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) { - - // calculate y-pixel offset - coord_y = y + off_y + mv_y; - coord_y = CLIP(0, (ref_height)-1, coord_y); - coord_y *= ref_width; - - if (!out_of_bounds_x){ - memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel)); - } else { - for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) { - - coord_x = x + off_x + mv_x; - coord_x = CLIP(0, (ref_width)-1, coord_x); - - // Store source block data (with extended borders) - out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x]; - } - } - } - } -} - #endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) { bool success = true; #if COMPILE_INTEL_AVX2 -#if KVZ_BIT_DEPTH == 8 - if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); @@ -1526,10 +1463,6 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } -#endif //KVZ_BIT_DEPTH == 8 - - success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2); - #endif //COMPILE_INTEL_AVX2 return success; } diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index f71615c8..75c0eee9 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -728,59 +728,46 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco } -void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filter_size, int width, int height, kvz_extended_block *out) { +void kvz_get_extended_block_generic(kvz_epol_args args) { - int half_filter_size = filter_size >> 1; + int min_y = args.blk_y - args.pad_t; + int max_y = args.blk_y + args.blk_h + args.pad_b - 1; + bool out_of_bounds_y = (min_y < 0) || (max_y >= args.src_h); - out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x); - out->stride = ref_width; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 0; + int min_x = args.blk_x - args.pad_l; + int max_x = args.blk_x + args.blk_w + args.pad_r - 1; + bool out_of_bounds_x = (min_x < 0) || (max_x >= 
args.src_w); - int min_y = ypos - half_filter_size + off_y + mv_y; - int max_y = min_y + height + filter_size; - int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height); + if (out_of_bounds_y || out_of_bounds_x) { - int min_x = xpos - half_filter_size + off_x + mv_x; - int max_x = min_x + width + filter_size; - int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width); + *args.ext = args.buf; + *args.ext_s = args.pad_l + args.blk_w + args.pad_r; + *args.ext_origin = args.buf + args.pad_t * (*args.ext_s) + args.pad_l; - int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x; + int cnt_l = CLIP(0, *args.ext_s, -min_x); + int cnt_r = CLIP(0, *args.ext_s, max_x - (args.src_w - 1)); + int cnt_m = CLIP(0, *args.ext_s, *args.ext_s - cnt_l - cnt_r); - if (sample_out_of_bounds){ - out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size)); - if (!out->buffer){ - fprintf(stderr, "Memory allocation failed!\n"); - assert(0); + // For each row including padding + for (int y = -args.pad_t; y < args.blk_h + args.pad_b; ++y) { + + int clipped_y = CLIP(0, args.src_h - 1, args.blk_y + y); + kvz_pixel sample_l = *(args.src + clipped_y * args.src_s); + kvz_pixel sample_r = *(args.src + clipped_y * args.src_s + args.src_w - 1); + kvz_pixel *src_m = args.src + clipped_y * args.src_s + MAX(min_x, 0); + kvz_pixel *dst_l = args.buf + (y + args.pad_t) * (*args.ext_s); + kvz_pixel *dst_m = dst_l + cnt_l; + kvz_pixel *dst_r = dst_m + cnt_m; + for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = sample_l; + for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i); + for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = sample_r; } - out->stride = width + filter_size; - out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size; - out->malloc_used = 1; + } else { - int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - - for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) { - - // calculate y-pixel offset - coord_y = y + off_y + mv_y; - coord_y = CLIP(0, (ref_height)-1, coord_y); - coord_y *= ref_width; - - if (!out_of_bounds_x){ - memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel)); - } else { - for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) { - - coord_x = x + off_x + mv_x; - coord_x = CLIP(0, (ref_width)-1, coord_x); - - // Store source block data (with extended borders) - out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x]; - } - } - } - } + *args.ext = args.src + (args.blk_y - args.pad_t) * args.src_s + (args.blk_x - args.pad_l); + *args.ext_origin = args.src + args.blk_y * args.src_s + args.blk_x; + *args.ext_s = args.src_s; + } } int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth) diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index ce6608f7..0566507d 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -38,8 +38,34 @@ typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t sample_off_x, int8_t sample_off_y); -typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filter_size, int width, int height, 
kvz_extended_block *out); +typedef struct { + // Source samples + kvz_pixel *src; // Top-left sample + int src_w; // Width + int src_h; // Height + int src_s; // Stride + + // Requested sampling position, base dimensions, and padding + int blk_x; + int blk_y; + int blk_w; // Width + int blk_h; // Height + int pad_l; // Left + int pad_r; // Right + int pad_t; // Top + int pad_b; // Bottom + + // Buffer for possible extrapolation. Free memory provided by the caller. + kvz_pixel *buf; + + // Extended block data. These are set by the function. + kvz_pixel **ext; // Top-left sample with padding + kvz_pixel **ext_origin; // Top-left sample without padding + int *ext_s; // Stride +} kvz_epol_args; + +typedef unsigned(epol_func)(kvz_epol_args *args); + typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); From d9a3225ae5c7cedc55ed59d290cb89daab1808e1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 24 Aug 2020 01:51:24 +0300 Subject: [PATCH 02/19] Add new AVX2 vertical ip filter for high-precision --- src/strategies/avx2/ipol-avx2.c | 97 +++++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 10 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 8ba28e66..1a491407 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1267,6 +1267,90 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } } +static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, +int width, +int height, +int16_t *src, +int16_t src_stride, +int16_t *dst, +int16_t dst_stride) +{ + const int shift2 = 6; + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); + __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + __m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); + __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); + __m256i 
r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + __m256i r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); + __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r23_r34 = r45_r56; + r45_r56 = r67_r78; + r6 = r8; + r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); + r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); + r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); + __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); + + __m256i sum0123 = _mm256_add_epi32(dot01, dot23); + __m256i sum4567 = _mm256_add_epi32(dot45, dot67); + __m256i sum = _mm256_add_epi32(sum0123, sum4567); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_packs_epi32(sum, sum); + + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); + _mm_storel_epi64((__m128i *)dst_addr1, _mm256_extracti128_si256(sum, 1)); + } + } +} + static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, @@ -1289,8 +1373,8 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - int16_t hor_stride = LCU_WIDTH; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; + int16_t hor_stride = width; + ALIGNED(64) int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; // HORIZONTAL STEP __m256i shuf_01_23, shuf_45_67; @@ -1311,14 +1395,7 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons } // VERTICAL STEP - __m256i taps[4]; - kvz_init_ver_filter_taps(ver_fir, taps); - - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } From f5b0e3c52b394d6d72eb0a67decdc855c443dd84 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:01:40 +0200 Subject: [PATCH 03/19] Add new AVX2 horizontal ip filter capable of every luma PB --- src/strategies/avx2/ipol-avx2.c | 130 +++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 28 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 1a491407..6332f652 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1267,6 +1267,101 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } } +static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, + int width, + int height, + kvz_pixel *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) { + __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + 
__m256i shuf23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + __m256i shuf45 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + __m256i shuf67 = _mm256_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); + + __m256i all_w01 = _mm256_set1_epi16(*(uint16_t *)(filter + 0)); + __m256i all_w23 = _mm256_set1_epi16(*(uint16_t *)(filter + 2)); + __m256i all_w45 = _mm256_set1_epi16(*(uint16_t *)(filter + 4)); + __m256i all_w67 = _mm256_set1_epi16(*(uint16_t *)(filter + 6)); + + int y_offset = -KVZ_LUMA_FILTER_OFFSET; + int x_offset = -KVZ_LUMA_FILTER_OFFSET; + + kvz_pixel *top_left = src + src_stride * y_offset + x_offset; + + int y = 0; + int x = 0; + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + for (x = 0; x + 7 < width; x += 8) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i*)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i*)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + + __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storeu_si128(dst_r0, sum_r0); + _mm_storeu_si128(dst_r1, sum_r1); + } + } + + if (x < width) { + for (int y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i *)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i *)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + + __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64(dst_r0, sum_r0); + _mm_storel_epi64(dst_r1, sum_r1); + } + } +} + static void 
kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, int width, int height, @@ -1362,40 +1457,19 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { - kvz_sample_14bit_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } // TODO: horizontal and vertical only filtering int x, y; int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; + + // Buffer for intermediate values with one extra row + // because the loop writes two rows each iteration. + ALIGNED(64) int16_t hor_filtered[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - int16_t hor_stride = width; - ALIGNED(64) int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - - // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } - - // VERTICAL STEP - kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); + kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_filtered, hor_stride); + kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_filtered, hor_stride, dst, dst_stride); } From 2175023843faa8b2a0e75c434fb965614c60e62b Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:12:39 +0200 Subject: [PATCH 04/19] Relocate function --- src/strategies/avx2/ipol-avx2.c | 109 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 6332f652..fd7182b6 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1213,60 +1213,6 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } } -static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, - int16_t src_stride, - int width, - int height, - kvz_pixel *dst, - int16_t dst_stride, - int8_t hor_flag, - int8_t ver_flag, - const int16_t mv[2]) -{ - // TODO: Optimize SMP and AMP - if (width != height) { - kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } - - int x, y; - - int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; - int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - - int16_t hor_stride = LCU_WIDTH; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - - // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, 
- &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } - - // VERTICAL STEP - __m256i taps[4]; - kvz_init_ver_filter_taps(ver_fir, taps); - - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } -} - static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, int width, int height, @@ -1446,6 +1392,61 @@ int16_t dst_stride) } } +static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) +{ + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + + int x, y; + + int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; + + int16_t hor_stride = LCU_WIDTH; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; + + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; + + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { + + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } + } + + // VERTICAL STEP + __m256i taps[4]; + kvz_init_ver_filter_taps(ver_fir, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); + } + } +} + + static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, From 9e4b62a891bbaa3db78cb9af6286ac68b6577b3c Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:14:28 +0200 Subject: [PATCH 05/19] Use the new horizontal filter for pixel precision as well --- src/strategies/avx2/ipol-avx2.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index fd7182b6..8431cdb1 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1414,26 +1414,12 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; + // Buffer for intermediate values with one extra row + // because the loop writes two rows each iteration. 
+ ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; int16_t hor_stride = LCU_WIDTH; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); // VERTICAL STEP __m256i taps[4]; From e572066e46d4ef7900fd8106ecc3caf0c5199f50 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:53:15 +0200 Subject: [PATCH 06/19] Add new AVX2 vertical ip filter for pixel precision --- src/strategies/avx2/ipol-avx2.c | 593 +++++++++++++++++--------------- 1 file changed, 308 insertions(+), 285 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 8431cdb1..44ffdac4 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -609,6 +609,277 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } +static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, + int width, + int height, + kvz_pixel *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) { + __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i shuf23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + __m256i shuf45 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + __m256i shuf67 = _mm256_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); + + __m256i all_w01 = _mm256_set1_epi16(*(uint16_t *)(filter + 0)); + __m256i all_w23 = _mm256_set1_epi16(*(uint16_t *)(filter + 2)); + __m256i all_w45 = _mm256_set1_epi16(*(uint16_t *)(filter + 4)); + __m256i all_w67 = _mm256_set1_epi16(*(uint16_t *)(filter + 6)); + + int y_offset = -KVZ_LUMA_FILTER_OFFSET; + int x_offset = -KVZ_LUMA_FILTER_OFFSET; + + kvz_pixel *top_left = src + src_stride * y_offset + x_offset; + + int y = 0; + int x = 0; + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + for (x = 0; x + 7 < width; x += 8) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i*)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i*)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + 
+ __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storeu_si128(dst_r0, sum_r0); + _mm_storeu_si128(dst_r1, sum_r1); + } + } + + if (x < width) { + for (int y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0 = _mm_loadu_si128((__m128i *)(chunk_ptr + 0 * src_stride)); + __m128i r1 = _mm_loadu_si128((__m128i *)(chunk_ptr + 1 * src_stride)); + __m256i r0_r1 = _mm256_castsi128_si256(r0); + r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); + __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); + __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); + __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); + + __m256i sum0123 = _mm256_add_epi16(dot01, dot23); + __m256i sum4567 = _mm256_add_epi16(dot45, dot67); + __m256i sum = _mm256_add_epi16(sum0123, sum4567); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i sum_r0 = _mm256_castsi256_si128(sum); + __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64(dst_r0, sum_r0); + _mm_storel_epi64(dst_r1, sum_r1); + } + } +} + +static void kvz_ipol_8tap_ver_im_px_avx2(uint8_t *filter, + int width, + int height, + int16_t *src, + int16_t src_stride, + kvz_pixel *dst, + int16_t dst_stride) +{ + // Interpolation filter shifts + int32_t shift2 = 6; + + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); + __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + 
__m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); + __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); + __m256i r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + __m256i r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); + __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r23_r34 = r45_r56; + r45_r56 = r67_r78; + r6 = r8; + r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); + r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); + r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); + __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); + + __m256i sum0123 = _mm256_add_epi32(dot01, dot23); + __m256i sum4567 = _mm256_add_epi32(dot45, dot67); + __m256i sum = _mm256_add_epi32(sum0123, sum4567); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_add_epi32(sum, _mm256_set1_epi32(wp_offset1)); + sum = _mm256_srai_epi32(sum, wp_shift1); + sum = _mm256_packs_epi32(sum, sum); + sum = _mm256_packus_epi16(sum, sum); + + kvz_pixel *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + kvz_pixel *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + *(uint32_t*)dst_addr0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); + *(uint32_t*)dst_addr1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sum, 1)); + } + } +} + +static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, +int width, +int height, +int16_t *src, +int16_t src_stride, +int16_t *dst, +int16_t dst_stride) +{ + const int shift2 = 6; + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); + __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + __m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); + __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); + __m256i r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + __m256i r7_r8 = 
_mm256_blend_epi32(r7, r8, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); + __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r23_r34 = r45_r56; + r45_r56 = r67_r78; + r6 = r8; + r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); + r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); + r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); + r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); + + r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); + __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); + + __m256i sum0123 = _mm256_add_epi32(dot01, dot23); + __m256i sum4567 = _mm256_add_epi32(dot45, dot67); + __m256i sum = _mm256_add_epi32(sum0123, sum4567); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_packs_epi32(sum, sum); + + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); + _mm_storel_epi64((__m128i *)dst_addr1, _mm256_extracti128_si256(sum, 1)); + } + } +} + static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, @@ -675,16 +946,7 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); kvz_init_filter_taps(fir2, &taps_01_23, &taps_45_67); - for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos2[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(fir2, width, height + 1, src + 1, src_stride, hor_pos2, hor_stride); // Write the first column in contiguous memory x = 0; @@ -704,12 +966,9 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_ver_filter_taps(fir0, taps); // Right - for (y = 0; y + 7 < height; y+=8) { - - for (x = 0; x + 7 < width ; x+=8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[(y + 1) * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); - } - } + int16_t *im = &hor_pos2[hor_stride]; + kvz_pixel *dst = out_r; + kvz_ipol_8tap_ver_im_px_avx2(fir0, width, height, im, hor_stride, dst, dst_stride); // Left // Copy from the right filtered block and filter the extra column @@ -725,11 +984,9 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_ver_filter_taps(fir2, taps); // Top - for (y = 0; y + 7 < height; y+=8) { - for (x = 0; x + 7 < width; x+=8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos0[y * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); - } - } + im = hor_pos0; + dst = out_t; + kvz_ipol_8tap_ver_im_px_avx2(fir2, width, height, im, hor_stride, dst, dst_stride); // Bottom // Copy what can be copied from the top filtered values. 
@@ -782,11 +1039,9 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco __m256i taps[4]; kvz_init_ver_filter_taps(fir2, taps); // Top-Right - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[y * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); - } - } + int16_t *im = hor_pos2; + kvz_pixel *dst = out_tr; + kvz_ipol_8tap_ver_im_px_avx2(fir2, width, height, im, hor_stride, dst, dst_stride); // Top-left // Copy from the top-right filtered block and filter the extra column @@ -885,17 +1140,7 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_init_filter_taps(hor_fir_l, &taps_01_23, &taps_45_67); int sample_off_y = hpel_off_y < 0 ? 0 : 1; - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_l[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(hor_fir_l, width, height + 1, src + 1, src_stride, hor_pos_l, hor_stride); // Write the first column in contiguous memory x = 0; @@ -907,17 +1152,7 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Right QPEL kvz_init_filter_taps(hor_fir_r, &taps_01_23, &taps_45_67); - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - - for (x = 0; x + 7 < width; x += 8) { - int ypos = y - KVZ_LUMA_FILTER_OFFSET; - int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; - kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_r[y * hor_stride + x], - &shuf_01_23, &shuf_45_67, - &taps_01_23, &taps_45_67); //TODO: >> shift1 - } - } + kvz_ipol_8tap_hor_px_im_avx2(hor_fir_r, width, height + 1, src + 1, src_stride, hor_pos_r, hor_stride); // Write the first column in contiguous memory x = 0; @@ -944,12 +1179,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_l, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + sample_off_y; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_l[y * dst_stride + x], dst_stride); - } - } + int16_t *im = &hor_pos_l[sample_off_y * hor_stride]; + kvz_pixel *dst = out_l; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_l, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_l) { for (y = 0; y < height; ++y) { @@ -972,12 +1204,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_r, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + sample_off_y; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_r[sample_off_y * hor_stride]; + dst = out_r; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_r, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_r) { for (y = 0; y < height; ++y) { @@ -1002,12 +1231,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); kvz_init_ver_filter_taps(ver_fir_t, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_t; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); - } - } + im = &hor_hpel_pos[off_y_fir_t * hor_stride]; + dst = out_t; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); if (!sample_off_x) { for (y = 0; y < height; ++y) { @@ -1030,12 +1256,9 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_b, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_b; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_b[y * dst_stride + x], dst_stride); - } - } + im = &hor_hpel_pos[off_y_fir_b * hor_stride]; + dst = out_b; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); if (!sample_off_x) { for (y = 0; y < height; ++y) { @@ -1107,12 +1330,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_t, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_t; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_tl[y * dst_stride + x], dst_stride); - } - } + int16_t *im = &hor_pos_l[off_y_fir_t * hor_stride]; + kvz_pixel *dst = out_tl; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_l) { for (y = 0; y < height; ++y) { @@ -1134,12 +1354,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Top-right QPEL // Filter block and then filter column and align if neccessary - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_t; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_r[off_y_fir_t * hor_stride]; + dst = out_tr; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_r) { for (y = 0; y < height; ++y) { @@ -1162,12 +1379,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Filter block and then filter column and align if neccessary kvz_init_ver_filter_taps(ver_fir_b, taps); - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_b; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_bl[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_l[off_y_fir_b * hor_stride]; + dst = out_bl; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_l) { for (y = 0; y < height; ++y) { @@ -1188,12 +1402,9 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Bottom-right QPEL // Filter block and then filter column and align if neccessary - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - int ypos = y + off_y_fir_b; - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride 
+ x], hor_stride, &out_br[y * dst_stride + x], dst_stride); - } - } + im = &hor_pos_r[off_y_fir_b * hor_stride]; + dst = out_br; + kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); if (!off_x_fir_r) { for (y = 0; y < height; ++y) { @@ -1213,185 +1424,6 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco } } -static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, - int width, - int height, - kvz_pixel *src, - int16_t src_stride, - int16_t *dst, - int16_t dst_stride) { - __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); - __m256i shuf23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); - __m256i shuf45 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); - __m256i shuf67 = _mm256_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); - - __m256i all_w01 = _mm256_set1_epi16(*(uint16_t *)(filter + 0)); - __m256i all_w23 = _mm256_set1_epi16(*(uint16_t *)(filter + 2)); - __m256i all_w45 = _mm256_set1_epi16(*(uint16_t *)(filter + 4)); - __m256i all_w67 = _mm256_set1_epi16(*(uint16_t *)(filter + 6)); - - int y_offset = -KVZ_LUMA_FILTER_OFFSET; - int x_offset = -KVZ_LUMA_FILTER_OFFSET; - - kvz_pixel *top_left = src + src_stride * y_offset + x_offset; - - int y = 0; - int x = 0; - - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { - - for (x = 0; x + 7 < width; x += 8) { - - kvz_pixel *chunk_ptr = top_left + src_stride * y + x; - __m128i r0 = _mm_loadu_si128((__m128i*)(chunk_ptr + 0 * src_stride)); - __m128i r1 = _mm_loadu_si128((__m128i*)(chunk_ptr + 1 * src_stride)); - __m256i r0_r1 = _mm256_castsi128_si256(r0); - r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); - - __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); - __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); - __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); - __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); - - __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); - __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); - __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); - __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); - - __m256i sum0123 = _mm256_add_epi16(dot01, dot23); - __m256i sum4567 = _mm256_add_epi16(dot45, dot67); - __m256i sum = _mm256_add_epi16(sum0123, sum4567); - - __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); - __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); - __m128i sum_r0 = _mm256_castsi256_si128(sum); - __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); - _mm_storeu_si128(dst_r0, sum_r0); - _mm_storeu_si128(dst_r1, sum_r1); - } - } - - if (x < width) { - for (int y = 0; y < height + KVZ_EXT_PADDING_LUMA; y += 2) { - - kvz_pixel *chunk_ptr = top_left + src_stride * y + x; - __m128i r0 = _mm_loadu_si128((__m128i *)(chunk_ptr + 0 * src_stride)); - __m128i r1 = _mm_loadu_si128((__m128i *)(chunk_ptr + 1 * src_stride)); - __m256i r0_r1 = _mm256_castsi128_si256(r0); - r0_r1 = _mm256_inserti128_si256(r0_r1, r1, 1); - - __m256i r0_r1_01 = _mm256_shuffle_epi8(r0_r1, shuf01); - __m256i r0_r1_23 = _mm256_shuffle_epi8(r0_r1, shuf23); - __m256i r0_r1_45 = _mm256_shuffle_epi8(r0_r1, shuf45); - __m256i r0_r1_67 = _mm256_shuffle_epi8(r0_r1, shuf67); - - __m256i dot01 = 
_mm256_maddubs_epi16(r0_r1_01, all_w01); - __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); - __m256i dot45 = _mm256_maddubs_epi16(r0_r1_45, all_w45); - __m256i dot67 = _mm256_maddubs_epi16(r0_r1_67, all_w67); - - __m256i sum0123 = _mm256_add_epi16(dot01, dot23); - __m256i sum4567 = _mm256_add_epi16(dot45, dot67); - __m256i sum = _mm256_add_epi16(sum0123, sum4567); - - __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); - __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); - __m128i sum_r0 = _mm256_castsi256_si128(sum); - __m128i sum_r1 = _mm256_extracti128_si256(sum, 1); - _mm_storel_epi64(dst_r0, sum_r0); - _mm_storel_epi64(dst_r1, sum_r1); - } - } -} - -static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, -int width, -int height, -int16_t *src, -int16_t src_stride, -int16_t *dst, -int16_t dst_stride) -{ - const int shift2 = 6; - - __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); - __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); - __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); - __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); - __m256i all_w45 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(2, 2, 2, 2)); - __m256i all_w67 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(3, 3, 3, 3)); - - for (int x = 0; x + 3 < width; x += 4) { - - int16_t *strip_ptr = src + 0 * src_stride + x; - - // Initial values - // Broadcasted rows in both lanes - // __m256i r0; // Unused - // __m256i r1; // Unused - __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); - __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); - __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); - __m256i r5 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); - __m256i r6 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); - __m256i r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 5 * src_stride)); - __m256i r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 6 * src_stride)); - - // Consecutive rows in low and high lanes - // __m256i r0_r1; // Unused - // __m256i r1_r2; // Unused - __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); - __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); - __m256i r4_r5 = _mm256_blend_epi32(r4, r5, 0xF0); - __m256i r5_r6 = _mm256_blend_epi32(r5, r6, 0xF0); - __m256i r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); - __m256i r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); - - // Paired samples of consecutive rows - __m256i r01_r12; - __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); - __m256i r45_r56 = _mm256_unpacklo_epi16(r4_r5, r5_r6); - __m256i r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); - - for (int y = 0; y < height; y += 2) { - - strip_ptr = src + y * src_stride + x; - - // Slide window - r01_r12 = r23_r34; - r23_r34 = r45_r56; - r45_r56 = r67_r78; - r6 = r8; - r7 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 7 * src_stride)); - r8 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 8 * src_stride)); - r6_r7 = _mm256_blend_epi32(r6, r7, 0xF0); - r7_r8 = _mm256_blend_epi32(r7, r8, 0xF0); - - r67_r78 = _mm256_unpacklo_epi16(r6_r7, r7_r8); - - __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); - __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); - __m256i dot45 = _mm256_madd_epi16(r45_r56, all_w45); - __m256i dot67 = _mm256_madd_epi16(r67_r78, all_w67); - - __m256i sum0123 = _mm256_add_epi32(dot01, dot23); - __m256i sum4567 = _mm256_add_epi32(dot45, dot67); - __m256i sum = _mm256_add_epi32(sum0123, 
sum4567); - sum = _mm256_srai_epi32(sum, shift2); - sum = _mm256_packs_epi32(sum, sum); - - int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; - int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; - _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); - _mm_storel_epi64((__m128i *)dst_addr1, _mm256_extracti128_si256(sum, 1)); - } - } -} - static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, @@ -1420,16 +1452,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco int16_t hor_stride = LCU_WIDTH; kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); - - // VERTICAL STEP - __m256i taps[4]; - kvz_init_ver_filter_taps(ver_fir, taps); - - for (y = 0; y + 7 < height; y += 8) { - for (x = 0; x + 7 < width; x += 8) { - kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_8tap_ver_im_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } From 3476fc62c7ddfa06139d3bce8c9f7dcce5b569fa Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 21:59:05 +0200 Subject: [PATCH 07/19] Fix parameter to signed --- src/strategies/avx2/ipol-avx2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 44ffdac4..27fc1fde 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -609,7 +609,7 @@ INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *fil } -static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, +static void kvz_ipol_8tap_hor_px_im_avx2(int8_t *filter, int width, int height, kvz_pixel *src, @@ -704,7 +704,7 @@ static void kvz_ipol_8tap_hor_px_im_avx2(uint8_t *filter, } } -static void kvz_ipol_8tap_ver_im_px_avx2(uint8_t *filter, +static void kvz_ipol_8tap_ver_im_px_avx2(int8_t *filter, int width, int height, int16_t *src, @@ -796,7 +796,7 @@ static void kvz_ipol_8tap_ver_im_px_avx2(uint8_t *filter, } } -static void kvz_ipol_8tap_ver_im_hi_avx2(uint8_t *filter, +static void kvz_ipol_8tap_ver_im_hi_avx2(int8_t *filter, int width, int height, int16_t *src, From 7e6ba9750f89ed444550b50f8839c5ea8962edf1 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Tue, 2 Mar 2021 22:17:50 +0200 Subject: [PATCH 08/19] Add new AVX2 ip filters for chroma --- src/strategies/avx2/ipol-avx2.c | 331 ++++++++++++++++++++++---------- 1 file changed, 231 insertions(+), 100 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 27fc1fde..22ea3641 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -872,6 +872,210 @@ int16_t dst_stride) sum = _mm256_srai_epi32(sum, shift2); sum = _mm256_packs_epi32(sum, sum); + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + _mm_storel_epi64((__m128i*)dst_addr0, _mm256_castsi256_si128(sum)); + _mm_storel_epi64((__m128i*)dst_addr1, _mm256_extracti128_si256(sum, 1)); + } + } +} + +static void kvz_ipol_4tap_hor_px_hi_avx2(int8_t *filter, + int width, + int height, + kvz_pixel *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) { + + __m256i shuf01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, + 8, 9, 9, 10, 10, 11, 11, 12, + 0, 1, 1, 2, 2, 3, 3, 4, + 8, 9, 9, 10, 10, 11, 11, 12); + + __m256i shuf23 = 
_mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, + 10, 11, 11, 12, 12, 13, 13, 14, + 2, 3, 3, 4, 4, 5, 5, 6, + 10, 11, 11, 12, 12, 13, 13, 14); + + __m256i all_w01 = _mm256_set1_epi16(*(uint16_t*)(filter + 0)); + __m256i all_w23 = _mm256_set1_epi16(*(uint16_t*)(filter + 2)); + + int y_offset = -KVZ_CHROMA_FILTER_OFFSET; + int x_offset = -KVZ_CHROMA_FILTER_OFFSET; + + kvz_pixel *top_left = src + src_stride * y_offset + x_offset; + + int y = 0; + int x = 0; + + for (y = 0; y < height + KVZ_EXT_PADDING_CHROMA; y += 4) { + + for (x = 0; x + 3 < width; x += 4) { + + kvz_pixel *chunk_ptr = top_left + src_stride * y + x; + __m128i r0r1 = _mm_loadl_epi64((__m128i*)(chunk_ptr + 0 * src_stride)); + __m128i r2r3 = _mm_loadl_epi64((__m128i*)(chunk_ptr + 2 * src_stride)); + r0r1 = _mm_insert_epi64(r0r1, *(uint64_t*)(chunk_ptr + 1 * src_stride), 1); + r2r3 = _mm_insert_epi64(r2r3, *(uint64_t*)(chunk_ptr + 3 * src_stride), 1); + + __m256i r0r1_r2r3 = _mm256_castsi128_si256(r0r1); + r0r1_r2r3 = _mm256_inserti128_si256(r0r1_r2r3, r2r3, 1); + + __m256i r0_r1_01 = _mm256_shuffle_epi8(r0r1_r2r3, shuf01); + __m256i r0_r1_23 = _mm256_shuffle_epi8(r0r1_r2r3, shuf23); + + __m256i dot01 = _mm256_maddubs_epi16(r0_r1_01, all_w01); + __m256i dot23 = _mm256_maddubs_epi16(r0_r1_23, all_w23); + + __m256i sum = _mm256_add_epi16(dot01, dot23); + + __m128i *dst_r0 = (__m128i*)(dst + (y + 0) * dst_stride + x); + __m128i *dst_r1 = (__m128i*)(dst + (y + 1) * dst_stride + x); + __m128i *dst_r2 = (__m128i*)(dst + (y + 2) * dst_stride + x); + __m128i *dst_r3 = (__m128i*)(dst + (y + 3) * dst_stride + x); + __m128i sum_r0r1 = _mm256_castsi256_si128(sum); + __m128i sum_r2r3 = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64(dst_r0, sum_r0r1); + _mm_storeh_pd((double*)dst_r1, _mm_castsi128_pd(sum_r0r1)); + _mm_storel_epi64(dst_r2, sum_r2r3); + _mm_storeh_pd((double*)dst_r3, _mm_castsi128_pd(sum_r2r3)); + } + } +} + +static void kvz_ipol_4tap_ver_hi_px_avx2(int8_t *filter, + int width, + int height, + int16_t *src, + int16_t src_stride, + kvz_pixel *dst, + int16_t dst_stride) +{ + // Interpolation filter shifts + int32_t shift2 = 6; + + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t*)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 2 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r2 = r4; + r3 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr + 3 * src_stride)); + r4 = _mm256_set1_epi64x(*(uint64_t*)(strip_ptr 
+ 4 * src_stride)); + r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + + __m256i sum = _mm256_add_epi32(dot01, dot23); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_add_epi32(sum, _mm256_set1_epi32(wp_offset1)); + sum = _mm256_srai_epi32(sum, wp_shift1); + sum = _mm256_packs_epi32(sum, sum); + sum = _mm256_packus_epi16(sum, sum); + + kvz_pixel *dst_addr0 = &dst[(y + 0) * dst_stride + x]; + kvz_pixel *dst_addr1 = &dst[(y + 1) * dst_stride + x]; + *(uint32_t*)dst_addr0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); + *(uint32_t*)dst_addr1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sum, 1)); + } + } +} + +static void kvz_ipol_4tap_ver_hi_hi_avx2(int8_t *filter, + int width, + int height, + int16_t *src, + int16_t src_stride, + int16_t *dst, + int16_t dst_stride) +{ + const int shift2 = 6; + + __m128i weights_8b = _mm_set1_epi64x(*(uint64_t *)filter); + __m256i weights_16b = _mm256_cvtepi8_epi16(weights_8b); + __m256i all_w01 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i all_w23 = _mm256_shuffle_epi32(weights_16b, _MM_SHUFFLE(1, 1, 1, 1)); + + for (int x = 0; x + 3 < width; x += 4) { + + int16_t *strip_ptr = src + 0 * src_stride + x; + + // Initial values + // Broadcasted rows in both lanes + // __m256i r0; // Unused + // __m256i r1; // Unused + __m256i r2 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 0 * src_stride)); + __m256i r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 1 * src_stride)); + __m256i r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 2 * src_stride)); + + // Consecutive rows in low and high lanes + // __m256i r0_r1; // Unused + // __m256i r1_r2; // Unused + __m256i r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + __m256i r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + // Paired samples of consecutive rows + __m256i r01_r12; + __m256i r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + for (int y = 0; y < height; y += 2) { + + strip_ptr = src + y * src_stride + x; + + // Slide window + r01_r12 = r23_r34; + r2 = r4; + r3 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 3 * src_stride)); + r4 = _mm256_set1_epi64x(*(uint64_t *)(strip_ptr + 4 * src_stride)); + r2_r3 = _mm256_blend_epi32(r2, r3, 0xF0); + r3_r4 = _mm256_blend_epi32(r3, r4, 0xF0); + + r23_r34 = _mm256_unpacklo_epi16(r2_r3, r3_r4); + + __m256i dot01 = _mm256_madd_epi16(r01_r12, all_w01); + __m256i dot23 = _mm256_madd_epi16(r23_r34, all_w23); + + __m256i sum = _mm256_add_epi32(dot01, dot23); + sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_packs_epi32(sum, sum); + int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; int16_t *dst_addr1 = &dst[(y + 1) * dst_stride + x]; _mm_storel_epi64((__m128i *)dst_addr0, _mm256_castsi256_si128(sum)); @@ -1441,8 +1645,6 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco return; } - int x, y; - int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; @@ -1468,22 +1670,20 @@ static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * cons const int16_t mv[2]) { // TODO: horizontal and vertical only filtering - int x, y; - int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; // Buffer for intermediate values with one extra row // because the loop writes two rows each iteration. 
- ALIGNED(64) int16_t hor_filtered[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; int16_t hor_stride = LCU_WIDTH; - kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_filtered, hor_stride); - kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_filtered, hor_stride, dst, dst_stride); + kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_8tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, +static void kvz_sample_octpel_chroma_avx2(const encoder_control_t *const encoder, kvz_pixel *src, int16_t src_stride, int width, @@ -1494,118 +1694,49 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encode int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { + // TODO: Optimizations for rest of the blocks (for example 2x8). + if (width % 4 != 0) { kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } - int x, y; - int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; + // Buffer for intermediate values with 3 extra rows + // because the loop writes four rows each iteration. + ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - // HORIZONTAL STEP - __m256i shuf_01, shuf_23; - __m256i taps_01, taps_23; - - kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); - kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); - - for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01, &shuf_23, - &taps_01, &taps_23); //TODO: >> shift1 - } - } - - __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); - - int rows = 3; - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01_23, &taps_01_23, - rows); //TODO: >> shift1 - } - - // VERTICAL STEP - for (y = 0; y + 3 < height; y += 4) { - for (x = 0; x + 3 < width; x += 4) { - kvz_four_tap_filter_ver_16bit_4x4_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_hi_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, - kvz_pixel *src, - int16_t src_stride, - int width, - int height, - int16_t *dst, - int16_t dst_stride, - int8_t hor_flag, - int8_t ver_flag, +static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const encoder, + kvz_pixel *src, + int16_t 
src_stride, + int width, + int height, + int16_t *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { + // TODO: Optimizations for rest of the blocks (for example 2x8). + if (width % 4 != 0) { kvz_sample_14bit_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } - // TODO: horizontal and vertical only filtering - int x, y; - int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; + // Buffer for intermediate values with 3 extra rows + // because the loop writes four rows each iteration. + ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - // HORIZONTAL STEP - __m256i shuf_01, shuf_23; - __m256i taps_01, taps_23; - - kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); - kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); - - for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01, &shuf_23, - &taps_01, &taps_23); //TODO: >> shift1 - } - } - - __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); - - int rows = 3; - for (x = 0; x + 3 < width; x += 4) { - int ypos = y - KVZ_CHROMA_FILTER_OFFSET; - int xpos = x - KVZ_CHROMA_FILTER_OFFSET; - kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, - &shuf_01_23, &taps_01_23, - rows); //TODO: >> shift1 - } - - // VERTICAL STEP - for (y = 0; y + 3 < height; y += 4) { - for (x = 0; x + 3 < width; x += 4) { - kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); - } - } + kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_hi_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } #endif //COMPILE_INTEL_AVX2 From e38219e489d490535b0e2b01a2d93bd1594637bc Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 5 Mar 2021 18:14:27 +0200 Subject: [PATCH 09/19] Fix epol_func signature and function definition --- src/strategies/generic/ipol-generic.c | 44 +++++++++++++-------------- src/strategies/strategies-ipol.h | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 75c0eee9..b9024c98 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -728,34 +728,34 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco } -void kvz_get_extended_block_generic(kvz_epol_args args) { +void kvz_get_extended_block_generic(kvz_epol_args *args) { - int min_y = args.blk_y - args.pad_t; - int max_y = args.blk_y + args.blk_h + args.pad_b - 1; - bool out_of_bounds_y = (min_y < 0) || (max_y >= args.src_h); + int min_y = args->blk_y - args->pad_t; + int max_y = args->blk_y + args->blk_h + args->pad_b - 1; + bool out_of_bounds_y = 
(min_y < 0) || (max_y >= args->src_h); - int min_x = args.blk_x - args.pad_l; - int max_x = args.blk_x + args.blk_w + args.pad_r - 1; - bool out_of_bounds_x = (min_x < 0) || (max_x >= args.src_w); + int min_x = args->blk_x - args->pad_l; + int max_x = args->blk_x + args->blk_w + args->pad_r - 1; + bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w); if (out_of_bounds_y || out_of_bounds_x) { - *args.ext = args.buf; - *args.ext_s = args.pad_l + args.blk_w + args.pad_r; - *args.ext_origin = args.buf + args.pad_t * (*args.ext_s) + args.pad_l; + *args->ext = args->buf; + *args->ext_s = args->pad_l + args->blk_w + args->pad_r; + *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l; - int cnt_l = CLIP(0, *args.ext_s, -min_x); - int cnt_r = CLIP(0, *args.ext_s, max_x - (args.src_w - 1)); - int cnt_m = CLIP(0, *args.ext_s, *args.ext_s - cnt_l - cnt_r); + int cnt_l = CLIP(0, *args->ext_s, -min_x); + int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1)); + int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r); // For each row including padding - for (int y = -args.pad_t; y < args.blk_h + args.pad_b; ++y) { + for (int y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { - int clipped_y = CLIP(0, args.src_h - 1, args.blk_y + y); - kvz_pixel sample_l = *(args.src + clipped_y * args.src_s); - kvz_pixel sample_r = *(args.src + clipped_y * args.src_s + args.src_w - 1); - kvz_pixel *src_m = args.src + clipped_y * args.src_s + MAX(min_x, 0); - kvz_pixel *dst_l = args.buf + (y + args.pad_t) * (*args.ext_s); + int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y); + kvz_pixel sample_l = *(args->src + clipped_y * args->src_s); + kvz_pixel sample_r = *(args->src + clipped_y * args->src_s + args->src_w - 1); + kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0); + kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s); kvz_pixel *dst_m = dst_l + cnt_l; kvz_pixel *dst_r = dst_m + cnt_m; for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = sample_l; @@ -764,9 +764,9 @@ void kvz_get_extended_block_generic(kvz_epol_args args) { } } else { - *args.ext = args.src + (args.blk_y - args.pad_t) * args.src_s + (args.blk_x - args.pad_l); - *args.ext_origin = args.src + args.blk_y * args.src_s + args.blk_x; - *args.ext_s = args.src_s; + *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l); + *args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x; + *args->ext_s = args->src_s; } } diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index 0566507d..31d15bc4 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -64,7 +64,7 @@ typedef struct { int *ext_s; // Stride } kvz_epol_args; -typedef unsigned(epol_func)(kvz_epol_args *args); +typedef void(epol_func)(kvz_epol_args *args); typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); From d8e7aac380c584557157eb6147106a52fb6223a3 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 5 Mar 2021 18:50:00 +0200 Subject: [PATCH 10/19] Do not use nonstandard extension for struct initialization. 
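The warning being silenced is presumably the one some compilers (MSVC in
particular) emit when a designated initializer sets a struct member to the
address of an automatic variable; assigning those members after the
initializer avoids it without changing behavior. A minimal, self-contained
sketch of the pattern, using a stand-in struct rather than the real
kvz_epol_args:

  typedef struct {
    int blk_w;    /* plain runtime values can stay in the initializer */
    int *ext_s;   /* pointer to a local: assigned separately below */
  } example_args;

  void example(int block_width)
  {
    int ext_s = 0;

    example_args args = {
      .blk_w = block_width,
    };

    /* Initialize separately to avoid the nonstandard-extension warning. */
    args.ext_s = &ext_s;
    (void)args;
  }

Only the members that would otherwise take the address of a local need to
move out of the initializer; everything else stays where it was, as the
diff below does.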
--- src/image.c | 11 +++++++---- src/inter.c | 44 ++++++++++++++++++++++++++++---------------- src/search_inter.c | 11 +++++++---- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/image.c b/src/image.c index 9fa47c64..71e791dd 100644 --- a/src/image.c +++ b/src/image.c @@ -497,12 +497,15 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic, .pad_r = 0, .pad_t = 0, .pad_b = 0, - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x]; diff --git a/src/inter.c b/src/inter.c index f34a2a2a..11ed73ac 100644 --- a/src/inter.c +++ b/src/inter.c @@ -72,12 +72,15 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_quarterpel_luma(state->encoder_control, ext_origin, @@ -123,12 +126,15 @@ static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_14bit_quarterpel_luma(state->encoder_control, ext_origin, @@ -177,12 +183,15 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_octpel_chroma(state->encoder_control, ext_origin, @@ -245,12 +254,15 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. 
+ epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_sample_14bit_octpel_chroma(state->encoder_control, ext_origin, diff --git a/src/search_inter.c b/src/search_inter.c index 0fcd70c6..81e6fb92 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1032,12 +1032,15 @@ static void search_frac(inter_search_info_t *info) .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 - .buf = ext_buffer, - .ext = &ext, - .ext_origin = &ext_origin, - .ext_s = &ext_s }; + // Initialize separately. Gets rid of warning + // about using nonstandard extension. + epol_args.buf = ext_buffer; + epol_args.ext = &ext; + epol_args.ext_origin = &ext_origin; + epol_args.ext_s = &ext_s; + kvz_get_extended_block(&epol_args); kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x; From 563165146985c302fdd8d0ae74af696050cfc186 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 5 Mar 2021 18:31:32 +0200 Subject: [PATCH 11/19] Remove unused functions and variables --- src/strategies/avx2/ipol-avx2.c | 503 -------------------------------- 1 file changed, 503 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 22ea3641..987461c6 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -56,111 +56,6 @@ static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) return filtered; } -static void kvz_init_shuffle_masks(__m256i *shuf_01_23, __m256i *shuf_45_67) { - // Shuffle pairs - *shuf_01_23 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); - *shuf_45_67 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); -} - -static void kvz_init_shuffle_masks_chroma(__m256i *shuf_01, __m256i *shuf_23) { - // Shuffle pairs - *shuf_01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12, - 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12); - *shuf_23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14, - 2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14); -} - -static void kvz_init_filter_taps(int8_t *filter, - __m256i *taps_01_23, __m256i *taps_45_67) { - // Filter weights - __m256i all_taps = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter)); - __m256i perm_01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); - __m256i perm_23 = _mm256_setr_epi32(2, 2, 2, 2, 3, 3, 3, 3); - all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); - *taps_01_23 = _mm256_permutevar8x32_epi32(all_taps, perm_01); - *taps_45_67 = _mm256_permutevar8x32_epi32(all_taps, perm_23); -} - -static void kvz_init_filter_taps_chroma(int8_t *filter, - __m256i *taps_01, __m256i *taps_23) { - // Filter weights - __m256i all_taps = _mm256_set1_epi32(*(int32_t*)filter); - all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); - *taps_01 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); - *taps_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); -} - -static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { - for (int i = 0; i < 4; ++i) filters[i] = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&filter[2 * i])); - filters[0] = _mm256_inserti128_si256(filters[0], _mm256_castsi256_si128(filters[3]), 1); // Pairs 01 67 - 
filters[1] = _mm256_inserti128_si256(filters[1], _mm256_castsi256_si128(filters[0]), 1); // Pairs 23 01 - filters[2] = _mm256_inserti128_si256(filters[2], _mm256_castsi256_si128(filters[1]), 1); // Pairs 45 23 - filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 -} - -static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, - __m256i *shuf_01_23, __m256i *shuf_45_67, - __m256i *taps_01_23, __m256i *taps_45_67) { - - __m256i row = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)data)); - - __m256i pairs_01_23 = _mm256_shuffle_epi8(row, *shuf_01_23); - __m256i pairs_45_67 = _mm256_shuffle_epi8(row, *shuf_45_67); - - __m256i temp0 = _mm256_maddubs_epi16(pairs_01_23, *taps_01_23); - __m256i temp1 = _mm256_maddubs_epi16(pairs_45_67, *taps_45_67); - - __m256i sum = _mm256_add_epi16(temp0, temp1); - __m128i filtered = _mm_add_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - _mm_storeu_si128((__m128i*)out, filtered); -} - -static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, - __m256i *shuf_01, __m256i *shuf_23, - __m256i *taps_01, __m256i *taps_23) { - - __m256i four_rows = _mm256_setr_epi64x( - *(int64_t*)&data[0 * stride], - *(int64_t*)&data[1 * stride], - *(int64_t*)&data[2 * stride], - *(int64_t*)&data[3 * stride]); - - __m256i pairs_l = _mm256_shuffle_epi8(four_rows, *shuf_01); - __m256i pairs_r = _mm256_shuffle_epi8(four_rows, *shuf_23); - - __m256i temp_l = _mm256_maddubs_epi16(pairs_l, *taps_01); - __m256i temp_r = _mm256_maddubs_epi16(pairs_r, *taps_23); - - __m256i sum = _mm256_add_epi16(temp_l, temp_r); - - __m128i lower = _mm256_castsi256_si128(sum); - __m128i upper = _mm256_extracti128_si256(sum, 1); - _mm_storel_epi64((__m128i*)(out + 0 * out_stride), lower); - _mm_storeh_pd((double*)(out + 1 * out_stride), _mm_castsi128_pd(lower)); - _mm_storel_epi64((__m128i*)(out + 2 * out_stride), upper); - _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); -} - -static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, - __m256i *shuf_01_23, __m256i *taps_01_23, - int rows) { - - for (int i = 0; i < rows; ++i) { - __m256i row = _mm256_set1_epi64x(*(int64_t*)&data[i * stride]); - - __m256i pairs_l_r = _mm256_shuffle_epi8(row, *shuf_01_23); - __m256i temp_l_r = _mm256_maddubs_epi16(pairs_l_r, *taps_01_23); - - __m128i temp_l = _mm256_castsi256_si128(temp_l_r); - __m128i temp_r = _mm256_extracti128_si256(temp_l_r, 1); - __m128i sum = _mm_add_epi16(temp_l, temp_r); - - _mm_storel_epi64((__m128i*)(out + i * out_stride), sum); - } -} - static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) { __m128i fir = _mm_loadl_epi64((__m128i*)filter); @@ -243,372 +138,6 @@ static void kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *dat _mm_storel_epi64((__m128i*)out, filtered); } -static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) -{ - // Interpolation filter shifts - int32_t shift2 = 6; - - // Weighted prediction offset and shift - int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; - int32_t wp_offset1 = 1 << (wp_shift1 - 1); - - // Filter weights - __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); - __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); - - 
__m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); - __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); - __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); - __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); - __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); - __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); - __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); - - __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); - __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); - __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); - __m128i temp23 = _mm_madd_epi16(pairs23, taps_23); - __m128i sum0123 = _mm_add_epi32(temp01, temp23); - - __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); - __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); - __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); - __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); - __m128i sum1234 = _mm_add_epi32(temp12, temp34); - - __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); - __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); - __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); - __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); - - __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); - __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); - __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); - __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); - - sum0123 = _mm_srai_epi32(sum0123, shift2); - sum1234 = _mm_srai_epi32(sum1234, shift2); - sum2345 = _mm_srai_epi32(sum2345, shift2); - sum3456 = _mm_srai_epi32(sum3456, shift2); - - __m128i offset = _mm_set1_epi32(wp_offset1); - sum0123 = _mm_add_epi32(sum0123, offset); - sum1234 = _mm_add_epi32(sum1234, offset); - sum2345 = _mm_add_epi32(sum2345, offset); - sum3456 = _mm_add_epi32(sum3456, offset); - - sum0123 = _mm_srai_epi32(sum0123, wp_shift1); - sum1234 = _mm_srai_epi32(sum1234, wp_shift1); - sum2345 = _mm_srai_epi32(sum2345, wp_shift1); - sum3456 = _mm_srai_epi32(sum3456, wp_shift1); - - __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); - __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); - __m128i filtered = _mm_packus_epi16(filtered01, filtered23); - - *(int32_t*)&out[0 * out_stride] = _mm_cvtsi128_si32(filtered); - *(int32_t*)&out[1 * out_stride] = _mm_extract_epi32(filtered, 1); - *(int32_t*)&out[2 * out_stride] = _mm_extract_epi32(filtered, 2); - *(int32_t*)&out[3 * out_stride] = _mm_extract_epi32(filtered, 3); -} - -static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int16_t *data, int16_t stride, int16_t *out, int16_t out_stride) -{ - int32_t shift2 = 6; - - // Filter weights - __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); - __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); - - __m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); - __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); - __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); - __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); - __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); - __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); - __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); - - __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); - __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); - __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); - __m128i temp23 = _mm_madd_epi16(pairs23, taps_23); - __m128i sum0123 = 
_mm_add_epi32(temp01, temp23); - - __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); - __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); - __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); - __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); - __m128i sum1234 = _mm_add_epi32(temp12, temp34); - - __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); - __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); - __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); - __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); - - __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); - __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); - __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); - __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); - - sum0123 = _mm_srai_epi32(sum0123, shift2); - sum1234 = _mm_srai_epi32(sum1234, shift2); - sum2345 = _mm_srai_epi32(sum2345, shift2); - sum3456 = _mm_srai_epi32(sum3456, shift2); - - __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); - __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); - - _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered01)); - _mm_storeh_pi((__m64*)&out[1 * out_stride], _mm_castsi128_ps(filtered01)); - _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered23)); - _mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); -} - -INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) -{ - // Interpolation filter shifts - int32_t shift2 = 6; - - // Weighted prediction offset and shift - int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; - int32_t wp_offset1 = 1 << (wp_shift1 - 1); - - __m256i pairs_lo, pairs_hi; - - // Filter 01 later with 67 - __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); - __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); - - __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); - __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br2, br3); - pairs_hi = _mm256_unpackhi_epi16(br2, br3); - __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); - __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br4, br5); - pairs_hi = _mm256_unpackhi_epi16(br4, br5); - __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); - __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br6, br7); - pairs_hi = _mm256_unpackhi_epi16(br6, br7); - __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br8 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); - __m256i br9 = 
_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br8, br9); - pairs_hi = _mm256_unpackhi_epi16(br8, br9); - // Filter rows02 later - __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); - __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br10, br11); - pairs_hi = _mm256_unpackhi_epi16(br10, br11); - __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - - // Deferred - __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r08, r19); - pairs_hi = _mm256_unpackhi_epi16(r08, r19); - __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 - __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); - __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); - - __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r412, r513); - pairs_hi = _mm256_unpackhi_epi16(r412, r513); - __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 - __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i accu02_lo, accu02_hi; - __m256i accu46_lo, accu46_hi; - - accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); - - accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); - - accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); - - accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); - - accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); - accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); - accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); - accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); - - __m256i offset = _mm256_set1_epi32(wp_offset1); - accu02_lo = _mm256_add_epi32(accu02_lo, offset); - accu02_hi = _mm256_add_epi32(accu02_hi, offset); - accu46_lo = _mm256_add_epi32(accu46_lo, offset); - accu46_hi = _mm256_add_epi32(accu46_hi, offset); - - accu02_lo = _mm256_srai_epi32(accu02_lo, wp_shift1); - accu02_hi = _mm256_srai_epi32(accu02_hi, wp_shift1); - accu46_lo = _mm256_srai_epi32(accu46_lo, wp_shift1); - accu46_hi = _mm256_srai_epi32(accu46_hi, wp_shift1); - - __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); - __m256i rows46 = _mm256_packs_epi32(accu46_lo, accu46_hi); - - __m256i filtered04_26 = _mm256_packus_epi16(rows02, rows46); - __m128i filtered04 = 
_mm256_castsi256_si128(filtered04_26); - __m128i filtered26 = _mm256_extracti128_si256(filtered04_26, 1); - - _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered04)); - _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered26)); - _mm_storeh_pi((__m64*)&out[4 * out_stride], _mm_castsi128_ps(filtered04)); - _mm_storeh_pi((__m64*)&out[6 * out_stride], _mm_castsi128_ps(filtered26)); -} - -INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t stride, __m256i *taps, int16_t *out, int64_t out_stride) { - - int32_t shift2 = 6; - - __m256i pairs_lo, pairs_hi; - - // Filter 01 later with 67 - __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); - __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); - - __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); - __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br2, br3); - pairs_hi = _mm256_unpackhi_epi16(br2, br3); - __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); - __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br4, br5); - pairs_hi = _mm256_unpackhi_epi16(br4, br5); - __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); - __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br6, br7); - pairs_hi = _mm256_unpackhi_epi16(br6, br7); - __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 - __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 - - __m256i br8 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); - __m256i br9 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br8, br9); - pairs_hi = _mm256_unpackhi_epi16(br8, br9); - // Filter rows02 later - __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 - __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 - - __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); - __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); - pairs_lo = _mm256_unpacklo_epi16(br10, br11); - pairs_hi = _mm256_unpackhi_epi16(br10, br11); - __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 - __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 - - // Deferred - __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r08, r19); - pairs_hi = _mm256_unpackhi_epi16(r08, r19); - __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, 
taps[0]); // Firs 01 67 - __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); - __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); - - __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); - pairs_lo = _mm256_unpacklo_epi16(r412, r513); - pairs_hi = _mm256_unpackhi_epi16(r412, r513); - __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 - __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 - - __m256i accu02_lo, accu02_hi; - __m256i accu46_lo, accu46_hi; - - accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); - accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); - - accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); - accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); - - accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); - accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); - - accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); - accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); - - accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); - accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); - accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); - accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); - - __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); - __m256i rows46 = _mm256_packs_epi32(accu46_lo, accu46_hi); - - __m128i filtered0 = _mm256_castsi256_si128(rows02); - __m128i filtered2 = _mm256_extracti128_si256(rows02, 1); - __m128i filtered4 = _mm256_castsi256_si128(rows46); - __m128i filtered6 = _mm256_extracti128_si256(rows46, 1); - - _mm_storeu_si128((__m128i*)(out + 0 * out_stride), filtered0); - _mm_storeu_si128((__m128i*)(out + 2 * out_stride), filtered2); - _mm_storeu_si128((__m128i*)(out + 4 * out_stride), filtered4); - _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); -} - -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) -{ - // Filter even rows - filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 - - // Filter odd rows - filter_row_ver_16b_8x1_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 - -} - -INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *filters, int16_t *data, int16_t stride, int16_t *out, int out_stride) -{ - // Filter even rows - filter_row_ver_16b_8x1_no_round_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 - - // Filter odd rows - filter_row_ver_16b_8x1_no_round_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 - -} - static void kvz_ipol_8tap_hor_px_im_avx2(int8_t *filter, int width, int height, @@ -1122,9 +651,6 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // HORIZONTAL STEP // Integer pixels - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { for (x = 0; x + 7 < width; x += 8) { @@ -1147,9 +673,6 @@ 
static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // Half pixels - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(fir2, &taps_01_23, &taps_45_67); - kvz_ipol_8tap_hor_px_im_avx2(fir2, width, height + 1, src + 1, src_stride, hor_pos2, hor_stride); // Write the first column in contiguous memory @@ -1166,9 +689,6 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e kvz_pixel *out_t = filtered[2]; kvz_pixel *out_b = filtered[3]; - __m256i taps[4]; - kvz_init_ver_filter_taps(fir0, taps); - // Right int16_t *im = &hor_pos2[hor_stride]; kvz_pixel *dst = out_r; @@ -1186,7 +706,6 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e out_l[y * dst_stride + x] = sample; } - kvz_init_ver_filter_taps(fir2, taps); // Top im = hor_pos0; dst = out_t; @@ -1240,8 +759,6 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco kvz_pixel *out_bl = filtered[2]; kvz_pixel *out_br = filtered[3]; - __m256i taps[4]; - kvz_init_ver_filter_taps(fir2, taps); // Top-Right int16_t *im = hor_pos2; kvz_pixel *dst = out_tr; @@ -1336,13 +853,7 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; // HORIZONTAL STEP - __m256i shuf_01_23, shuf_45_67; - __m256i taps_01_23, taps_45_67; - // Left QPEL - kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); - kvz_init_filter_taps(hor_fir_l, &taps_01_23, &taps_45_67); - int sample_off_y = hpel_off_y < 0 ? 0 : 1; kvz_ipol_8tap_hor_px_im_avx2(hor_fir_l, width, height + 1, src + 1, src_stride, hor_pos_l, hor_stride); @@ -1355,7 +866,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e } // Right QPEL - kvz_init_filter_taps(hor_fir_r, &taps_01_23, &taps_45_67); kvz_ipol_8tap_hor_px_im_avx2(hor_fir_r, width, height + 1, src + 1, src_stride, hor_pos_r, hor_stride); // Write the first column in contiguous memory @@ -1377,12 +887,8 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; - __m256i taps[4]; - // Left QPEL (1/4 or 3/4 x positions) // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_l, taps); - int16_t *im = &hor_pos_l[sample_off_y * hor_stride]; kvz_pixel *dst = out_l; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_l, width, height, im, hor_stride, dst, dst_stride); @@ -1406,8 +912,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Right QPEL (3/4 or 1/4 x positions) // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_r, taps); - im = &hor_pos_r[sample_off_y * hor_stride]; dst = out_r; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_r, width, height, im, hor_stride, dst, dst_stride); @@ -1433,7 +937,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Top QPEL (1/4 or 3/4 y positions) // Filter block and then filter column and align if neccessary int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); - kvz_init_ver_filter_taps(ver_fir_t, taps); im = &hor_hpel_pos[off_y_fir_t * hor_stride]; dst = out_t; @@ -1458,7 +961,6 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e // Bottom QPEL (3/4 or 1/4 y positions) // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_b, taps); im = &hor_hpel_pos[off_y_fir_b * hor_stride]; dst = out_b; @@ -1529,11 +1031,8 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; - __m256i taps[4]; // Top-left QPEL // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_t, taps); - int16_t *im = &hor_pos_l[off_y_fir_t * hor_stride]; kvz_pixel *dst = out_tl; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_t, width, height, im, hor_stride, dst, dst_stride); @@ -1581,8 +1080,6 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco // Bottom-left QPEL // Filter block and then filter column and align if neccessary - kvz_init_ver_filter_taps(ver_fir_b, taps); - im = &hor_pos_l[off_y_fir_b * hor_stride]; dst = out_bl; kvz_ipol_8tap_ver_im_px_avx2(ver_fir_b, width, height, im, hor_stride, dst, dst_stride); From 5a70b49f690713cdae2604933a12428e402a7958 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sat, 6 Mar 2021 18:09:57 +0200 Subject: [PATCH 12/19] Require 64-bit build for AVX2 interpolation filter functions --- src/strategies/avx2/ipol-avx2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 987461c6..8db217aa 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -24,7 +24,7 @@ #include "strategies/avx2/ipol-avx2.h" -#if COMPILE_INTEL_AVX2 +#if COMPILE_INTEL_AVX2 && defined X86_64 #include #include #include @@ -1236,12 +1236,12 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const e kvz_ipol_4tap_ver_hi_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -#endif //COMPILE_INTEL_AVX2 +#endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) { bool success = true; -#if COMPILE_INTEL_AVX2 +#if COMPILE_INTEL_AVX2 && defined X86_64 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); @@ -1252,6 +1252,6 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } -#endif //COMPILE_INTEL_AVX2 +#endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; } From 4314f3a9a7980657f15ad012c53afa04b8af3fe5 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sat, 6 Mar 2021 19:46:36 +0200 Subject: [PATCH 13/19] Rename some interpolation functions and strategies for consistency --- src/inter.c | 14 +++++++------- src/strategies/avx2/ipol-avx2.c | 24 ++++++++++++------------ src/strategies/generic/ipol-generic.c | 8 ++++---- src/strategies/generic/ipol-generic.h | 4 ++-- 
src/strategies/strategies-ipol.c | 4 ++-- src/strategies/strategies-ipol.h | 12 ++++++------ 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/inter.c b/src/inter.c index 11ed73ac..95351a5e 100644 --- a/src/inter.c +++ b/src/inter.c @@ -94,7 +94,7 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, mv_param); } -static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, +static void inter_recon_frac_luma_hi(const encoder_state_t *const state, const kvz_picture *const ref, int32_t xpos, int32_t ypos, @@ -136,7 +136,7 @@ static void inter_recon_14bit_frac_luma(const encoder_state_t *const state, epol_args.ext_s = &ext_s; kvz_get_extended_block(&epol_args); - kvz_sample_14bit_quarterpel_luma(state->encoder_control, + kvz_sample_quarterpel_luma_hi(state->encoder_control, ext_origin, ext_s, block_width, @@ -219,7 +219,7 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, mv_param); } -static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, +static void inter_recon_frac_chroma_hi(const encoder_state_t *const state, const kvz_picture *const ref, int32_t xpos, int32_t ypos, @@ -264,7 +264,7 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, epol_args.ext_s = &ext_s; kvz_get_extended_block(&epol_args); - kvz_sample_14bit_octpel_chroma(state->encoder_control, + kvz_sample_octpel_chroma_hi(state->encoder_control, ext_origin, ext_s, block_width / 2, @@ -278,7 +278,7 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t *const state, // Chroma V epol_args.src = ref->v; kvz_get_extended_block(&epol_args); - kvz_sample_14bit_octpel_chroma(state->encoder_control, + kvz_sample_octpel_chroma_hi(state->encoder_control, ext_origin, ext_s, block_width / 2, @@ -378,7 +378,7 @@ static void inter_recon_unipred(const encoder_state_t * const state, if (fractional_luma) { // With a fractional MV, do interpolation. if (state->encoder_control->cfg.bipred && hi_prec_out) { - inter_recon_14bit_frac_luma(state, ref, + inter_recon_frac_luma_hi(state, ref, pu_in_tile.x, pu_in_tile.y, width, height, mv_param, hi_prec_out); @@ -418,7 +418,7 @@ static void inter_recon_unipred(const encoder_state_t * const state, if (fractional_luma || fractional_chroma) { // With a fractional MV, do interpolation. 
if (state->encoder_control->cfg.bipred && hi_prec_out) { - inter_recon_14bit_frac_chroma(state, ref, + inter_recon_frac_chroma_hi(state, ref, pu_in_tile.x, pu_in_tile.y, width, height, mv_param, hi_prec_out); diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 8db217aa..c3ea92e3 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -409,7 +409,7 @@ int16_t dst_stride) } } -static void kvz_ipol_4tap_hor_px_hi_avx2(int8_t *filter, +static void kvz_ipol_4tap_hor_px_im_avx2(int8_t *filter, int width, int height, kvz_pixel *src, @@ -473,7 +473,7 @@ static void kvz_ipol_4tap_hor_px_hi_avx2(int8_t *filter, } } -static void kvz_ipol_4tap_ver_hi_px_avx2(int8_t *filter, +static void kvz_ipol_4tap_ver_im_px_avx2(int8_t *filter, int width, int height, int16_t *src, @@ -547,7 +547,7 @@ static void kvz_ipol_4tap_ver_hi_px_avx2(int8_t *filter, } } -static void kvz_ipol_4tap_ver_hi_hi_avx2(int8_t *filter, +static void kvz_ipol_4tap_ver_im_hi_avx2(int8_t *filter, int width, int height, int16_t *src, @@ -1155,7 +1155,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco } -static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, +static void kvz_sample_quarterpel_luma_hi_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, @@ -1204,11 +1204,11 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t *const encoder ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); - kvz_ipol_4tap_ver_hi_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); + kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_im_px_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } -static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const encoder, +static void kvz_sample_octpel_chroma_hi_avx2(const encoder_control_t *const encoder, kvz_pixel *src, int16_t src_stride, int width, @@ -1221,7 +1221,7 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const e { // TODO: Optimizations for rest of the blocks (for example 2x8). 
if (width % 4 != 0) { - kvz_sample_14bit_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + kvz_sample_octpel_chroma_hi_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; @@ -1232,8 +1232,8 @@ static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t *const e ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; int16_t hor_stride = LCU_WIDTH_C; - kvz_ipol_4tap_hor_px_hi_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); - kvz_ipol_4tap_ver_hi_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); + kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); + kvz_ipol_4tap_ver_im_hi_avx2(ver_fir, width, height, hor_intermediate, hor_stride, dst, dst_stride); } #endif //COMPILE_INTEL_AVX2 && defined X86_64 @@ -1249,8 +1249,8 @@ int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "avx2", 40, &kvz_filter_qpel_blocks_diag_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "avx2", 40, &kvz_sample_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "avx2", 40, &kvz_sample_octpel_chroma_avx2); - success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); - success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "avx2", 40, &kvz_sample_quarterpel_luma_hi_avx2); + success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "avx2", 40, &kvz_sample_octpel_chroma_hi_avx2); } #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index b9024c98..4b777c86 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, } } -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; @@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k } } -void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; @@ -780,8 
+780,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth) success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic); - success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic); - success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic); + success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic); + success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic); success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic); return success; diff --git a/src/strategies/generic/ipol-generic.h b/src/strategies/generic/ipol-generic.h index f176b4cd..85eb2931 100644 --- a/src/strategies/generic/ipol-generic.h +++ b/src/strategies/generic/ipol-generic.h @@ -32,9 +32,9 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth); void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); #endif //STRATEGIES_IPOL_GENERIC_H_ diff --git a/src/strategies/strategies-ipol.c b/src/strategies/strategies-ipol.c index f889b716..0a8990f1 100644 --- a/src/strategies/strategies-ipol.c +++ b/src/strategies/strategies-ipol.c @@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; epol_func *kvz_get_extended_block; kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; -kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma; -kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma; +kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi; +kvz_sample_octpel_chroma_hi_func * 
kvz_sample_octpel_chroma_hi; int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) { diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index 31d15bc4..b47da64f 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -70,8 +70,8 @@ typedef void(epol_func)(kvz_epol_args *args); typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); // Declare function pointers. 
extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; @@ -81,8 +81,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; extern epol_func * kvz_get_extended_block; extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; -extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma; -extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma; +extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi; +extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi; int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth); @@ -95,8 +95,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth); {"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \ {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \ {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \ - {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \ - {"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \ + {"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \ + {"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \ {"get_extended_block", (void**) &kvz_get_extended_block}, \ From 475f1d79d5175970aed03f8ba96aa9e62010ad96 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 7 Mar 2021 01:27:55 +0200 Subject: [PATCH 14/19] Add some defines for important interpolation related sizes --- src/inter.c | 16 ++++++++-------- src/search_inter.c | 10 +++++----- src/strategies/avx2/ipol-avx2.c | 24 ++++++++++++------------ src/strategies/generic/ipol-generic.c | 16 ++++++++-------- src/strategies/strategies-ipol.h | 17 ++++++++++++++++- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/src/inter.c b/src/inter.c index 95351a5e..ff95741f 100644 --- a/src/inter.c +++ b/src/inter.c @@ -53,9 +53,9 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 3); // Space for extrapolated pixels and the part from the picture. - // One extra row for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -107,9 +107,9 @@ static void inter_recon_frac_luma_hi(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 3); // Space for extrapolated pixels and the part from the picture. - // One extra row for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_LUMA * (KVZ_EXT_BLOCK_W_LUMA + 1)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -161,9 +161,9 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 7); // Space for extrapolated pixels and the part from the picture. - // Three extra rows for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. 
- kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -232,9 +232,9 @@ static void inter_recon_frac_chroma_hi(const encoder_state_t *const state, int mv_frac_y = (mv_param[1] & 7); // Space for extrapolated pixels and the part from the picture. - // Three extra rows for AVX2. + // Some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[KVZ_EXT_BLOCK_W_CHROMA * (KVZ_EXT_BLOCK_W_CHROMA + 3)]; + kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; diff --git a/src/search_inter.c b/src/search_inter.c index 81e6fb92..a7611248 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -992,11 +992,11 @@ static void search_frac(inter_search_info_t *info) unsigned costs[4] = { 0 }; - ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH]; + ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE]; // Storage buffers for intermediate horizontally filtered results. // Have the first columns in contiguous memory for vectorization. - ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD]; int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1]; const kvz_picture *ref = info->ref; @@ -1013,9 +1013,9 @@ static void search_frac(inter_search_info_t *info) int8_t sample_off_y = 0; // Space for (possibly) extrapolated pixels and the part from the picture - // One extra column for ME and two extra columns for ME and AVX2 + // One extra row and column compared to normal interpolation and some extra for AVX2. // The extrapolation function will set the pointers and stride. - kvz_pixel ext_buffer[(KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 2)]; + kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD]; kvz_pixel *ext = NULL; kvz_pixel *ext_origin = NULL; int ext_s = 0; @@ -1031,7 +1031,7 @@ static void search_frac(inter_search_info_t *info) .pad_l = KVZ_LUMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, }; // Initialize separately. 
Gets rid of warning diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index c3ea92e3..46173a53 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -618,8 +618,8 @@ static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -730,8 +730,8 @@ static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -809,8 +809,8 @@ static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * e int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -989,8 +989,8 @@ static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * enco int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -1147,7 +1147,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco // Buffer for intermediate values with one extra row // because the loop writes two rows each iteration. - ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD]; int16_t hor_stride = LCU_WIDTH; kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); @@ -1172,7 +1172,7 @@ static void kvz_sample_quarterpel_luma_hi_avx2(const encoder_control_t * const e // Buffer for intermediate values with one extra row // because the loop writes two rows each iteration. - ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD]; int16_t hor_stride = LCU_WIDTH; kvz_ipol_8tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); @@ -1201,7 +1201,7 @@ static void kvz_sample_octpel_chroma_avx2(const encoder_control_t *const encoder // Buffer for intermediate values with 3 extra rows // because the loop writes four rows each iteration. 
- ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD]; int16_t hor_stride = LCU_WIDTH_C; kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); @@ -1229,7 +1229,7 @@ static void kvz_sample_octpel_chroma_hi_avx2(const encoder_control_t *const enco // Buffer for intermediate values with 3 extra rows // because the loop writes four rows each iteration. - ALIGNED(64) int16_t hor_intermediate[(KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C]; + ALIGNED(64) int16_t hor_intermediate[KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD]; int16_t hor_stride = LCU_WIDTH_C; kvz_ipol_4tap_hor_px_im_avx2(hor_fir, width, height, src, src_stride, hor_intermediate, hor_stride); diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 4b777c86..8b5d76c3 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) @@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], - int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + kvz_pixel filtered[4][LCU_LUMA_SIZE], + int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t hpel_off_x, int8_t hpel_off_y) diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index b47da64f..31680ec7 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -31,11 +31,26 @@ #include "kvazaar.h" #include "search_inter.h" +// AVX2 implementation of horizontal filter reads and +// writes two rows for luma and four for chroma at a time. +// Extra vertical padding is added to prevent segfaults. +// Horizontal padding is not needed even if one extra byte +// is read because kvz_image_alloc adds enough padding. 
+#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA) +#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA) +#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH) +#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C) + +// On top of basic interpolation, FME needs one extra +// column and row for ME (left and up). Adding the +// extra row happens to satisfy AVX2 requirements for +// row count. No other extra rows are needed. +#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1)) typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, - kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], int8_t sample_off_x, int8_t sample_off_y); typedef struct { From 7ce68761c289bb6876ac9cd7825a0bea6f41c594 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 7 Mar 2021 17:32:17 +0200 Subject: [PATCH 15/19] Add a reminder to fix a rare case for bipred --- src/strategies/avx2/ipol-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 46173a53..fa72cf27 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -398,7 +398,7 @@ int16_t dst_stride) __m256i sum0123 = _mm256_add_epi32(dot01, dot23); __m256i sum4567 = _mm256_add_epi32(dot45, dot67); __m256i sum = _mm256_add_epi32(sum0123, sum4567); - sum = _mm256_srai_epi32(sum, shift2); + sum = _mm256_srai_epi32(sum, shift2); // TODO: -8192 offsetting for extreme values sum = _mm256_packs_epi32(sum, sum); int16_t *dst_addr0 = &dst[(y + 0) * dst_stride + x]; From 33295bf350f521b1122591d57469996575438822 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Sun, 7 Mar 2021 19:36:20 +0200 Subject: [PATCH 16/19] Use AVX2 luma interpolation for SMP and AMP as well --- src/strategies/avx2/ipol-avx2.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index fa72cf27..e63cfb8a 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -1136,12 +1136,7 @@ static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const enco int8_t ver_flag, const int16_t mv[2]) { - // TODO: Optimize SMP and AMP - if (width != height) { - kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } - + // TODO: horizontal and vertical only filtering int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; From b72ab583b4d3fcc6abdf023eaf82eaa19c9cd076 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 8 Mar 2021 16:05:34 +0200 Subject: [PATCH 17/19] Handle "don't care" rows in the end separately --- src/image.c | 1 + src/inter.c | 12 ++++++++---- src/search_inter.c | 1 + src/strategies/generic/ipol-generic.c | 15 ++++++++++++--- src/strategies/strategies-ipol.h | 1 + 5 
files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/image.c b/src/image.c index 71e791dd..7572b1f3 100644 --- a/src/image.c +++ b/src/image.c @@ -497,6 +497,7 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic, .pad_r = 0, .pad_t = 0, .pad_b = 0, + .pad_b_simd = 0, }; // Initialize separately. Gets rid of warning diff --git a/src/inter.c b/src/inter.c index ff95741f..b96b082c 100644 --- a/src/inter.c +++ b/src/inter.c @@ -71,7 +71,8 @@ static void inter_recon_frac_luma(const encoder_state_t *const state, .pad_l = KVZ_LUMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_b_simd = 1 // One row for AVX2 }; // Initialize separately. Gets rid of warning @@ -125,7 +126,8 @@ static void inter_recon_frac_luma_hi(const encoder_state_t *const state, .pad_l = KVZ_LUMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET + 1, // One row for AVX2 + .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_b_simd = 1 // One row for AVX2 }; // Initialize separately. Gets rid of warning @@ -182,7 +184,8 @@ static void inter_recon_frac_chroma(const encoder_state_t *const state, .pad_l = KVZ_CHROMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_b_simd = 3 // Three rows for AVX2 }; // Initialize separately. Gets rid of warning @@ -253,7 +256,8 @@ static void inter_recon_frac_chroma_hi(const encoder_state_t *const state, .pad_l = KVZ_CHROMA_FILTER_OFFSET, .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, .pad_t = KVZ_CHROMA_FILTER_OFFSET, - .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET + 3, // Three rows for AVX2 + .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET, + .pad_b_simd = 3 // Three rows for AVX2 }; // Initialize separately. Gets rid of warning diff --git a/src/search_inter.c b/src/search_inter.c index a7611248..029291f7 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1032,6 +1032,7 @@ static void search_frac(inter_search_info_t *info) .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, .pad_t = KVZ_LUMA_FILTER_OFFSET, .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET, + .pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h }; // Initialize separately. 
Gets rid of warning diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 8b5d76c3..461cc5c2 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -731,7 +731,7 @@ void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder void kvz_get_extended_block_generic(kvz_epol_args *args) { int min_y = args->blk_y - args->pad_t; - int max_y = args->blk_y + args->blk_h + args->pad_b - 1; + int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1; bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h); int min_x = args->blk_x - args->pad_l; @@ -744,12 +744,15 @@ void kvz_get_extended_block_generic(kvz_epol_args *args) { *args->ext_s = args->pad_l + args->blk_w + args->pad_r; *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l; + // Note that stride equals width here. int cnt_l = CLIP(0, *args->ext_s, -min_x); int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1)); int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r); - // For each row including padding - for (int y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { + // For each row including real padding. + // Don't read "don't care" values (SIMD padding). Zero them out. + int y; + for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y); kvz_pixel sample_l = *(args->src + clipped_y * args->src_s); @@ -762,6 +765,12 @@ void kvz_get_extended_block_generic(kvz_epol_args *args) { for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i); for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = sample_r; } + + for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) { + kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s); + FILL_ARRAY(dst, 0, *args->ext_s); + } + } else { *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l); diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index 31680ec7..79d4dd4c 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -69,6 +69,7 @@ typedef struct { int pad_r; // Right int pad_t; // Top int pad_b; // Bottom + int pad_b_simd; // "Don't care" rows in the end. Zeroed out. // Buffer for possible extrapolation. Free memory provided by the caller. 
kvz_pixel *buf; From dad3d6818ef18cef7183cc079a0ffab6fb678e44 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 8 Mar 2021 16:49:37 +0200 Subject: [PATCH 18/19] Only read left and right border pixels if necessary --- src/strategies/generic/ipol-generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 461cc5c2..e5c5cef6 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -755,15 +755,15 @@ void kvz_get_extended_block_generic(kvz_epol_args *args) { for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) { int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y); - kvz_pixel sample_l = *(args->src + clipped_y * args->src_s); - kvz_pixel sample_r = *(args->src + clipped_y * args->src_s + args->src_w - 1); + kvz_pixel *sample_l = args->src + clipped_y * args->src_s; + kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1; kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0); kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s); kvz_pixel *dst_m = dst_l + cnt_l; kvz_pixel *dst_r = dst_m + cnt_m; - for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = sample_l; + for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l; for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i); - for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = sample_r; + for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r; } for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) { From 5bc4cdf401114e0abedd5fde292b6111113dfbf8 Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Mon, 8 Mar 2021 22:01:27 +0200 Subject: [PATCH 19/19] Update TSAN suppressions --- tests/tsan_suppressions.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/tsan_suppressions.txt b/tests/tsan_suppressions.txt index d2f565b6..9f43b6d5 100644 --- a/tests/tsan_suppressions.txt +++ b/tests/tsan_suppressions.txt @@ -1,3 +1,4 @@ -race:kvz_eight_tap_filter_hor_8x1_avx2 +# AVX2 interpolation reads some extra pixels +race:kvz_ipol_8tap_hor_px_im_avx2 race:kvz_filter_hpel_blocks_hor_ver_luma_avx2 -race:kvz_eight_tap_filter_hor_avx2 \ No newline at end of file +race:kvz_eight_tap_filter_hor_avx2