Merge branch 'interpolation-2021'

2024-11-23 18:14:06 +00:00 · 2021-03-08 22:36:34 +02:00 · 2021-03-08 22:36:34 +02:00 · c36d423a8c
parent 17c9fc5cc9 5bc4cdf401
commit c36d423a8c
9 changed files with 934 additions and 1109 deletions
--- a/src/image.c
+++ b/src/image.c
@@ -477,33 +477,46 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic,
                             ref->stride) >> (KVZ_BIT_DEPTH - 8);
  } else {
    // Extrapolate pixels from outside the frame.
-    kvz_extended_block block;
-    kvz_get_extended_block(pic_x,
-                           pic_y,
-                           ref_x - pic_x,
-                           ref_y - pic_y,
-                           0,
-                           0,
-                           ref->y,
-                           ref->width,
-                           ref->height,
-                           0,
-                           block_width,
-                           block_height,
-                           &block);
+
+    // Space for extrapolated pixels and the part from the picture
+    // The extrapolation function will set the pointers and stride.
+    kvz_pixel ext_buffer[LCU_LUMA_SIZE];
+    kvz_pixel *ext = NULL;
+    kvz_pixel *ext_origin = NULL;
+    int ext_s = 0;
+    kvz_epol_args epol_args = {
+      .src = ref->y,
+      .src_w = ref->width,
+      .src_h = ref->height,
+      .src_s = ref->stride,
+      .blk_x = ref_x,
+      .blk_y = ref_y,
+      .blk_w = block_width,
+      .blk_h = block_height,
+      .pad_l = 0,
+      .pad_r = 0,
+      .pad_t = 0,
+      .pad_b = 0,
+      .pad_b_simd = 0,
+    };
+
+    // Initialize separately. Gets rid of warning
+    // about using nonstandard extension.
+    epol_args.buf = ext_buffer;
+    epol_args.ext = &ext;
+    epol_args.ext_origin = &ext_origin;
+    epol_args.ext_s = &ext_s;
+
+    kvz_get_extended_block(&epol_args);

    const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];

    unsigned satd = kvz_satd_any_size(block_width,
-                                      block_height,
-                                      pic_data,
-                                      pic->stride,
-                                      block.buffer,
-                                      block.stride) >> (KVZ_BIT_DEPTH - 8);
-
-    if (block.malloc_used) {
-      FREE_POINTER(block.buffer);
-    }
+      block_height,
+      pic_data,
+      pic->stride,
+      ext_origin,
+      ext_s) >> (KVZ_BIT_DEPTH - 8);

    return satd;
  }
--- a/src/inter.c
+++ b/src/inter.c
@@ -40,224 +40,258 @@ typedef struct {
 } merge_candidates_t;


-static void inter_recon_frac_luma(const encoder_state_t * const state,
-                                  const kvz_picture * const ref,
-                                  int32_t xpos,
-                                  int32_t ypos,
-                                  int32_t block_width,
-                                  int32_t block_height,
-                                  const int16_t mv_param[2],
-                                  lcu_t *lcu)
+static void inter_recon_frac_luma(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  lcu_t *lcu)
 {
  int mv_frac_x = (mv_param[0] & 3);
  int mv_frac_y = (mv_param[1] & 3);

-  // Fractional luma 1/4-pel
-  kvz_extended_block src = {0, 0, 0, 0};
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;
+  kvz_epol_args epol_args = {
+    .src = ref->y,
+    .src_w = ref->width,
+    .src_h = ref->height,
+    .src_s = ref->stride,
+    .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
+    .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
+    .blk_w = block_width,
+    .blk_h = block_height,
+    .pad_l = KVZ_LUMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_t = KVZ_LUMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_b_simd = 1 // One row for AVX2
+  };

-  // Fractional luma
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         mv_param[0] >> 2,
-                         mv_param[1] >> 2,
-                         state->tile->offset_x,
-                         state->tile->offset_y,
-                         ref->y,
-                         ref->width,
-                         ref->height,
-                         KVZ_LUMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;
+
+  kvz_get_extended_block(&epol_args);
  kvz_sample_quarterpel_luma(state->encoder_control,
-                                     src.orig_topleft,
-                                     src.stride,
-                                     block_width,
-                                     block_height,
-                                     lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
-                                     LCU_WIDTH,
-                                     mv_frac_x,
-                                     mv_frac_y,
-                                     mv_param);
-
-  if (src.malloc_used) free(src.buffer);
+    ext_origin,
+    ext_s,
+    block_width,
+    block_height,
+    lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
+    LCU_WIDTH,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }

-static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
-                                        const kvz_picture * const ref,
-                                        int32_t xpos,
-                                        int32_t ypos,
-                                        int32_t block_width,
-                                        int32_t block_height,
-                                        const int16_t mv_param[2],
-                                        hi_prec_buf_t *hi_prec_out)
+static void inter_recon_frac_luma_hi(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  hi_prec_buf_t *hi_prec_out)
 {
  int mv_frac_x = (mv_param[0] & 3);
  int mv_frac_y = (mv_param[1] & 3);

-  // Fractional luma 1/4-pel
-  kvz_extended_block src = { 0, 0, 0, 0 };
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;
+  kvz_epol_args epol_args = {
+    .src = ref->y,
+    .src_w = ref->width,
+    .src_h = ref->height,
+    .src_s = ref->stride,
+    .blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
+    .blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
+    .blk_w = block_width,
+    .blk_h = block_height,
+    .pad_l = KVZ_LUMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_t = KVZ_LUMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_b_simd = 1 // One row for AVX2
+  };

-  // Fractional luma
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         mv_param[0] >> 2,
-                         mv_param[1] >> 2,
-                         state->tile->offset_x,
-                         state->tile->offset_y,
-                         ref->y,
-                         ref->width,
-                         ref->height,
-                         KVZ_LUMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src);
-  kvz_sample_14bit_quarterpel_luma(state->encoder_control,
-                                           src.orig_topleft,
-                                           src.stride,
-                                           block_width,
-                                           block_height,
-                                           hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
-                                           LCU_WIDTH,
-                                           mv_frac_x,
-                                           mv_frac_y,
-                                           mv_param);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;

-  if (src.malloc_used) free(src.buffer);
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_quarterpel_luma_hi(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width,
+    block_height,
+    hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
+    LCU_WIDTH,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }

-static void inter_recon_frac_chroma(const encoder_state_t * const state,
-                                    const kvz_picture * const ref,
-                                    int32_t xpos,
-                                    int32_t ypos,
-                                    int32_t block_width,
-                                    int32_t block_height,
-                                    const int16_t mv_param[2],
-                                    lcu_t *lcu)
+static void inter_recon_frac_chroma(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  lcu_t *lcu)
 {
  int mv_frac_x = (mv_param[0] & 7);
  int mv_frac_y = (mv_param[1] & 7);

-  // Translate to chroma
-  xpos >>= 1;
-  ypos >>= 1;
-  block_width >>= 1;
-  block_height >>= 1;
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;

-  // Fractional chroma 1/8-pel
-  kvz_extended_block src_u = { 0, 0, 0, 0 };
-  kvz_extended_block src_v = { 0, 0, 0, 0 };
+  // Chroma U
+  // Divisions by 2 due to 4:2:0 chroma subsampling
+  kvz_epol_args epol_args = {
+    .src = ref->u,
+    .src_w = ref->width / 2,
+    .src_h = ref->height / 2,
+    .src_s = ref->stride / 2,
+    .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
+    .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
+    .blk_w = block_width / 2,
+    .blk_h = block_height / 2,
+    .pad_l = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_t = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b_simd = 3 // Three rows for AVX2
+  };

-  //Fractional chroma U
-  kvz_get_extended_block(xpos, ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->u,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_u);
-  kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
-    block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;

-  //Fractional chroma V
-  kvz_get_extended_block(xpos, ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->v,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_v);
-  kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
-    block_height, lcu->rec.v + (ypos  % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);

-  if (src_u.malloc_used) free(src_u.buffer);
-  if (src_v.malloc_used) free(src_v.buffer);
+  // Chroma V
+  epol_args.src = ref->v;
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }

-static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state,
-                                          const kvz_picture * const ref,
-                                          int32_t xpos,
-                                          int32_t ypos,
-                                          int32_t block_width,
-                                          int32_t block_height,
-                                          const int16_t mv_param[2],
-                                          hi_prec_buf_t *hi_prec_out)
+static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
+  const kvz_picture *const ref,
+  int32_t xpos,
+  int32_t ypos,
+  int32_t block_width,
+  int32_t block_height,
+  const int16_t mv_param[2],
+  hi_prec_buf_t *hi_prec_out)
 {
  int mv_frac_x = (mv_param[0] & 7);
  int mv_frac_y = (mv_param[1] & 7);

-  // Translate to chroma
-  xpos >>= 1;
-  ypos >>= 1;
-  block_width >>= 1;
-  block_height >>= 1;
+  // Space for extrapolated pixels and the part from the picture.
+  // Some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;

-  // Fractional chroma 1/8-pel
-  kvz_extended_block src_u = { 0, 0, 0, 0 };
-  kvz_extended_block src_v = { 0, 0, 0, 0 };
+  // Chroma U
+  // Divisions by 2 due to 4:2:0 chroma subsampling
+  kvz_epol_args epol_args = {
+    .src = ref->u,
+    .src_w = ref->width / 2,
+    .src_h = ref->height / 2,
+    .src_s = ref->stride / 2,
+    .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
+    .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
+    .blk_w = block_width / 2,
+    .blk_h = block_height / 2,
+    .pad_l = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_t = KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
+    .pad_b_simd = 3 // Three rows for AVX2
+  };

-  //Fractional chroma U
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->u,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_u);
-  kvz_sample_14bit_octpel_chroma(state->encoder_control,
-                                         src_u.orig_topleft,
-                                         src_u.stride,
-                                         block_width,
-                                         block_height,
-                                         hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
-                                         LCU_WIDTH_C,
-                                         mv_frac_x,
-                                         mv_frac_y,
-                                         mv_param);
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;

-  //Fractional chroma V
-  kvz_get_extended_block(xpos,
-                         ypos,
-                         (mv_param[0] >> 2) >> 1,
-                         (mv_param[1] >> 2) >> 1,
-                         state->tile->offset_x >> 1,
-                         state->tile->offset_y >> 1,
-                         ref->v,
-                         ref->width >> 1,
-                         ref->height >> 1,
-                         KVZ_CHROMA_FILTER_TAPS,
-                         block_width,
-                         block_height,
-                         &src_v);
-  kvz_sample_14bit_octpel_chroma(state->encoder_control,
-                                         src_v.orig_topleft,
-                                         src_v.stride,
-                                         block_width,
-                                         block_height,
-                                         hi_prec_out->v + (ypos  % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
-                                         LCU_WIDTH_C,
-                                         mv_frac_x,
-                                         mv_frac_y,
-                                         mv_param);
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma_hi(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);

-  if (src_u.malloc_used) free(src_u.buffer);
-  if (src_v.malloc_used) free(src_v.buffer);
+  // Chroma V
+  epol_args.src = ref->v;
+  kvz_get_extended_block(&epol_args);
+  kvz_sample_octpel_chroma_hi(state->encoder_control,
+    ext_origin,
+    ext_s,
+    block_width / 2,
+    block_height / 2,
+    hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
+    LCU_WIDTH_C,
+    mv_frac_x,
+    mv_frac_y,
+    mv_param);
 }


@@ -348,7 +382,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
    if (fractional_luma) {
      // With a fractional MV, do interpolation.
      if (state->encoder_control->cfg.bipred && hi_prec_out) {
-        inter_recon_14bit_frac_luma(state, ref,
+        inter_recon_frac_luma_hi(state, ref,
          pu_in_tile.x, pu_in_tile.y,
          width, height,
          mv_param, hi_prec_out);
@@ -388,7 +422,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
  if (fractional_luma || fractional_chroma) {
    // With a fractional MV, do interpolation.
    if (state->encoder_control->cfg.bipred && hi_prec_out) {
-      inter_recon_14bit_frac_chroma(state, ref,
+      inter_recon_frac_chroma_hi(state, ref,
                                    pu_in_tile.x, pu_in_tile.y,
                                    width, height,
                                    mv_param, hi_prec_out);
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -992,12 +992,11 @@ static void search_frac(inter_search_info_t *info)

  unsigned costs[4] = { 0 };

-  kvz_extended_block src = { 0, 0, 0, 0 };
-  ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH];
+  ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE];

  // Storage buffers for intermediate horizontally filtered results.
  // Have the first columns in contiguous memory for vectorization.
-  ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH];
+  ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD];
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1];

  const kvz_picture *ref = info->ref;
@@ -1013,20 +1012,45 @@ static void search_frac(inter_search_info_t *info)
  int8_t sample_off_x = 0;
  int8_t sample_off_y = 0;

-  kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1,
-                state->tile->offset_x,
-                state->tile->offset_y,
-                ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS,
-                internal_width+1, internal_height+1,
-                &src);
+  // Space for (possibly) extrapolated pixels and the part from the picture
+  // One extra row and column compared to normal interpolation and some extra for AVX2.
+  // The extrapolation function will set the pointers and stride.
+  kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD];
+  kvz_pixel *ext = NULL;
+  kvz_pixel *ext_origin = NULL;
+  int ext_s = 0;
+  kvz_epol_args epol_args = {
+    .src = ref->y,
+    .src_w = ref->width,
+    .src_h = ref->height,
+    .src_s = ref->stride,
+    .blk_x = state->tile->offset_x + orig.x + mv.x - 1,
+    .blk_y = state->tile->offset_y + orig.y + mv.y - 1,
+    .blk_w = internal_width + 1,  // TODO: real width
+    .blk_h = internal_height + 1, // TODO: real height
+    .pad_l = KVZ_LUMA_FILTER_OFFSET,
+    .pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_t = KVZ_LUMA_FILTER_OFFSET,
+    .pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
+    .pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h
+  };
+
+  // Initialize separately. Gets rid of warning
+  // about using nonstandard extension.
+  epol_args.buf = ext_buffer;
+  epol_args.ext = &ext;
+  epol_args.ext_origin = &ext_origin;
+  epol_args.ext_s = &ext_s;
+
+  kvz_get_extended_block(&epol_args);

  kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x;
  int tmp_stride = pic->stride;
                  
  // Search integer position
  costs[0] = kvz_satd_any_size(width, height,
-                            tmp_pic, tmp_stride,
-                            src.orig_topleft + src.stride + 1, src.stride);
+    tmp_pic, tmp_stride,
+    ext_origin + ext_s + 1, ext_s);

  costs[0] += info->mvd_cost_func(state,
                                  mv.x, mv.y, 2,
@@ -1056,8 +1080,8 @@ static void search_frac(inter_search_info_t *info)
    const int mv_shift = (step < 2) ? 1 : 0;

    filter_steps[step](state->encoder_control,
-      src.orig_topleft,
-      src.stride,
+      ext_origin,
+      ext_s,
      internal_width,
      internal_height,
      filtered,
@@ -1131,8 +1155,6 @@ static void search_frac(inter_search_info_t *info)
  info->best_mv = mv;
  info->best_cost = best_cost;
  info->best_bitcost = best_bitcost;
-
-  if (src.malloc_used) free(src.buffer);
 }

 /**
--- a/src/strategies/avx2/ipol-avx2.c
+++ b/src/strategies/avx2/ipol-avx2.c
--- a/src/strategies/generic/ipol-generic.c
+++ b/src/strategies/generic/ipol-generic.c
@@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder,
  }
 }

-void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
+void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
 {
  //TODO: horizontal and vertical only filtering
  int32_t x, y;
@@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  int16_t src_stride,
  int width,
  int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
-  int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
+  kvz_pixel filtered[4][LCU_LUMA_SIZE],
+  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
@@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k
  }
 }

-void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
+void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
 {
  //TODO: horizontal and vertical only filtering
  int32_t x, y;
@@ -728,59 +728,55 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco
 }


-void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filter_size, int width, int height, kvz_extended_block *out) {
+void kvz_get_extended_block_generic(kvz_epol_args *args) {

-  int half_filter_size = filter_size >> 1;
+  int min_y = args->blk_y - args->pad_t;
+  int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
+  bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h);

-  out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x);
-  out->stride = ref_width;
-  out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
-  out->malloc_used = 0;
+  int min_x = args->blk_x - args->pad_l;
+  int max_x = args->blk_x + args->blk_w + args->pad_r - 1;
+  bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w);

-  int min_y = ypos - half_filter_size + off_y + mv_y;
-  int max_y = min_y + height + filter_size;
-  int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
+  if (out_of_bounds_y || out_of_bounds_x) {

-  int min_x = xpos - half_filter_size + off_x + mv_x;
-  int max_x = min_x + width + filter_size;
-  int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
+    *args->ext = args->buf;
+    *args->ext_s = args->pad_l + args->blk_w + args->pad_r;
+    *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l;

-  int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
+    // Note that stride equals width here.
+    int cnt_l = CLIP(0, *args->ext_s, -min_x);
+    int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1));
+    int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r);

-  if (sample_out_of_bounds){
-    out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size));
-    if (!out->buffer){
-      fprintf(stderr, "Memory allocation failed!\n");
-      assert(0);
+    // For each row including real padding.
+    // Don't read "don't care" values (SIMD padding). Zero them out.
+    int y;
+    for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) {
+
+      int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y);
+      kvz_pixel *sample_l = args->src + clipped_y * args->src_s;
+      kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1;
+      kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0);
+      kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s);
+      kvz_pixel *dst_m = dst_l + cnt_l;
+      kvz_pixel *dst_r = dst_m + cnt_m;
+      for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l;
+      for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i);
+      for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r;
    }
-    out->stride = width + filter_size;
-    out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
-    out->malloc_used = 1;

-    int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
-
-    for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) {
-
-      // calculate y-pixel offset
-      coord_y = y + off_y + mv_y;
-      coord_y = CLIP(0, (ref_height)-1, coord_y);
-      coord_y *= ref_width;
-
-      if (!out_of_bounds_x){
-        memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel));
-      } else {
-        for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) {
-
-          coord_x = x + off_x + mv_x;
-          coord_x = CLIP(0, (ref_width)-1, coord_x);
-
-          // Store source block data (with extended borders)
-          out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x];
-        }
-      }
+    for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) {
+      kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s);
+      FILL_ARRAY(dst, 0, *args->ext_s);
    }
-  } 
+
+  } else {
+
+    *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
+    *args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
+    *args->ext_s = args->src_s;
+  }
 }

 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
@ -793,8 +789,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
  success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
  success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
  success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
-  success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
-  success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic);
+  success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
+  success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
  success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);

  return success;
--- a/src/strategies/generic/ipol-generic.h
+++ b/src/strategies/generic/ipol-generic.h
@ -32,9 +32,9 @@

 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
 void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);


 #endif //STRATEGIES_IPOL_GENERIC_H_
--- a/src/strategies/strategies-ipol.c
+++ b/src/strategies/strategies-ipol.c
@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
 epol_func *kvz_get_extended_block;
 kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
 kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
-kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
-kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
+kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
+kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;


 int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) {
--- a/src/strategies/strategies-ipol.h
+++ b/src/strategies/strategies-ipol.h
@ -31,21 +31,63 @@
 #include "kvazaar.h"
 #include "search_inter.h"

+// The AVX2 implementation of the horizontal filter reads and
+// writes two rows at a time for luma and four for chroma.
+// Extra vertical padding (the "+ 1" luma and "+ 3" chroma terms below) is added to prevent segfaults.
+// Horizontal padding is not needed even if one extra byte
+// is read, because kvz_image_alloc adds enough padding.
+#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA)
+#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA)
+#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH)
+#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
+
+// On top of basic interpolation, FME needs one extra
+// column and one extra row for ME (left and up), hence
+// the "+ 1" in both factors below. The extra row also happens
+// to satisfy the AVX2 row-count requirement, so no other extra rows are needed.
+#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1))

 typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; // NOTE(review): epol_func now takes kvz_epol_args instead of this type; confirm whether it is still used anywhere

 typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height,
-  kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], 
+  kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t sample_off_x, int8_t sample_off_y);

-typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filter_size, int width, int height, kvz_extended_block *out);
+typedef struct { // Argument bundle for epol_func (see kvz_get_extended_block)
+  // Source samples
+  kvz_pixel *src; // Top-left sample of the source
+  int src_w; // Width in samples
+  int src_h; // Height in samples
+  int src_s; // Stride: samples between the starts of consecutive rows
+
+  // Requested sampling position, base dimensions, and padding
+  int blk_x; // Left column of the unpadded block, relative to src
+  int blk_y; // Top row of the unpadded block, relative to src
+  int blk_w; // Width
+  int blk_h; // Height
+  int pad_l; // Left
+  int pad_r; // Right
+  int pad_t; // Top
+  int pad_b; // Bottom
+  int pad_b_simd; // Extra rows at the end whose contents do not matter. Zero-filled when the block is built in buf.
+
+  // Scratch buffer for possible extrapolation. The memory is provided by the caller.
+  kvz_pixel *buf;
+
+  // Extended block data. These are set by the function.
+  kvz_pixel **ext; // Top-left sample with padding; may point straight into src
+  kvz_pixel **ext_origin; // Top-left sample without padding
+  int *ext_s; // Stride of the extended block (src_s when it points into src)
+} kvz_epol_args;
+
+typedef void(epol_func)(kvz_epol_args *args);
+

 typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);

-typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
-typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);

 // Declare function pointers.
 extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma;
@ -55,8 +97,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
 extern epol_func * kvz_get_extended_block;
 extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
 extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
-extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
-extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
+extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
+extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;


 int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
@ -69,8 +111,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
  {"filter_qpel_blocks_diag_luma",    (void**) &kvz_filter_qpel_blocks_diag_luma}, \
  {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \
  {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \
-  {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
-  {"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \
+  {"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \
+  {"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \
  {"get_extended_block", (void**) &kvz_get_extended_block}, \


--- a/tests/tsan_suppressions.txt
+++ b/tests/tsan_suppressions.txt
@ -1,3 +1,4 @@
-race:kvz_eight_tap_filter_hor_8x1_avx2
+# AVX2 interpolation reads some extra pixels
+race:kvz_ipol_8tap_hor_px_im_avx2
 race:kvz_filter_hpel_blocks_hor_ver_luma_avx2
-race:kvz_eight_tap_filter_hor_avx2
+race:kvz_eight_tap_filter_hor_avx2