Changes to extend border function.

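extend_borders now fills in an extended_block that points to a block with guaranteed padding for filtering. When every sample the filter needs lies inside the reference picture, the block points directly into the reference frame and nothing is copied. Extra pixels are generated only when samples are needed out of bounds: a temporary buffer is then allocated (the caller frees it), rows that are horizontally in bounds are copied with memcpy, and the remaining rows are filled pixel by pixel with coordinates clamped to the picture edges.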
2024-11-27 19:24:06 +00:00 · 2015-08-14 18:39:39 +03:00 · 2015-08-14 18:39:39 +03:00 · d82582c37c
parent 4dcc0d876d
commit d82582c37c
5 changed files with 111 additions and 73 deletions
--- a/src/inter.c
+++ b/src/inter.c
@ -72,14 +72,15 @@ void inter_recon_frac_luma(const encoder_state_t * const state, const kvz_pictur
 #define FILTER_SIZE_Y 8 //Luma filter size

  // Fractional luma 1/4-pel
-  kvz_pixel qpel_src_y[(LCU_WIDTH + FILTER_SIZE_Y) * (LCU_WIDTH + FILTER_SIZE_Y)];
-  kvz_pixel* qpel_src_off_y = &qpel_src_y[(block_width + FILTER_SIZE_Y)*(FILTER_SIZE_Y >> 1) + (FILTER_SIZE_Y >> 1)];
+  extended_block src = {0, 0, 0};

  // Fractional luma
  extend_borders(xpos, ypos, mv_param[0] >> 2, mv_param[1] >> 2, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH,
-    ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, qpel_src_y);
-  sample_quarterpel_luma_generic(state->encoder_control, qpel_src_off_y, block_width + FILTER_SIZE_Y, block_width,
+    ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, &src);
+  sample_quarterpel_luma_generic(state->encoder_control, src.orig_topleft, src.stride, block_width,
    block_width, lcu->rec.y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, mv_param);
+
+  if (src.malloc_used) free(src.buffer);
 }

 void inter_recon_14bit_frac_luma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], hi_prec_buf_t *hi_prec_out)
@ -90,14 +91,15 @@ void inter_recon_14bit_frac_luma(const encoder_state_t * const state, const kvz_
 #define FILTER_SIZE_Y 8 //Luma filter size

  // Fractional luma 1/4-pel
-  kvz_pixel qpel_src_y[(LCU_WIDTH + FILTER_SIZE_Y) * (LCU_WIDTH + FILTER_SIZE_Y)];
-  kvz_pixel* qpel_src_off_y = &qpel_src_y[(block_width + FILTER_SIZE_Y)*(FILTER_SIZE_Y >> 1) + (FILTER_SIZE_Y >> 1)];
+  extended_block src = {0, 0, 0};

  // Fractional luma
  extend_borders(xpos, ypos, mv_param[0] >> 2, mv_param[1] >> 2, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH,
-    ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, qpel_src_y);
-  sample_14bit_quarterpel_luma_generic(state->encoder_control, qpel_src_off_y, block_width + FILTER_SIZE_Y, block_width,
+    ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, &src);
+  sample_14bit_quarterpel_luma_generic(state->encoder_control, src.orig_topleft, src.stride, block_width,
    block_width, hi_prec_out->y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, mv_param);
+
+  if (src.malloc_used) free(src.buffer);
 }

 void inter_recon_frac_chroma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], lcu_t *lcu)
@ -113,20 +115,23 @@ void inter_recon_frac_chroma(const encoder_state_t * const state, const kvz_pict
 #define FILTER_SIZE_C 4 //Chroma filter size

  // Fractional chroma 1/8-pel
-  kvz_pixel octpel_src[((LCU_WIDTH_C) + FILTER_SIZE_C) * ((LCU_WIDTH_C) + FILTER_SIZE_C)];
-  kvz_pixel* octpel_src_off = &octpel_src[(block_width + FILTER_SIZE_C)*(FILTER_SIZE_C >> 1) + (FILTER_SIZE_C >> 1)];
+  extended_block src_u = { 0, 0, 0 };
+  extended_block src_v = { 0, 0, 0 };

  //Fractional chroma U
  extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
-    ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
-  sample_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
+    ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_u);
+  sample_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
    block_width, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);

  //Fractional chroma V
  extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
-    ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
-  sample_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
+    ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_v);
+  sample_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
    block_width, lcu->rec.v + (ypos  % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
+
+  if (src_u.malloc_used) free(src_u.buffer);
+  if (src_v.malloc_used) free(src_v.buffer);
 }

 void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], hi_prec_buf_t *hi_prec_out)
@ -142,20 +147,23 @@ void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, const kv
 #define FILTER_SIZE_C 4 //Chroma filter size

  // Fractional chroma 1/8-pel
-  kvz_pixel octpel_src[((LCU_WIDTH_C)+FILTER_SIZE_C) * ((LCU_WIDTH_C)+FILTER_SIZE_C)];
-  kvz_pixel* octpel_src_off = &octpel_src[(block_width + FILTER_SIZE_C)*(FILTER_SIZE_C >> 1) + (FILTER_SIZE_C >> 1)];
+  extended_block src_u = {0, 0, 0};
+  extended_block src_v = { 0, 0, 0 };

  //Fractional chroma U
  extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
-    ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
-  sample_14bit_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
+    ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_u);
+  sample_14bit_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
    block_width, hi_prec_out->u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);

  //Fractional chroma V
  extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
-    ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
-  sample_14bit_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
+    ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_v);
+  sample_14bit_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
    block_width, hi_prec_out->v + (ypos  % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
+
+  if (src_u.malloc_used) free(src_u.buffer);
+  if (src_v.malloc_used) free(src_v.buffer);
 }

 /**
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -815,10 +815,7 @@ static unsigned search_frac(const encoder_state_t * const state,
  #define FILTER_SIZE 8
  #define HALF_FILTER (FILTER_SIZE>>1)

-  //create buffer for block + extra for filter
-  int src_stride = block_width+FILTER_SIZE+1;
-  kvz_pixel src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
-  kvz_pixel* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];
+  extended_block src = {0, 0, 0};

  //destination buffer for interpolation
  int dst_stride = (block_width+1)*4;
@ -828,11 +825,12 @@ static unsigned search_frac(const encoder_state_t * const state,
  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
                state->tile->lcu_offset_x * LCU_WIDTH,
                state->tile->lcu_offset_y * LCU_WIDTH,
-                ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);
+                ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, &src);

-  filter_inter_quarterpel_luma(state->encoder_control, src_off, src_stride, block_width+1,
+  filter_inter_quarterpel_luma(state->encoder_control, src.orig_topleft, src.stride, block_width+1,
      block_width+1, dst, dst_stride, 1, 1);

+  if (src.malloc_used) free(src.buffer);

  //Set mv to half-pixel precision
  mv.x <<= 1;
--- a/src/strategies/avx2/ipol-avx2.c
+++ b/src/strategies/avx2/ipol-avx2.c
@ -482,37 +482,52 @@ void filter_inter_octpel_chroma_avx2(const encoder_control_t * const encoder, kv
 }

 void extend_borders_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filterSize, int width, int height, kvz_pixel *dst) {
+  int filterSize, int width, int height, extended_block *out) {

-  int16_t mv[2] = { mv_x, mv_y };
  int halfFilterSize = filterSize >> 1;

-  int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
-  int8_t overflow_neg_y_temp, overflow_pos_y_temp, overflow_neg_x_temp, overflow_pos_x_temp;
+  out->buffer = ref + (ypos - halfFilterSize + off_y + mv_y) * ref_width + (xpos - halfFilterSize + off_x + mv_x);
+  out->stride = ref_width;
+  out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
+  out->malloc_used = 0;

-  for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
+  int min_y = ypos - halfFilterSize + off_y + mv_y;
+  int max_y = min_y + height + filterSize;
+  int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);

-    // calculate y-pixel offset
-    coord_y = y + off_y + mv[1];
+  int min_x = xpos - halfFilterSize + off_x + mv_x;
+  int max_x = min_x + width + filterSize;
+  int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);

-    // On y-overflow set coord_y accordingly
-    overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
-    overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0;
-    if (overflow_neg_y_temp)      coord_y = 0;
-    else if (overflow_pos_y_temp) coord_y = (ref_height)-1;
-    coord_y *= ref_width;
+  int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;

-    for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
-      coord_x = x + off_x + mv[0];
+  if (sample_out_of_bounds){
+    out->buffer = MALLOC(kvz_pixel, (width + filterSize) * (height + filterSize));
+    out->stride = width + filterSize;
+    out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
+    out->malloc_used = 1;

-      // On x-overflow set coord_x accordingly
-      overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
-      overflow_pos_x_temp = (coord_x >= ref_width) ? 1 : 0;
-      if (overflow_neg_x_temp)      coord_x = 0;
-      else if (overflow_pos_x_temp) coord_x = ref_width - 1;
+    int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;

-      // Store source block data (with extended borders)
-      dst[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
+    for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
+
+      // calculate y-pixel offset
+      coord_y = y + off_y + mv_y;
+      coord_y = CLIP(0, (ref_height)-1, coord_y);
+      coord_y *= ref_width;
+
+      if (!out_of_bounds_x){
+        memcpy(&out->buffer[dst_y*(width + filterSize) + 0], &ref[coord_y + min_x], (width + filterSize) * sizeof(kvz_pixel));
+      } else {
+        for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
+
+          coord_x = x + off_x + mv_x;
+          coord_x = CLIP(0, (ref_width)-1, coord_x);
+
+          // Store source block data (with extended borders)
+          out->buffer[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
+        }
+      }
    }
  }
 }
--- a/src/strategies/generic/ipol-generic.c
+++ b/src/strategies/generic/ipol-generic.c
@ -481,40 +481,56 @@ void sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder,
  }
 }

-void extend_borders_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filterSize, int width, int height, kvz_pixel *dst) {

-  int16_t mv[2] = { mv_x, mv_y };
+void extend_borders_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
+  int filterSize, int width, int height, extended_block *out) {
+
  int halfFilterSize = filterSize >> 1;

-  int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
-  int8_t overflow_neg_y_temp, overflow_pos_y_temp, overflow_neg_x_temp, overflow_pos_x_temp;
+  out->buffer = ref + (ypos - halfFilterSize + off_y + mv_y) * ref_width + (xpos - halfFilterSize + off_x + mv_x);
+  out->stride = ref_width;
+  out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
+  out->malloc_used = 0;

-  for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
+  int min_y = ypos - halfFilterSize + off_y + mv_y;
+  int max_y = min_y + height + filterSize;
+  int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);

-    // calculate y-pixel offset
-    coord_y = y + off_y + mv[1];
+  int min_x = xpos - halfFilterSize + off_x + mv_x;
+  int max_x = min_x + width + filterSize;
+  int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);

-    // On y-overflow set coord_y accordingly
-    overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
-    overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0;
-    if (overflow_neg_y_temp)      coord_y = 0;
-    else if (overflow_pos_y_temp) coord_y = (ref_height)-1;
-    coord_y *= ref_width;
+  int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;

-    for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
-      coord_x = x + off_x + mv[0];
+  if (sample_out_of_bounds){
+    out->buffer = MALLOC(kvz_pixel, (width + filterSize) * (height + filterSize));
+    out->stride = width + filterSize;
+    out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
+    out->malloc_used = 1;

-      // On x-overflow set coord_x accordingly
-      overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
-      overflow_pos_x_temp = (coord_x >= ref_width) ? 1 : 0;
-      if (overflow_neg_x_temp)      coord_x = 0;
-      else if (overflow_pos_x_temp) coord_x = ref_width - 1;
+    int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;

-      // Store source block data (with extended borders)
-      dst[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
+    for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
+
+      // calculate y-pixel offset
+      coord_y = y + off_y + mv_y;
+      coord_y = CLIP(0, (ref_height)-1, coord_y);
+      coord_y *= ref_width;
+
+      if (!out_of_bounds_x){
+        memcpy(&out->buffer[dst_y*(width + filterSize) + 0], &ref[coord_y + min_x], (width + filterSize) * sizeof(kvz_pixel));
+      } else {
+        for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
+
+          coord_x = x + off_x + mv_x;
+          coord_x = CLIP(0, (ref_width)-1, coord_x);
+
+          // Store source block data (with extended borders)
+          out->buffer[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
+        }
+      }
    }
-  }
+  } 
 }


--- a/src/strategies/strategies-ipol.h
+++ b/src/strategies/strategies-ipol.h
@ -24,12 +24,13 @@

 #include "encoder.h"

+typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } extended_block; // Output of extend_borders: buffer = start of the padded block, orig_topleft = buffer + stride*halfFilter + halfFilter (first unpadded sample), stride = row pitch in pixels; callers free(buffer) only when malloc_used is nonzero.

 typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst,
  int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);

 typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
-  int filterSize, int width, int height, kvz_pixel *dst);
+  int filterSize, int width, int height, extended_block *out);


 // Declare function pointers.