From d82582c37cff3856d27faafec6e0b8d624ebd12f Mon Sep 17 00:00:00 2001 From: Ari Lemmetti Date: Fri, 14 Aug 2015 18:39:39 +0300 Subject: [PATCH] Changes to extend border function. Now outputs a pointer to a block with guaranteed padding for filtering. A padded copy is only allocated if samples are needed out of bounds (rows are copied with memcpy when only the vertical range is out of bounds); otherwise the block points directly into the reference picture. Callers must free the buffer when malloc_used is set. --- src/inter.c | 48 +++++++++++--------- src/search_inter.c | 10 ++--- src/strategies/avx2/ipol-avx2.c | 59 +++++++++++++++--------- src/strategies/generic/ipol-generic.c | 64 +++++++++++++++++---------- src/strategies/strategies-ipol.h | 3 +- 5 files changed, 111 insertions(+), 73 deletions(-) diff --git a/src/inter.c b/src/inter.c index 2a57faa1..862c0071 100644 --- a/src/inter.c +++ b/src/inter.c @@ -72,14 +72,15 @@ void inter_recon_frac_luma(const encoder_state_t * const state, const kvz_pictur #define FILTER_SIZE_Y 8 //Luma filter size // Fractional luma 1/4-pel - kvz_pixel qpel_src_y[(LCU_WIDTH + FILTER_SIZE_Y) * (LCU_WIDTH + FILTER_SIZE_Y)]; - kvz_pixel* qpel_src_off_y = &qpel_src_y[(block_width + FILTER_SIZE_Y)*(FILTER_SIZE_Y >> 1) + (FILTER_SIZE_Y >> 1)]; + extended_block src = {0, 0, 0}; // Fractional luma extend_borders(xpos, ypos, mv_param[0] >> 2, mv_param[1] >> 2, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH, - ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, qpel_src_y); - sample_quarterpel_luma_generic(state->encoder_control, qpel_src_off_y, block_width + FILTER_SIZE_Y, block_width, + ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, &src); + sample_quarterpel_luma_generic(state->encoder_control, src.orig_topleft, src.stride, block_width, block_width, lcu->rec.y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, mv_param); + + if (src.malloc_used) free(src.buffer); } void inter_recon_14bit_frac_luma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t 
block_width, const int16_t mv_param[2], hi_prec_buf_t *hi_prec_out) @@ -90,14 +91,15 @@ void inter_recon_14bit_frac_luma(const encoder_state_t * const state, const kvz_ #define FILTER_SIZE_Y 8 //Luma filter size // Fractional luma 1/4-pel - kvz_pixel qpel_src_y[(LCU_WIDTH + FILTER_SIZE_Y) * (LCU_WIDTH + FILTER_SIZE_Y)]; - kvz_pixel* qpel_src_off_y = &qpel_src_y[(block_width + FILTER_SIZE_Y)*(FILTER_SIZE_Y >> 1) + (FILTER_SIZE_Y >> 1)]; + extended_block src = {0, 0, 0}; // Fractional luma extend_borders(xpos, ypos, mv_param[0] >> 2, mv_param[1] >> 2, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH, - ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, qpel_src_y); - sample_14bit_quarterpel_luma_generic(state->encoder_control, qpel_src_off_y, block_width + FILTER_SIZE_Y, block_width, + ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, &src); + sample_14bit_quarterpel_luma_generic(state->encoder_control, src.orig_topleft, src.stride, block_width, block_width, hi_prec_out->y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, mv_param); + + if (src.malloc_used) free(src.buffer); } void inter_recon_frac_chroma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], lcu_t *lcu) @@ -113,20 +115,23 @@ void inter_recon_frac_chroma(const encoder_state_t * const state, const kvz_pict #define FILTER_SIZE_C 4 //Chroma filter size // Fractional chroma 1/8-pel - kvz_pixel octpel_src[((LCU_WIDTH_C) + FILTER_SIZE_C) * ((LCU_WIDTH_C) + FILTER_SIZE_C)]; - kvz_pixel* octpel_src_off = &octpel_src[(block_width + FILTER_SIZE_C)*(FILTER_SIZE_C >> 1) + (FILTER_SIZE_C >> 1)]; + extended_block src_u = { 0, 0, 0 }; + extended_block src_v = { 0, 0, 0 }; //Fractional chroma U extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, 
state->tile->lcu_offset_y * LCU_WIDTH_C, - ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src); - sample_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width, + ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_u); + sample_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width, block_width, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); //Fractional chroma V extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C, - ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src); - sample_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width, + ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_v); + sample_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, block_width, lcu->rec.v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + + if (src_u.malloc_used) free(src_u.buffer); + if (src_v.malloc_used) free(src_v.buffer); } void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], hi_prec_buf_t *hi_prec_out) @@ -142,20 +147,23 @@ void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, const kv #define FILTER_SIZE_C 4 //Chroma filter size // Fractional chroma 1/8-pel - kvz_pixel octpel_src[((LCU_WIDTH_C)+FILTER_SIZE_C) * ((LCU_WIDTH_C)+FILTER_SIZE_C)]; - kvz_pixel* octpel_src_off = &octpel_src[(block_width + FILTER_SIZE_C)*(FILTER_SIZE_C >> 1) + (FILTER_SIZE_C >> 1)]; + extended_block 
src_u = {0, 0, 0}; + extended_block src_v = { 0, 0, 0 }; //Fractional chroma U extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C, - ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src); - sample_14bit_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width, + ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_u); + sample_14bit_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width, block_width, hi_prec_out->u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); //Fractional chroma V extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C, - ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src); - sample_14bit_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width, + ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_v); + sample_14bit_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, block_width, hi_prec_out->v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + + if (src_u.malloc_used) free(src_u.buffer); + if (src_v.malloc_used) free(src_v.buffer); } /** diff --git a/src/search_inter.c b/src/search_inter.c index 933760d8..2c6f5812 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -815,10 +815,7 @@ static unsigned search_frac(const encoder_state_t * const state, #define FILTER_SIZE 8 #define HALF_FILTER (FILTER_SIZE>>1) - //create buffer for block + extra for filter - int src_stride = block_width+FILTER_SIZE+1; - 
kvz_pixel src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)]; - kvz_pixel* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)]; + extended_block src = {0, 0, 0}; //destination buffer for interpolation int dst_stride = (block_width+1)*4; @@ -828,11 +825,12 @@ static unsigned search_frac(const encoder_state_t * const state, extend_borders(orig->x, orig->y, mv.x-1, mv.y-1, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH, - ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src); + ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, &src); - filter_inter_quarterpel_luma(state->encoder_control, src_off, src_stride, block_width+1, + filter_inter_quarterpel_luma(state->encoder_control, src.orig_topleft, src.stride, block_width+1, block_width+1, dst, dst_stride, 1, 1); + if (src.malloc_used) free(src.buffer); //Set mv to half-pixel precision mv.x <<= 1; diff --git a/src/strategies/avx2/ipol-avx2.c b/src/strategies/avx2/ipol-avx2.c index 3fbde5d5..40798206 100644 --- a/src/strategies/avx2/ipol-avx2.c +++ b/src/strategies/avx2/ipol-avx2.c @@ -482,37 +482,52 @@ void filter_inter_octpel_chroma_avx2(const encoder_control_t * const encoder, kv } void extend_borders_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filterSize, int width, int height, kvz_pixel *dst) { + int filterSize, int width, int height, extended_block *out) { - int16_t mv[2] = { mv_x, mv_y }; int halfFilterSize = filterSize >> 1; - int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - int8_t overflow_neg_y_temp, overflow_pos_y_temp, overflow_neg_x_temp, overflow_pos_x_temp; + out->buffer = ref + (ypos - halfFilterSize + off_y + mv_y) * ref_width + (xpos - halfFilterSize + off_x + mv_x); + out->stride = ref_width; + out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize; + out->malloc_used = 0; - for (dst_y 
= 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) { + int min_y = ypos - halfFilterSize + off_y + mv_y; + int max_y = min_y + height + filterSize; + int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height); - // calculate y-pixel offset - coord_y = y + off_y + mv[1]; + int min_x = xpos - halfFilterSize + off_x + mv_x; + int max_x = min_x + width + filterSize; + int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width); - // On y-overflow set coord_y accordingly - overflow_neg_y_temp = (coord_y < 0) ? 1 : 0; - overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0; - if (overflow_neg_y_temp) coord_y = 0; - else if (overflow_pos_y_temp) coord_y = (ref_height)-1; - coord_y *= ref_width; + int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x; - for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) { - coord_x = x + off_x + mv[0]; + if (sample_out_of_bounds){ + out->buffer = MALLOC(kvz_pixel, (width + filterSize) * (height + filterSize)); + out->stride = width + filterSize; + out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize; + out->malloc_used = 1; - // On x-overflow set coord_x accordingly - overflow_neg_x_temp = (coord_x < 0) ? 1 : 0; - overflow_pos_x_temp = (coord_x >= ref_width) ? 
1 : 0; - if (overflow_neg_x_temp) coord_x = 0; - else if (overflow_pos_x_temp) coord_x = ref_width - 1; + int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - // Store source block data (with extended borders) - dst[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x]; + for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) { + + // calculate y-pixel offset + coord_y = y + off_y + mv_y; + coord_y = CLIP(0, (ref_height)-1, coord_y); + coord_y *= ref_width; + + if (!out_of_bounds_x){ + memcpy(&out->buffer[dst_y*(width + filterSize) + 0], &ref[coord_y + min_x], (width + filterSize) * sizeof(kvz_pixel)); + } else { + for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) { + + coord_x = x + off_x + mv_x; + coord_x = CLIP(0, (ref_width)-1, coord_x); + + // Store source block data (with extended borders) + out->buffer[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x]; + } + } } } } diff --git a/src/strategies/generic/ipol-generic.c b/src/strategies/generic/ipol-generic.c index 7b29e444..166b2fb2 100644 --- a/src/strategies/generic/ipol-generic.c +++ b/src/strategies/generic/ipol-generic.c @@ -481,40 +481,56 @@ void sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, } } -void extend_borders_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filterSize, int width, int height, kvz_pixel *dst) { - int16_t mv[2] = { mv_x, mv_y }; +void extend_borders_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, + int filterSize, int width, int height, extended_block *out) { + int halfFilterSize = filterSize >> 1; - int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - int8_t overflow_neg_y_temp, overflow_pos_y_temp, overflow_neg_x_temp, overflow_pos_x_temp; + out->buffer = ref + (ypos - halfFilterSize + 
off_y + mv_y) * ref_width + (xpos - halfFilterSize + off_x + mv_x); + out->stride = ref_width; + out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize; + out->malloc_used = 0; - for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) { + int min_y = ypos - halfFilterSize + off_y + mv_y; + int max_y = min_y + height + filterSize; + int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height); - // calculate y-pixel offset - coord_y = y + off_y + mv[1]; + int min_x = xpos - halfFilterSize + off_x + mv_x; + int max_x = min_x + width + filterSize; + int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width); - // On y-overflow set coord_y accordingly - overflow_neg_y_temp = (coord_y < 0) ? 1 : 0; - overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0; - if (overflow_neg_y_temp) coord_y = 0; - else if (overflow_pos_y_temp) coord_y = (ref_height)-1; - coord_y *= ref_width; + int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x; - for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) { - coord_x = x + off_x + mv[0]; + if (sample_out_of_bounds){ + out->buffer = MALLOC(kvz_pixel, (width + filterSize) * (height + filterSize)); + out->stride = width + filterSize; + out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize; + out->malloc_used = 1; - // On x-overflow set coord_x accordingly - overflow_neg_x_temp = (coord_x < 0) ? 1 : 0; - overflow_pos_x_temp = (coord_x >= ref_width) ? 
1 : 0; - if (overflow_neg_x_temp) coord_x = 0; - else if (overflow_pos_x_temp) coord_x = ref_width - 1; + int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; - // Store source block data (with extended borders) - dst[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x]; + for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) { + + // calculate y-pixel offset + coord_y = y + off_y + mv_y; + coord_y = CLIP(0, (ref_height)-1, coord_y); + coord_y *= ref_width; + + if (!out_of_bounds_x){ + memcpy(&out->buffer[dst_y*(width + filterSize) + 0], &ref[coord_y + min_x], (width + filterSize) * sizeof(kvz_pixel)); + } else { + for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) { + + coord_x = x + off_x + mv_x; + coord_x = CLIP(0, (ref_width)-1, coord_x); + + // Store source block data (with extended borders) + out->buffer[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x]; + } + } } - } + } } diff --git a/src/strategies/strategies-ipol.h b/src/strategies/strategies-ipol.h index ac6e2c06..1fe3514b 100644 --- a/src/strategies/strategies-ipol.h +++ b/src/strategies/strategies-ipol.h @@ -24,12 +24,13 @@ #include "encoder.h" +typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } extended_block; typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, - int filterSize, int width, int height, kvz_pixel *dst); + int filterSize, int width, int height, extended_block *out); // Declare function pointers.