Change the extend_borders function.

It now outputs a pointer to a block with guaranteed padding for filtering.
Extra pixels are generated only if samples are needed from out of bounds;
rows that need no horizontal padding are copied with memcpy.
This commit is contained in:
Ari Lemmetti 2015-08-14 18:39:39 +03:00
parent 4dcc0d876d
commit d82582c37c
5 changed files with 111 additions and 73 deletions

View file

@ -72,14 +72,15 @@ void inter_recon_frac_luma(const encoder_state_t * const state, const kvz_pictur
#define FILTER_SIZE_Y 8 //Luma filter size
// Fractional luma 1/4-pel
kvz_pixel qpel_src_y[(LCU_WIDTH + FILTER_SIZE_Y) * (LCU_WIDTH + FILTER_SIZE_Y)];
kvz_pixel* qpel_src_off_y = &qpel_src_y[(block_width + FILTER_SIZE_Y)*(FILTER_SIZE_Y >> 1) + (FILTER_SIZE_Y >> 1)];
extended_block src = {0, 0, 0};
// Fractional luma
extend_borders(xpos, ypos, mv_param[0] >> 2, mv_param[1] >> 2, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH,
ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, qpel_src_y);
sample_quarterpel_luma_generic(state->encoder_control, qpel_src_off_y, block_width + FILTER_SIZE_Y, block_width,
ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, &src);
sample_quarterpel_luma_generic(state->encoder_control, src.orig_topleft, src.stride, block_width,
block_width, lcu->rec.y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, mv_param);
if (src.malloc_used) free(src.buffer);
}
void inter_recon_14bit_frac_luma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], hi_prec_buf_t *hi_prec_out)
@ -90,14 +91,15 @@ void inter_recon_14bit_frac_luma(const encoder_state_t * const state, const kvz_
#define FILTER_SIZE_Y 8 //Luma filter size
// Fractional luma 1/4-pel
kvz_pixel qpel_src_y[(LCU_WIDTH + FILTER_SIZE_Y) * (LCU_WIDTH + FILTER_SIZE_Y)];
kvz_pixel* qpel_src_off_y = &qpel_src_y[(block_width + FILTER_SIZE_Y)*(FILTER_SIZE_Y >> 1) + (FILTER_SIZE_Y >> 1)];
extended_block src = {0, 0, 0};
// Fractional luma
extend_borders(xpos, ypos, mv_param[0] >> 2, mv_param[1] >> 2, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->lcu_offset_y * LCU_WIDTH,
ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, qpel_src_y);
sample_14bit_quarterpel_luma_generic(state->encoder_control, qpel_src_off_y, block_width + FILTER_SIZE_Y, block_width,
ref->y, ref->width, ref->height, FILTER_SIZE_Y, block_width, block_width, &src);
sample_14bit_quarterpel_luma_generic(state->encoder_control, src.orig_topleft, src.stride, block_width,
block_width, hi_prec_out->y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, mv_param);
if (src.malloc_used) free(src.buffer);
}
void inter_recon_frac_chroma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], lcu_t *lcu)
@ -113,20 +115,23 @@ void inter_recon_frac_chroma(const encoder_state_t * const state, const kvz_pict
#define FILTER_SIZE_C 4 //Chroma filter size
// Fractional chroma 1/8-pel
kvz_pixel octpel_src[((LCU_WIDTH_C) + FILTER_SIZE_C) * ((LCU_WIDTH_C) + FILTER_SIZE_C)];
kvz_pixel* octpel_src_off = &octpel_src[(block_width + FILTER_SIZE_C)*(FILTER_SIZE_C >> 1) + (FILTER_SIZE_C >> 1)];
extended_block src_u = { 0, 0, 0 };
extended_block src_v = { 0, 0, 0 };
//Fractional chroma U
extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
sample_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_u);
sample_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
block_width, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
//Fractional chroma V
extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
sample_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_v);
sample_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_u.stride, block_width,
block_width, lcu->rec.v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
if (src_u.malloc_used) free(src_u.buffer);
if (src_v.malloc_used) free(src_v.buffer);
}
void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, const kvz_picture * const ref, int32_t xpos, int32_t ypos, int32_t block_width, const int16_t mv_param[2], hi_prec_buf_t *hi_prec_out)
@ -142,20 +147,23 @@ void inter_recon_14bit_frac_chroma(const encoder_state_t * const state, const kv
#define FILTER_SIZE_C 4 //Chroma filter size
// Fractional chroma 1/8-pel
kvz_pixel octpel_src[((LCU_WIDTH_C)+FILTER_SIZE_C) * ((LCU_WIDTH_C)+FILTER_SIZE_C)];
kvz_pixel* octpel_src_off = &octpel_src[(block_width + FILTER_SIZE_C)*(FILTER_SIZE_C >> 1) + (FILTER_SIZE_C >> 1)];
extended_block src_u = {0, 0, 0};
extended_block src_v = { 0, 0, 0 };
//Fractional chroma U
extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
sample_14bit_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_u);
sample_14bit_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
block_width, hi_prec_out->u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
//Fractional chroma V
extend_borders(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, octpel_src);
sample_14bit_octpel_chroma_generic(state->encoder_control, octpel_src_off, block_width + FILTER_SIZE_C, block_width,
ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_width, &src_v);
sample_14bit_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
block_width, hi_prec_out->v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
if (src_u.malloc_used) free(src_u.buffer);
if (src_v.malloc_used) free(src_v.buffer);
}
/**

View file

@ -815,10 +815,7 @@ static unsigned search_frac(const encoder_state_t * const state,
#define FILTER_SIZE 8
#define HALF_FILTER (FILTER_SIZE>>1)
//create buffer for block + extra for filter
int src_stride = block_width+FILTER_SIZE+1;
kvz_pixel src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
kvz_pixel* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];
extended_block src = {0, 0, 0};
//destination buffer for interpolation
int dst_stride = (block_width+1)*4;
@ -828,11 +825,12 @@ static unsigned search_frac(const encoder_state_t * const state,
extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
state->tile->lcu_offset_x * LCU_WIDTH,
state->tile->lcu_offset_y * LCU_WIDTH,
ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);
ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, &src);
filter_inter_quarterpel_luma(state->encoder_control, src_off, src_stride, block_width+1,
filter_inter_quarterpel_luma(state->encoder_control, src.orig_topleft, src.stride, block_width+1,
block_width+1, dst, dst_stride, 1, 1);
if (src.malloc_used) free(src.buffer);
//Set mv to half-pixel precision
mv.x <<= 1;

View file

@ -482,37 +482,52 @@ void filter_inter_octpel_chroma_avx2(const encoder_control_t * const encoder, kv
}
void extend_borders_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filterSize, int width, int height, kvz_pixel *dst) {
int filterSize, int width, int height, extended_block *out) {
int16_t mv[2] = { mv_x, mv_y };
int halfFilterSize = filterSize >> 1;
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
int8_t overflow_neg_y_temp, overflow_pos_y_temp, overflow_neg_x_temp, overflow_pos_x_temp;
out->buffer = ref + (ypos - halfFilterSize + off_y + mv_y) * ref_width + (xpos - halfFilterSize + off_x + mv_x);
out->stride = ref_width;
out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
out->malloc_used = 0;
for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
int min_y = ypos - halfFilterSize + off_y + mv_y;
int max_y = min_y + height + filterSize;
int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
// calculate y-pixel offset
coord_y = y + off_y + mv[1];
int min_x = xpos - halfFilterSize + off_x + mv_x;
int max_x = min_x + width + filterSize;
int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
// On y-overflow set coord_y accordingly
overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0;
if (overflow_neg_y_temp) coord_y = 0;
else if (overflow_pos_y_temp) coord_y = (ref_height)-1;
coord_y *= ref_width;
int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
coord_x = x + off_x + mv[0];
if (sample_out_of_bounds){
out->buffer = MALLOC(kvz_pixel, (width + filterSize) * (width + filterSize));
out->stride = width + filterSize;
out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
out->malloc_used = 1;
// On x-overflow set coord_x accordingly
overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
overflow_pos_x_temp = (coord_x >= ref_width) ? 1 : 0;
if (overflow_neg_x_temp) coord_x = 0;
else if (overflow_pos_x_temp) coord_x = ref_width - 1;
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
// Store source block data (with extended borders)
dst[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
// calculate y-pixel offset
coord_y = y + off_y + mv_y;
coord_y = CLIP(0, (ref_height)-1, coord_y);
coord_y *= ref_width;
if (!out_of_bounds_x){
memcpy(&out->buffer[dst_y*(width + filterSize) + 0], &ref[coord_y + min_x], (width + filterSize) * sizeof(kvz_pixel));
} else {
for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
coord_x = x + off_x + mv_x;
coord_x = CLIP(0, (ref_width)-1, coord_x);
// Store source block data (with extended borders)
out->buffer[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
}
}
}
}
}

View file

@ -481,40 +481,56 @@ void sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder,
}
}
void extend_borders_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filterSize, int width, int height, kvz_pixel *dst) {
int16_t mv[2] = { mv_x, mv_y };
void extend_borders_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filterSize, int width, int height, extended_block *out) {
int halfFilterSize = filterSize >> 1;
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
int8_t overflow_neg_y_temp, overflow_pos_y_temp, overflow_neg_x_temp, overflow_pos_x_temp;
out->buffer = ref + (ypos - halfFilterSize + off_y + mv_y) * ref_width + (xpos - halfFilterSize + off_x + mv_x);
out->stride = ref_width;
out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
out->malloc_used = 0;
for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
int min_y = ypos - halfFilterSize + off_y + mv_y;
int max_y = min_y + height + filterSize;
int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
// calculate y-pixel offset
coord_y = y + off_y + mv[1];
int min_x = xpos - halfFilterSize + off_x + mv_x;
int max_x = min_x + width + filterSize;
int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
// On y-overflow set coord_y accordingly
overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0;
if (overflow_neg_y_temp) coord_y = 0;
else if (overflow_pos_y_temp) coord_y = (ref_height)-1;
coord_y *= ref_width;
int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
coord_x = x + off_x + mv[0];
if (sample_out_of_bounds){
out->buffer = MALLOC(kvz_pixel, (width + filterSize) * (width + filterSize));
out->stride = width + filterSize;
out->orig_topleft = out->buffer + out->stride * halfFilterSize + halfFilterSize;
out->malloc_used = 1;
// On x-overflow set coord_x accordingly
overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
overflow_pos_x_temp = (coord_x >= ref_width) ? 1 : 0;
if (overflow_neg_x_temp) coord_x = 0;
else if (overflow_pos_x_temp) coord_x = ref_width - 1;
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
// Store source block data (with extended borders)
dst[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
// calculate y-pixel offset
coord_y = y + off_y + mv_y;
coord_y = CLIP(0, (ref_height)-1, coord_y);
coord_y *= ref_width;
if (!out_of_bounds_x){
memcpy(&out->buffer[dst_y*(width + filterSize) + 0], &ref[coord_y + min_x], (width + filterSize) * sizeof(kvz_pixel));
} else {
for (dst_x = 0, x = (xpos)-halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
coord_x = x + off_x + mv_x;
coord_x = CLIP(0, (ref_width)-1, coord_x);
// Store source block data (with extended borders)
out->buffer[dst_y*(width + filterSize) + dst_x] = ref[coord_y + coord_x];
}
}
}
}
}
}

View file

@ -24,12 +24,13 @@
#include "encoder.h"
/**
 * \brief Block of reference pixels filled in by the extend_borders functions.
 *
 * buffer        Start of the block including the filter padding. Points either
 *               directly into the reference picture or to memory obtained
 *               with MALLOC.
 * orig_topleft  buffer advanced by half the filter size both vertically
 *               (in rows of stride pixels) and horizontally.
 * stride        Row pitch of buffer, in pixels.
 * malloc_used   Non-zero when buffer was allocated with MALLOC; callers then
 *               free(buffer) once they are done with the block.
 */
typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } extended_block;
typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst,
int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filterSize, int width, int height, kvz_pixel *dst);
int filterSize, int width, int height, extended_block *out);
// Declare function pointers.