Merge branch 'interpolation-2021'

This commit is contained in:
Ari Lemmetti 2021-03-08 22:36:34 +02:00
commit c36d423a8c
9 changed files with 934 additions and 1109 deletions

View file

@ -477,20 +477,37 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic,
ref->stride) >> (KVZ_BIT_DEPTH - 8);
} else {
// Extrapolate pixels from outside the frame.
kvz_extended_block block;
kvz_get_extended_block(pic_x,
pic_y,
ref_x - pic_x,
ref_y - pic_y,
0,
0,
ref->y,
ref->width,
ref->height,
0,
block_width,
block_height,
&block);
// Space for extrapolated pixels and the part from the picture
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[LCU_LUMA_SIZE];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = ref_x,
.blk_y = ref_y,
.blk_w = block_width,
.blk_h = block_height,
.pad_l = 0,
.pad_r = 0,
.pad_t = 0,
.pad_b = 0,
.pad_b_simd = 0,
};
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
@ -498,12 +515,8 @@ unsigned kvz_image_calc_satd(const kvz_picture *pic,
block_height,
pic_data,
pic->stride,
block.buffer,
block.stride) >> (KVZ_BIT_DEPTH - 8);
if (block.malloc_used) {
FREE_POINTER(block.buffer);
}
ext_origin,
ext_s) >> (KVZ_BIT_DEPTH - 8);
return satd;
}

View file

@ -40,8 +40,8 @@ typedef struct {
} merge_candidates_t;
static void inter_recon_frac_luma(const encoder_state_t * const state,
const kvz_picture * const ref,
static void inter_recon_frac_luma(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
@ -52,26 +52,40 @@ static void inter_recon_frac_luma(const encoder_state_t * const state,
int mv_frac_x = (mv_param[0] & 3);
int mv_frac_y = (mv_param[1] & 3);
// Fractional luma 1/4-pel
kvz_extended_block src = {0, 0, 0, 0};
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
.blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
.blk_w = block_width,
.blk_h = block_height,
.pad_l = KVZ_LUMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_t = KVZ_LUMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_b_simd = 1 // One row for AVX2
};
// Fractional luma
kvz_get_extended_block(xpos,
ypos,
mv_param[0] >> 2,
mv_param[1] >> 2,
state->tile->offset_x,
state->tile->offset_y,
ref->y,
ref->width,
ref->height,
KVZ_LUMA_FILTER_TAPS,
block_width,
block_height,
&src);
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
kvz_sample_quarterpel_luma(state->encoder_control,
src.orig_topleft,
src.stride,
ext_origin,
ext_s,
block_width,
block_height,
lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
@ -79,12 +93,10 @@ static void inter_recon_frac_luma(const encoder_state_t * const state,
mv_frac_x,
mv_frac_y,
mv_param);
if (src.malloc_used) free(src.buffer);
}
static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
const kvz_picture * const ref,
static void inter_recon_frac_luma_hi(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
@ -95,26 +107,40 @@ static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
int mv_frac_x = (mv_param[0] & 3);
int mv_frac_y = (mv_param[1] & 3);
// Fractional luma 1/4-pel
kvz_extended_block src = { 0, 0, 0, 0 };
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = state->tile->offset_x + xpos + (mv_param[0] >> 2),
.blk_y = state->tile->offset_y + ypos + (mv_param[1] >> 2),
.blk_w = block_width,
.blk_h = block_height,
.pad_l = KVZ_LUMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_t = KVZ_LUMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_b_simd = 1 // One row for AVX2
};
// Fractional luma
kvz_get_extended_block(xpos,
ypos,
mv_param[0] >> 2,
mv_param[1] >> 2,
state->tile->offset_x,
state->tile->offset_y,
ref->y,
ref->width,
ref->height,
KVZ_LUMA_FILTER_TAPS,
block_width,
block_height,
&src);
kvz_sample_14bit_quarterpel_luma(state->encoder_control,
src.orig_topleft,
src.stride,
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
kvz_sample_quarterpel_luma_hi(state->encoder_control,
ext_origin,
ext_s,
block_width,
block_height,
hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
@ -122,12 +148,10 @@ static void inter_recon_14bit_frac_luma(const encoder_state_t * const state,
mv_frac_x,
mv_frac_y,
mv_param);
if (src.malloc_used) free(src.buffer);
}
static void inter_recon_frac_chroma(const encoder_state_t * const state,
const kvz_picture * const ref,
static void inter_recon_frac_chroma(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
@ -138,54 +162,68 @@ static void inter_recon_frac_chroma(const encoder_state_t * const state,
int mv_frac_x = (mv_param[0] & 7);
int mv_frac_y = (mv_param[1] & 7);
// Translate to chroma
xpos >>= 1;
ypos >>= 1;
block_width >>= 1;
block_height >>= 1;
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
// Fractional chroma 1/8-pel
kvz_extended_block src_u = { 0, 0, 0, 0 };
kvz_extended_block src_v = { 0, 0, 0, 0 };
// Chroma U
// Divisions by 2 due to 4:2:0 chroma subsampling
kvz_epol_args epol_args = {
.src = ref->u,
.src_w = ref->width / 2,
.src_h = ref->height / 2,
.src_s = ref->stride / 2,
.blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
.blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
.blk_w = block_width / 2,
.blk_h = block_height / 2,
.pad_l = KVZ_CHROMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_t = KVZ_CHROMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_b_simd = 3 // Three rows for AVX2
};
//Fractional chroma U
kvz_get_extended_block(xpos, ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->u,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_u);
kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
//Fractional chroma V
kvz_get_extended_block(xpos, ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->v,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_v);
kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
if (src_u.malloc_used) free(src_u.buffer);
if (src_v.malloc_used) free(src_v.buffer);
// Chroma V
epol_args.src = ref->v;
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
}
static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state,
const kvz_picture * const ref,
static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
const kvz_picture *const ref,
int32_t xpos,
int32_t ypos,
int32_t block_width,
@ -196,68 +234,64 @@ static void inter_recon_14bit_frac_chroma(const encoder_state_t * const state,
int mv_frac_x = (mv_param[0] & 7);
int mv_frac_y = (mv_param[1] & 7);
// Translate to chroma
xpos >>= 1;
ypos >>= 1;
block_width >>= 1;
block_height >>= 1;
// Space for extrapolated pixels and the part from the picture.
// Some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
// Fractional chroma 1/8-pel
kvz_extended_block src_u = { 0, 0, 0, 0 };
kvz_extended_block src_v = { 0, 0, 0, 0 };
// Chroma U
// Divisions by 2 due to 4:2:0 chroma subsampling
kvz_epol_args epol_args = {
.src = ref->u,
.src_w = ref->width / 2,
.src_h = ref->height / 2,
.src_s = ref->stride / 2,
.blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param[0] >> 3),
.blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param[1] >> 3),
.blk_w = block_width / 2,
.blk_h = block_height / 2,
.pad_l = KVZ_CHROMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_t = KVZ_CHROMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
.pad_b_simd = 3 // Three rows for AVX2
};
//Fractional chroma U
kvz_get_extended_block(xpos,
ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->u,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_u);
kvz_sample_14bit_octpel_chroma(state->encoder_control,
src_u.orig_topleft,
src_u.stride,
block_width,
block_height,
hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma_hi(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
//Fractional chroma V
kvz_get_extended_block(xpos,
ypos,
(mv_param[0] >> 2) >> 1,
(mv_param[1] >> 2) >> 1,
state->tile->offset_x >> 1,
state->tile->offset_y >> 1,
ref->v,
ref->width >> 1,
ref->height >> 1,
KVZ_CHROMA_FILTER_TAPS,
block_width,
block_height,
&src_v);
kvz_sample_14bit_octpel_chroma(state->encoder_control,
src_v.orig_topleft,
src_v.stride,
block_width,
block_height,
hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C),
// Chroma V
epol_args.src = ref->v;
kvz_get_extended_block(&epol_args);
kvz_sample_octpel_chroma_hi(state->encoder_control,
ext_origin,
ext_s,
block_width / 2,
block_height / 2,
hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
LCU_WIDTH_C,
mv_frac_x,
mv_frac_y,
mv_param);
if (src_u.malloc_used) free(src_u.buffer);
if (src_v.malloc_used) free(src_v.buffer);
}
@ -348,7 +382,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
if (fractional_luma) {
// With a fractional MV, do interpolation.
if (state->encoder_control->cfg.bipred && hi_prec_out) {
inter_recon_14bit_frac_luma(state, ref,
inter_recon_frac_luma_hi(state, ref,
pu_in_tile.x, pu_in_tile.y,
width, height,
mv_param, hi_prec_out);
@ -388,7 +422,7 @@ static void inter_recon_unipred(const encoder_state_t * const state,
if (fractional_luma || fractional_chroma) {
// With a fractional MV, do interpolation.
if (state->encoder_control->cfg.bipred && hi_prec_out) {
inter_recon_14bit_frac_chroma(state, ref,
inter_recon_frac_chroma_hi(state, ref,
pu_in_tile.x, pu_in_tile.y,
width, height,
mv_param, hi_prec_out);

View file

@ -992,12 +992,11 @@ static void search_frac(inter_search_info_t *info)
unsigned costs[4] = { 0 };
kvz_extended_block src = { 0, 0, 0, 0 };
ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH];
ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE];
// Storage buffers for intermediate horizontally filtered results.
// Have the first columns in contiguous memory for vectorization.
ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH];
ALIGNED(64) int16_t intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD];
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1];
const kvz_picture *ref = info->ref;
@ -1013,12 +1012,37 @@ static void search_frac(inter_search_info_t *info)
int8_t sample_off_x = 0;
int8_t sample_off_y = 0;
kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1,
state->tile->offset_x,
state->tile->offset_y,
ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS,
internal_width+1, internal_height+1,
&src);
// Space for (possibly) extrapolated pixels and the part from the picture
// One extra row and column compared to normal interpolation and some extra for AVX2.
// The extrapolation function will set the pointers and stride.
kvz_pixel ext_buffer[KVZ_FME_MAX_INPUT_SIZE_SIMD];
kvz_pixel *ext = NULL;
kvz_pixel *ext_origin = NULL;
int ext_s = 0;
kvz_epol_args epol_args = {
.src = ref->y,
.src_w = ref->width,
.src_h = ref->height,
.src_s = ref->stride,
.blk_x = state->tile->offset_x + orig.x + mv.x - 1,
.blk_y = state->tile->offset_y + orig.y + mv.y - 1,
.blk_w = internal_width + 1, // TODO: real width
.blk_h = internal_height + 1, // TODO: real height
.pad_l = KVZ_LUMA_FILTER_OFFSET,
.pad_r = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_t = KVZ_LUMA_FILTER_OFFSET,
.pad_b = KVZ_EXT_PADDING_LUMA - KVZ_LUMA_FILTER_OFFSET,
.pad_b_simd = 0 // AVX2 padding unnecessary because of blk_h
};
// Initialize separately. Gets rid of warning
// about using nonstandard extension.
epol_args.buf = ext_buffer;
epol_args.ext = &ext;
epol_args.ext_origin = &ext_origin;
epol_args.ext_s = &ext_s;
kvz_get_extended_block(&epol_args);
kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x;
int tmp_stride = pic->stride;
@ -1026,7 +1050,7 @@ static void search_frac(inter_search_info_t *info)
// Search integer position
costs[0] = kvz_satd_any_size(width, height,
tmp_pic, tmp_stride,
src.orig_topleft + src.stride + 1, src.stride);
ext_origin + ext_s + 1, ext_s);
costs[0] += info->mvd_cost_func(state,
mv.x, mv.y, 2,
@ -1056,8 +1080,8 @@ static void search_frac(inter_search_info_t *info)
const int mv_shift = (step < 2) ? 1 : 0;
filter_steps[step](state->encoder_control,
src.orig_topleft,
src.stride,
ext_origin,
ext_s,
internal_width,
internal_height,
filtered,
@ -1131,8 +1155,6 @@ static void search_frac(inter_search_info_t *info)
info->best_mv = mv;
info->best_cost = best_cost;
info->best_bitcost = best_bitcost;
if (src.malloc_used) free(src.buffer);
}
/**

File diff suppressed because it is too large Load diff

View file

@ -156,7 +156,7 @@ void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder,
}
}
void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
//TODO: horizontal and vertical only filtering
int32_t x, y;
@ -194,8 +194,8 @@ void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -309,8 +309,8 @@ void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -390,8 +390,8 @@ void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encod
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -550,8 +550,8 @@ void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
int16_t src_stride,
int width,
int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH],
int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH],
kvz_pixel filtered[4][LCU_LUMA_SIZE],
int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
int8_t fme_level,
int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t hpel_off_x, int8_t hpel_off_y)
@ -694,7 +694,7 @@ void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, k
}
}
void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
//TODO: horizontal and vertical only filtering
int32_t x, y;
@ -728,58 +728,54 @@ void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const enco
}
void kvz_get_extended_block_generic(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filter_size, int width, int height, kvz_extended_block *out) {
void kvz_get_extended_block_generic(kvz_epol_args *args) {
int half_filter_size = filter_size >> 1;
int min_y = args->blk_y - args->pad_t;
int max_y = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
bool out_of_bounds_y = (min_y < 0) || (max_y >= args->src_h);
out->buffer = ref + (ypos - half_filter_size + off_y + mv_y) * ref_width + (xpos - half_filter_size + off_x + mv_x);
out->stride = ref_width;
out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
out->malloc_used = 0;
int min_x = args->blk_x - args->pad_l;
int max_x = args->blk_x + args->blk_w + args->pad_r - 1;
bool out_of_bounds_x = (min_x < 0) || (max_x >= args->src_w);
int min_y = ypos - half_filter_size + off_y + mv_y;
int max_y = min_y + height + filter_size;
int out_of_bounds_y = (min_y < 0) || (max_y >= ref_height);
if (out_of_bounds_y || out_of_bounds_x) {
int min_x = xpos - half_filter_size + off_x + mv_x;
int max_x = min_x + width + filter_size;
int out_of_bounds_x = (min_x < 0) || (max_x >= ref_width);
*args->ext = args->buf;
*args->ext_s = args->pad_l + args->blk_w + args->pad_r;
*args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l;
int sample_out_of_bounds = out_of_bounds_y || out_of_bounds_x;
// Note that stride equals width here.
int cnt_l = CLIP(0, *args->ext_s, -min_x);
int cnt_r = CLIP(0, *args->ext_s, max_x - (args->src_w - 1));
int cnt_m = CLIP(0, *args->ext_s, *args->ext_s - cnt_l - cnt_r);
if (sample_out_of_bounds){
out->buffer = MALLOC(kvz_pixel, (width + filter_size) * (height + filter_size));
if (!out->buffer){
fprintf(stderr, "Memory allocation failed!\n");
assert(0);
// For each row including real padding.
// Don't read "don't care" values (SIMD padding). Zero them out.
int y;
for (y = -args->pad_t; y < args->blk_h + args->pad_b; ++y) {
int clipped_y = CLIP(0, args->src_h - 1, args->blk_y + y);
kvz_pixel *sample_l = args->src + clipped_y * args->src_s;
kvz_pixel *sample_r = args->src + clipped_y * args->src_s + args->src_w - 1;
kvz_pixel *src_m = args->src + clipped_y * args->src_s + MAX(min_x, 0);
kvz_pixel *dst_l = args->buf + (y + args->pad_t) * (*args->ext_s);
kvz_pixel *dst_m = dst_l + cnt_l;
kvz_pixel *dst_r = dst_m + cnt_m;
for (int i = 0; i < cnt_l; ++i) *(dst_l + i) = *sample_l;
for (int i = 0; i < cnt_m; ++i) *(dst_m + i) = *(src_m + i);
for (int i = 0; i < cnt_r; ++i) *(dst_r + i) = *sample_r;
}
out->stride = width + filter_size;
out->orig_topleft = out->buffer + out->stride * half_filter_size + half_filter_size;
out->malloc_used = 1;
int dst_y; int y; int dst_x; int x; int coord_x; int coord_y;
for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) {
kvz_pixel *dst = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s);
FILL_ARRAY(dst, 0, *args->ext_s);
}
for (dst_y = 0, y = ypos - half_filter_size; y < ((ypos + height)) + half_filter_size; dst_y++, y++) {
// calculate y-pixel offset
coord_y = y + off_y + mv_y;
coord_y = CLIP(0, (ref_height)-1, coord_y);
coord_y *= ref_width;
if (!out_of_bounds_x){
memcpy(&out->buffer[dst_y * out->stride + 0], &ref[coord_y + min_x], out->stride * sizeof(kvz_pixel));
} else {
for (dst_x = 0, x = (xpos)-half_filter_size; x < ((xpos + width)) + half_filter_size; dst_x++, x++) {
coord_x = x + off_x + mv_x;
coord_x = CLIP(0, (ref_width)-1, coord_x);
// Store source block data (with extended borders)
out->buffer[dst_y * out->stride + dst_x] = ref[coord_y + coord_x];
}
}
}
*args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
*args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
*args->ext_s = args->src_s;
}
}
@ -793,8 +789,8 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic);
success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);
return success;

View file

@ -32,9 +32,9 @@
int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
#endif //STRATEGIES_IPOL_GENERIC_H_

View file

@ -33,8 +33,8 @@ ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
epol_func *kvz_get_extended_block;
kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;
int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) {

View file

@ -31,21 +31,63 @@
#include "kvazaar.h"
#include "search_inter.h"
// Buffer sizes (in elements) for interpolation input and intermediate data.
//
// AVX2 implementation of horizontal filter reads and
// writes two rows for luma and four for chroma at a time.
// Extra vertical padding is added to prevent segfaults.
// Horizontal padding is not needed even if one extra byte
// is read because kvz_image_alloc adds enough padding.

// Input block for luma interpolation: KVZ_EXT_BLOCK_W_LUMA columns and
// one row more than that. Callers that size their buffer with this
// request one "don't care" row (pad_b_simd = 1).
#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA)
// Input block for chroma interpolation: KVZ_EXT_BLOCK_W_CHROMA columns and
// three rows more than that. Callers that size their buffer with this
// request three "don't care" rows (pad_b_simd = 3).
#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA)
// Horizontally filtered intermediate for luma: same row count as the
// luma input above, but only LCU_WIDTH columns per row.
#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH)
// Horizontally filtered intermediate for chroma: same row count as the
// chroma input above, but only LCU_WIDTH_C columns per row.
#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
// On top of basic interpolation, FME needs one extra
// column and row for ME (left and up). Adding the
// extra row happens to satisfy AVX2 requirements for
// row count. No other extra rows are needed.
// NOTE(review): this relies on the FME caller passing a block that is one
// row and one column larger and pad_b_simd = 0; confirm if the caller changes.
#define KVZ_FME_MAX_INPUT_SIZE_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * (KVZ_EXT_BLOCK_W_LUMA + 1))
typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block;
typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height,
kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
kvz_pixel filtered[4][LCU_LUMA_SIZE], int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
int8_t sample_off_x, int8_t sample_off_y);
typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height,
int filter_size, int width, int height, kvz_extended_block *out);
// Arguments for kvz_get_extended_block (an epol_func).
//
// Describes a block of samples to fetch from a source plane together with
// the padding needed around it, and carries the output locations through
// which the function reports where the (possibly extrapolated) samples are.
//
// Behavior as implemented by kvz_get_extended_block_generic:
// - If the padded block (including pad_b_simd rows) lies completely inside
//   the source plane, nothing is copied: *ext and *ext_origin point
//   directly into src and *ext_s is set to src_s.
// - Otherwise the padded block is written to buf with a stride of
//   pad_l + blk_w + pad_r. Samples outside the plane repeat the nearest
//   sample of the clipped row/column, and the pad_b_simd rows are zeroed.
//   In that case buf must have room for
//   (pad_t + blk_h + pad_b + pad_b_simd) * (pad_l + blk_w + pad_r) samples.
// NOTE(review): other implementations of epol_func are not visible here;
// confirm that they follow the same contract.
typedef struct {
// Source samples
kvz_pixel *src; // Top-left sample of the source plane
int src_w; // Width of the source plane in samples
int src_h; // Height of the source plane in samples
int src_s; // Stride of the source plane in samples
// Requested sampling position, base dimensions, and padding
int blk_x; // X of the block's top-left sample in src, padding excluded
int blk_y; // Y of the block's top-left sample in src, padding excluded
int blk_w; // Width of the block without padding
int blk_h; // Height of the block without padding
int pad_l; // Columns of padding on the left
int pad_r; // Columns of padding on the right
int pad_t; // Rows of padding on the top
int pad_b; // Rows of padding on the bottom
int pad_b_simd; // "Don't care" rows in the end. Zeroed out, never read from src.
// Buffer for possible extrapolation. Free memory provided by the caller.
// Only written when the padded block crosses the edge of the source plane.
kvz_pixel *buf;
// Extended block data. These are set by the function.
kvz_pixel **ext; // Top-left sample with padding (in buf or in src)
kvz_pixel **ext_origin; // Top-left sample without padding (in buf or in src)
int *ext_s; // Stride to use with *ext and *ext_origin
} kvz_epol_args;
typedef void(epol_func)(kvz_epol_args *args);
typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
// Declare function pointers.
extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma;
@ -55,8 +97,8 @@ extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma;
extern epol_func * kvz_get_extended_block;
extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma;
extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma;
extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma;
extern kvz_sample_quarterpel_luma_hi_func * kvz_sample_quarterpel_luma_hi;
extern kvz_sample_octpel_chroma_hi_func * kvz_sample_octpel_chroma_hi;
int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
@ -69,8 +111,8 @@ int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth);
{"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \
{"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \
{"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \
{"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
{"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \
{"sample_quarterpel_luma_hi", (void**) &kvz_sample_quarterpel_luma_hi}, \
{"sample_octpel_chroma_hi", (void**) &kvz_sample_octpel_chroma_hi}, \
{"get_extended_block", (void**) &kvz_get_extended_block}, \

View file

@ -1,3 +1,4 @@
race:kvz_eight_tap_filter_hor_8x1_avx2
# AVX2 interpolation reads some extra pixels
race:kvz_ipol_8tap_hor_px_im_avx2
race:kvz_filter_hpel_blocks_hor_ver_luma_avx2
race:kvz_eight_tap_filter_hor_avx2