From efc43c8b3a7e2d6e0886e81f43aedc5a7f8b9f87 Mon Sep 17 00:00:00 2001 From: Tapio Katajisto Date: Wed, 14 May 2014 01:42:02 +0000 Subject: [PATCH] Added fractional pixel motion estimation Added fractional mv support for inter recon Added 1/8-pel chroma and 1/4-pel luma interpolation --- src/filter.c | 408 ++++++++++++++++++++++++++++++++++++++++++++++++++- src/filter.h | 6 + src/inter.c | 357 +++++++++++++++++++++++++++++--------------- src/inter.h | 2 + src/search.c | 161 +++++++++++++++++++- 5 files changed, 806 insertions(+), 128 deletions(-) diff --git a/src/filter.c b/src/filter.c index bee1629d..debba2c5 100644 --- a/src/filter.c +++ b/src/filter.c @@ -515,7 +515,7 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 */ - + int i = 0; int32_t x, y; int32_t shift1 = encoder->bitdepth-8; int32_t shift2 = 6; @@ -568,4 +568,410 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t } } } + //Clamp values to bitdepth + for(i = 0; i < width*height*4; ++i) { + if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1); + if(dst[i] < 0) dst[i] = 0; + } +} + +void filter_inter_octpel_chroma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +{ + + int32_t x, y; + int32_t shift1 = encoder->bitdepth-8; + int32_t shift2 = 6; + int32_t shift3 = 14-encoder->bitdepth; + int32_t offset3 = 1 << (shift3 - 1); + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + //coefficients for 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions + int16_t c1[4], c2[4], c3[4], c4[4], c5[4], c6[4], c7[4]; + + int i; + for(i = 0; i < 4; ++i ) { + c1[i] = g_chroma_filter[1][i]; + c2[i] = g_chroma_filter[2][i]; + c3[i] = g_chroma_filter[3][i]; + c4[i] = g_chroma_filter[4][i]; + 
c5[i] = g_chroma_filter[5][i]; + c6[i] = g_chroma_filter[6][i]; + c7[i] = g_chroma_filter[7][i]; + } + + // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round + for (y = 0; y < height; y++) { + int dst_pos_y = (y<<3)*dst_stride; + int src_pos_y = y*src_stride; + for (x = 0; x < width; x++) { + // Calculate current dst and src pixel positions + int dst_pos = dst_pos_y+(x<<3); + int src_pos = src_pos_y+x; + + // Temporary horizontally interpolated postions + int32_t h_temp[7] = {0,0,0,0,0,0,0}; + + // Original pixel + dst[dst_pos] = src[src_pos]; + + // Horizontal 1/8-values + if (hor_flag) { + + h_temp[0] = ((c1[0]*src[src_pos - 1] + + c1[1]*src[src_pos] + + c1[2]*src[src_pos + 1] + + c1[3]*src[src_pos + 2]) >> shift1); // ae0,0 h0 + + h_temp[1] = ((c2[0]*src[src_pos - 1] + + c2[1]*src[src_pos] + + c2[2]*src[src_pos + 1] + + c2[3]*src[src_pos + 2]) >> shift1); // ae0,0 h1 + + h_temp[2] = ((c3[0]*src[src_pos - 1] + + c3[1]*src[src_pos] + + c3[2]*src[src_pos + 1] + + c3[3]*src[src_pos + 2]) >> shift1); // ae0,0 h2 + + h_temp[3] = ((c4[0]*src[src_pos - 1] + + c4[1]*src[src_pos] + + c4[2]*src[src_pos + 1] + + c4[3]*src[src_pos + 2]) >> shift1); // ae0,0 h2 + + h_temp[4] = ((c5[0]*src[src_pos - 1] + + c5[1]*src[src_pos] + + c5[2]*src[src_pos + 1] + + c5[3]*src[src_pos + 2]) >> shift1); // ae0,0 h2 + + h_temp[5] = ((c6[0]*src[src_pos - 1] + + c6[1]*src[src_pos] + + c6[2]*src[src_pos + 1] + + c6[3]*src[src_pos + 2]) >> shift1); // ae0,0 h2 + + h_temp[6] = ((c7[0]*src[src_pos - 1] + + c7[1]*src[src_pos] + + c7[2]*src[src_pos + 1] + + c7[3]*src[src_pos + 2]) >> shift1); // ae0,0 h2 + } + + // Vertical 1/8-values + if(ver_flag) { + dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 1*src_stride] + + c1[1]*src[src_pos] + + c1[2]*src[src_pos + 1*src_stride] + + c1[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + + dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 1*src_stride] + + c2[1]*src[src_pos] + + c2[2]*src[src_pos 
+ 1*src_stride] + + c2[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + + dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 1*src_stride] + + c3[1]*src[src_pos] + + c3[2]*src[src_pos + 1*src_stride] + + c3[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + + dst[dst_pos + 4*dst_stride] = (((c4[0]*src[src_pos - 1*src_stride] + + c4[1]*src[src_pos] + + c4[2]*src[src_pos + 1*src_stride] + + c4[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + + dst[dst_pos + 5*dst_stride] = (((c5[0]*src[src_pos - 1*src_stride] + + c5[1]*src[src_pos] + + c5[2]*src[src_pos + 1*src_stride] + + c5[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + + dst[dst_pos + 6*dst_stride] = (((c6[0]*src[src_pos - 1*src_stride] + + c6[1]*src[src_pos] + + c6[2]*src[src_pos + 1*src_stride] + + c6[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + + dst[dst_pos + 7*dst_stride] = (((c7[0]*src[src_pos - 1*src_stride] + + c7[1]*src[src_pos] + + c7[2]*src[src_pos + 1*src_stride] + + c7[3]*src[src_pos + 2*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; // + } + + // When both flags, interpolate values from temporary horizontal values + if (hor_flag && ver_flag) { + + int32_t temp[3][7]; // Temporary horizontal values calculated from integer pixels + + // Calculate temporary values + src_pos -= 1*src_stride; //0,-3 + for(i = 0; i < 3; ++i) { + + temp[i][0] = ((c1[0]*src[src_pos - 1] + c1[1]*src[src_pos] + + c1[2]*src[src_pos + 1] + c1[3]*src[src_pos + 2]) + >> shift1); // h0(0,-3+i) + + temp[i][1] = ((c2[0]*src[src_pos - 1] + c2[1]*src[src_pos] + + c2[2]*src[src_pos + 1] + c2[3]*src[src_pos + 2]) + >> shift1); // h1(0,-3+i) + + temp[i][2] = ((c3[0]*src[src_pos - 1] + c3[1]*src[src_pos] + + c3[2]*src[src_pos + 1] + c3[3]*src[src_pos + 2]) + >> shift1); // h2(0,-3+i) + + temp[i][3] = ((c4[0]*src[src_pos - 1] + c4[1]*src[src_pos] + + c4[2]*src[src_pos + 1] 
+ c4[3]*src[src_pos + 2]) + >> shift1); // h2(0,-3+i) + + temp[i][4] = ((c5[0]*src[src_pos - 1] + c5[1]*src[src_pos] + + c5[2]*src[src_pos + 1] + c5[3]*src[src_pos + 2]) + >> shift1); // h2(0,-3+i) + + temp[i][5] = ((c6[0]*src[src_pos - 1] + c6[1]*src[src_pos] + + c6[2]*src[src_pos + 1] + c6[3]*src[src_pos + 2]) + >> shift1); // h2(0,-3+i) + + temp[i][6] = ((c7[0]*src[src_pos - 1] + c7[1]*src[src_pos] + + c7[2]*src[src_pos + 1] + c7[3]*src[src_pos + 2]) + >> shift1); // h2(0,-3+i) + + if(i == 0) { + //Skip calculating h_temp again + src_pos += 2*src_stride; + } else { + src_pos += src_stride; + } + } + + + //Calculate values from temporary horizontal 1/8-values + for(i=0;i<7;++i){ + dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*h_temp[i] + + c1[2]*temp[1][i] + c1[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*h_temp[i] + + c2[2]*temp[1][i] + c2[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*h_temp[i] + + c3[2]*temp[1][i] + c3[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + dst[dst_pos + 4*dst_stride + i+1] = (((c4[0]*temp[0][i] + c4[1]*h_temp[i] + + c4[2]*temp[1][i] + c4[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + dst[dst_pos + 5*dst_stride + i+1] = (((c5[0]*temp[0][i] + c5[1]*h_temp[i] + + c5[2]*temp[1][i] + c5[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + dst[dst_pos + 6*dst_stride + i+1] = (((c6[0]*temp[0][i] + c6[1]*h_temp[i] + + c6[2]*temp[1][i] + c6[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + dst[dst_pos + 7*dst_stride + i+1] = (((c7[0]*temp[0][i] + c7[1]*h_temp[i] + + c7[2]*temp[1][i] + c7[3]*temp[2][i]) + + offset23) >> shift2) >> shift3; // ee0,0 + + } + + } + + if(hor_flag) { + dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3; + dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3; + dst[dst_pos + 3] = 
(h_temp[2] + offset3) >> shift3; + dst[dst_pos + 4] = (h_temp[3] + offset3) >> shift3; + dst[dst_pos + 5] = (h_temp[4] + offset3) >> shift3; + dst[dst_pos + 6] = (h_temp[5] + offset3) >> shift3; + dst[dst_pos + 7] = (h_temp[6] + offset3) >> shift3; + } + + + } + } + + //Clamp values to bitdepth + for(i = 0; i < width*height*64; ++i) { + if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1); + if(dst[i] < 0) dst[i] = 0; + } +} + +void filter_inter_quarterpel_luma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +{ + + int32_t x, y; + int32_t shift1 = encoder->bitdepth-8; + int32_t shift2 = 6; + int32_t shift3 = 14-encoder->bitdepth; + int32_t offset3 = 1 << (shift3 - 1); + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + //coefficients for 1/4, 2/4 and 3/4 positions + int16_t c1[8], c2[8], c3[8]; + + int i; + for(i = 0; i < 8; ++i ) { + c1[i] = g_luma_filter[1][i]; + c2[i] = g_luma_filter[2][i]; + c3[i] = g_luma_filter[3][i]; + } + + // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round + for (y = 0; y < height; y++) { + int dst_pos_y = (y<<2)*dst_stride; + int src_pos_y = y*src_stride; + for (x = 0; x < width; x++) { + // Calculate current dst and src pixel positions + int dst_pos = dst_pos_y+(x<<2); + int src_pos = src_pos_y+x; + + // Temporary variables.. 
+ int32_t h_temp[3] = {0,0,0}; + + // Original pixel + dst[dst_pos] = src[src_pos]; + + // + if (hor_flag) { + + h_temp[0] = ((c1[0]*src[src_pos - 3] + + c1[1]*src[src_pos - 2] + + c1[2]*src[src_pos - 1] + + c1[3]*src[src_pos] + + c1[4]*src[src_pos + 1] + + c1[5]*src[src_pos + 2] + + c1[6]*src[src_pos + 3] + + c1[7]*src[src_pos + 4]) >> shift1); + + + + h_temp[1] = ((c2[0]*src[src_pos - 3] + + c2[1]*src[src_pos - 2] + + c2[2]*src[src_pos - 1] + + c2[3]*src[src_pos] + + c2[4]*src[src_pos + 1] + + c2[5]*src[src_pos + 2] + + c2[6]*src[src_pos + 3] + + c2[7]*src[src_pos + 4]) >> shift1); + + h_temp[2] = ((c3[0]*src[src_pos - 3] + + c3[1]*src[src_pos - 2] + + c3[2]*src[src_pos - 1] + + c3[3]*src[src_pos] + + c3[4]*src[src_pos + 1] + + c3[5]*src[src_pos + 2] + + c3[6]*src[src_pos + 3] + + c3[7]*src[src_pos + 4]) >> shift1); + } + // ea0,0 - needed only when ver_flag + if(ver_flag) { + dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 3*src_stride] + + c1[1]*src[src_pos - 2*src_stride] + + c1[2]*src[src_pos - 1*src_stride] + + c1[3]*src[src_pos] + + c1[4]*src[src_pos + 1*src_stride] + + c1[5]*src[src_pos + 2*src_stride] + + c1[6]*src[src_pos + 3*src_stride] + + c1[7]*src[src_pos + 4*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; + + dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 3*src_stride] + + c2[1]*src[src_pos - 2*src_stride] + + c2[2]*src[src_pos - 1*src_stride] + + c2[3]*src[src_pos] + + c2[4]*src[src_pos + 1*src_stride] + + c2[5]*src[src_pos + 2*src_stride] + + c2[6]*src[src_pos + 3*src_stride] + + c2[7]*src[src_pos + 4*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; + + dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 3*src_stride] + + c3[1]*src[src_pos - 2*src_stride] + + c3[2]*src[src_pos - 1*src_stride] + + c3[3]*src[src_pos] + + c3[4]*src[src_pos + 1*src_stride] + + c3[5]*src[src_pos + 2*src_stride] + + c3[6]*src[src_pos + 3*src_stride] + + c3[7]*src[src_pos + 4*src_stride]) >> shift1) + + (1<<(shift3-1))) >> shift3; + } + + // 
When both flags, we use _only_ this pixel (but still need ae0,0 for it) + if (hor_flag && ver_flag) { + + int32_t temp[7][3]; + + // Calculate temporary values.. + src_pos -= 3*src_stride; //0,-3 + for(i = 0; i < 7; ++i) { + + temp[i][0] = ((c1[0]*src[src_pos - 3] + c1[1]*src[src_pos - 2] + + c1[2]*src[src_pos - 1] + c1[3]*src[src_pos] + + c1[4]*src[src_pos + 1] + c1[5]*src[src_pos + 2] + + c1[6]*src[src_pos + 3] + c1[7]*src[src_pos + 4]) + >> shift1); // h0(0,-3+i) + + temp[i][1] = ((c2[0]*src[src_pos - 3] + c2[1]*src[src_pos - 2] + + c2[2]*src[src_pos - 1] + c2[3]*src[src_pos] + + c2[4]*src[src_pos + 1] + c2[5]*src[src_pos + 2] + + c2[6]*src[src_pos + 3] + c2[7]*src[src_pos + 4]) + >> shift1); // h1(0,-3+i) + + temp[i][2] = ((c3[0]*src[src_pos - 3] + c3[1]*src[src_pos - 2] + + c3[2]*src[src_pos - 1] + c3[3]*src[src_pos] + + c3[4]*src[src_pos + 1] + c3[5]*src[src_pos + 2] + + c3[6]*src[src_pos + 3] + c3[7]*src[src_pos + 4]) + >> shift1); // h2(0,-3+i) + + if(i == 2) { + //Skip calculating h_temp again + src_pos += 2*src_stride; + } else { + src_pos += src_stride; + } + } + + + + for(i=0;i<3;++i){ + dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*temp[1][i] + + c1[2]*temp[2][i] + c1[3]*h_temp[i] + + c1[4]*temp[3][i] + c1[5]*temp[4][i] + + c1[6]*temp[5][i] + c1[7]*temp[6][i]) + + offset23) >> shift2) >> shift3; + + dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*temp[1][i] + + c2[2]*temp[2][i] + c2[3]*h_temp[i] + + c2[4]*temp[3][i] + c2[5]*temp[4][i] + + c2[6]*temp[5][i] + c2[7]*temp[6][i]) + + offset23) >> shift2) >> shift3; + + dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*temp[1][i] + + c3[2]*temp[2][i] + c3[3]*h_temp[i] + + c3[4]*temp[3][i] + c3[5]*temp[4][i] + + c3[6]*temp[5][i] + c3[7]*temp[6][i]) + + offset23) >> shift2) >> shift3; + + } + + } + + if(hor_flag) { + dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3; + dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3; + dst[dst_pos + 3] = (h_temp[2] + offset3) 
>> shift3; + } + + + } + } + + //Clamp values to bitdepth + for(i = 0; i < width*height*16; ++i) { + if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1); + if(dst[i] < 0) dst[i] = 0; + } } diff --git a/src/filter.h b/src/filter.h index b7a51fa5..9abb3032 100644 --- a/src/filter.h +++ b/src/filter.h @@ -54,6 +54,12 @@ void filter_inter_halfpel_chroma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); +void filter_inter_octpel_chroma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, + int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); + +void filter_inter_quarterpel_luma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, + int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); + // SAO ////////////////////////////////////////////////////////////////////////// diff --git a/src/inter.c b/src/inter.c index ef19219c..9a4f0b6e 100644 --- a/src/inter.c +++ b/src/inter.c @@ -64,6 +64,45 @@ void inter_set_block(picture* pic, uint32_t x_cu, uint32_t y_cu, uint8_t depth, } } +void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height, + int filterSize, int width, int height, int16_t *dst) { + + int16_t mv[2] = {mv_x, mv_y}; + int halfFilterSize = filterSize>>1; + + int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; int ref_width_c; + int8_t overflow_neg_y_temp,overflow_pos_y_temp,overflow_neg_x_temp,overflow_pos_x_temp; + + ref_width_c = ref_width; + //width = LCU_WIDTH>>depth; + + for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) { + + // calculate y-pixel offset + coord_y = y + off_y + mv[1]; + + // On y-overflow set coord_y accordingly + overflow_neg_y_temp = (coord_y < 0) ? 
1 : 0; + overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0; + if (overflow_neg_y_temp) coord_y = 0; + else if (overflow_pos_y_temp) coord_y = (ref_height) - 1; + coord_y *= ref_width_c; + + for (dst_x = 0, x = (xpos) - halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) { + coord_x = x + off_x + mv[0]; + + // On x-overflow set coord_x accordingly + overflow_neg_x_temp = (coord_x < 0) ? 1 : 0; + overflow_pos_x_temp = (coord_x >= ref_width_c) ? 1 : 0; + if (overflow_neg_x_temp) coord_x = 0; + else if (overflow_pos_x_temp) coord_x = ref_width_c - 1; + + // Store source block data (with extended borders) + dst[dst_y*(width+filterSize) + dst_x] = ref[coord_y + coord_x]; + } + } +} + /** * \brief Reconstruct inter block * \param ref picture to copy the data from @@ -100,153 +139,233 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture * int16_t halfpel_u[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (u) int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v) - // TODO: Fractional pixel support + // Luma quarter-pel + int8_t fractional_mv = (mv[0]&1) || (mv[1]&1) || (mv[0]&2) || (mv[1]&2); // 2 lowest bits of mv set -> mv is fractional + + if(fractional_mv) { + int y_off_x = (mv[0]&3); + int y_off_y = (mv[1]&3); + + int c_off_x = (mv[0]&7); + int c_off_y = (mv[1]&7); + + int y,x; + + #define FILTER_SIZE_Y 8 + #define FILTER_SIZE_C 4 + + //vector2d orig = {xpos, ypos}; + //vector2d orig_c = {xpos>>1, ypos>>1}; + + // Fractional luma 1/4-pel + int16_t qpel_src_y[(LCU_WIDTH+FILTER_SIZE_Y) * (LCU_WIDTH+FILTER_SIZE_Y)]; + int16_t* qpel_src_off_y = &qpel_src_y[(width+FILTER_SIZE_Y)*(FILTER_SIZE_Y>>1)+(FILTER_SIZE_Y>>1)]; + int16_t qpel_dst_y[LCU_WIDTH*LCU_WIDTH*16]; + + // Fractional chroma 1/8-pel + int width_c = width>>1; + int16_t octpel_src_u[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)]; + int16_t* octpel_src_off_u = 
&octpel_src_u[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)]; + int16_t octpel_dst_u[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64]; + + int16_t octpel_src_v[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)]; + int16_t* octpel_src_off_v = &octpel_src_v[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)]; + int16_t octpel_dst_v[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64]; + + // Fractional luma + extend_borders(xpos, ypos, mv[0]>>2, mv[1]>>2, encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->lcu_offset_y * LCU_WIDTH, + ref->y_recdata, ref->width, ref->height, FILTER_SIZE_Y, width, width, qpel_src_y); + + filter_inter_quarterpel_luma(encoder_state->encoder_control, qpel_src_off_y, width+FILTER_SIZE_Y, width, + width, qpel_dst_y, width*4, y_off_x, y_off_y); + + //Fractional chroma U + extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1), + ref->u_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_u); + + filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_u, width_c+FILTER_SIZE_C, width_c, + width_c, octpel_dst_u, width_c*8, c_off_x, c_off_y); + + //Fractional chroma V + extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1), + ref->v_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_v); + + filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_v, width_c+FILTER_SIZE_C, width_c, + width_c, octpel_dst_v, width_c*8, c_off_x, c_off_y); + + //Luma + for(y = 0; y < width; ++y) { + int y_in_lcu = ((y+ypos) & ((LCU_WIDTH)-1)); + int qpel_y = y*4+y_off_y; + for(x = 0; x < width; ++x) { + int x_in_lcu = ((x+xpos) & ((LCU_WIDTH)-1)); + int qpel_x = x*4+y_off_x; + //printf("x: %d, y: %d\n", off_x, off_y); + 
lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)qpel_dst_y[qpel_y*(width*4)+qpel_x]; + //printf("i: %d", qpel_y*(width*4)+qpel_x); + } + } + //Chroma + for(y = 0; y < width_c; ++y) { + int y_in_lcu = ((y+(ypos>>1)) & ((LCU_WIDTH>>1)-1)); + int qpel_y = y*8+c_off_y; + for(x = 0; x < width_c; ++x) { + int x_in_lcu = ((x+(xpos>>1)) & ((LCU_WIDTH>>1)-1)); + int qpel_x = x*8+c_off_x; + lcu->rec.u[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_u[qpel_y*(width_c*8)+qpel_x]; + lcu->rec.v[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_v[qpel_y*(width_c*8)+qpel_x]; + } + } + } + mv[0] >>= 2; mv[1] >>= 2; // Chroma half-pel // get half-pel interpolated block and push it to output - if(chroma_halfpel) { - int halfpel_y, halfpel_x; - int abs_mv_x = mv[0]&1; - int abs_mv_y = mv[1]&1; - int8_t overflow_neg_y_temp,overflow_pos_y_temp,overflow_neg_x_temp,overflow_pos_x_temp; - // Fill source blocks with data from reference, -4...width+4 - for (halfpel_y = 0, y = (ypos>>1) - 4; y < ((ypos + width)>>1) + 4; halfpel_y++, y++) { - // calculate y-pixel offset - coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1); - - // On y-overflow set coord_y accordingly - overflow_neg_y_temp = (coord_y < 0) ? 1 : 0; - overflow_pos_y_temp = (coord_y >= ref->height>>1) ? 1 : 0; - if (overflow_neg_y_temp) coord_y = 0; - else if (overflow_pos_y_temp) coord_y = (ref->height>>1) - 1; - coord_y *= ref_width_c; - - for (halfpel_x = 0, x = (xpos>>1) - 4; x < ((xpos + width)>>1) + 4; halfpel_x++, x++) { - coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1); - - // On x-overflow set coord_x accordingly - overflow_neg_x_temp = (coord_x < 0) ? 1 : 0; - overflow_pos_x_temp = (coord_x >= ref_width_c) ? 
1 : 0; - if (overflow_neg_x_temp) coord_x = 0; - else if (overflow_pos_x_temp) coord_x = ref_width_c - 1; - - // Store source block data (with extended borders) - halfpel_src_u[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->u_recdata[coord_y + coord_x]; - halfpel_src_v[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->v_recdata[coord_y + coord_x]; - } - } - - // Filter the block to half-pel resolution - filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_u, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_u, LCU_WIDTH, abs_mv_x, abs_mv_y); - filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_v, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_v, LCU_WIDTH, abs_mv_x, abs_mv_y); - - // Assign filtered pixels to output, take every second half-pel sample with offset of abs_mv_y/x - for (halfpel_y = abs_mv_y, y = ypos>>1; y < (ypos + width)>>1; halfpel_y += 2, y++) { - for (halfpel_x = abs_mv_x, x = xpos>>1; x < (xpos + width)>>1; halfpel_x += 2, x++) { - int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); - int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); - lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_u[halfpel_y*LCU_WIDTH + halfpel_x]; - lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_v[halfpel_y*LCU_WIDTH + halfpel_x]; - } - } - } - - // With overflow present, more checking - if (overflow_neg_x || overflow_neg_y || overflow_pos_x || overflow_pos_y) { - // Copy Luma with boundary checking - for (y = ypos; y < ypos + width; y++) { - for (x = xpos; x < xpos + width; x++) { - int x_in_lcu = (x & ((LCU_WIDTH)-1)); - int y_in_lcu = (y & ((LCU_WIDTH)-1)); - - coord_x = (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]; - coord_y = (y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]; - overflow_neg_x = (coord_x < 0)?1:0; - overflow_neg_y = (coord_y < 0)?1:0; - - overflow_pos_x = (coord_x >= ref->width )?1:0; - overflow_pos_y = (coord_y >= ref->height)?1:0; - - // On x-overflow set coord_x 
accordingly - if (overflow_neg_x) { - coord_x = 0; - } else if (overflow_pos_x) { - coord_x = ref->width - 1; - } + if(!fractional_mv) { + if(chroma_halfpel) { + int halfpel_y, halfpel_x; + int abs_mv_x = mv[0]&1; + int abs_mv_y = mv[1]&1; + int8_t overflow_neg_y_temp,overflow_pos_y_temp,overflow_neg_x_temp,overflow_pos_x_temp; + // Fill source blocks with data from reference, -4...width+4 + for (halfpel_y = 0, y = (ypos>>1) - 4; y < ((ypos + width)>>1) + 4; halfpel_y++, y++) { + // calculate y-pixel offset + coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1); // On y-overflow set coord_y accordingly - if (overflow_neg_y) { - coord_y = 0; - } else if (overflow_pos_y) { - coord_y = ref->height - 1; - } + overflow_neg_y_temp = (coord_y < 0) ? 1 : 0; + overflow_pos_y_temp = (coord_y >= ref->height>>1) ? 1 : 0; + if (overflow_neg_y_temp) coord_y = 0; + else if (overflow_pos_y_temp) coord_y = (ref->height>>1) - 1; + coord_y *= ref_width_c; - // set destination to (corrected) pixel value from the reference - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y*ref->width + coord_x]; + for (halfpel_x = 0, x = (xpos>>1) - 4; x < ((xpos + width)>>1) + 4; halfpel_x++, x++) { + coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1); + + // On x-overflow set coord_x accordingly + overflow_neg_x_temp = (coord_x < 0) ? 1 : 0; + overflow_pos_x_temp = (coord_x >= ref_width_c) ? 
1 : 0; + if (overflow_neg_x_temp) coord_x = 0; + else if (overflow_pos_x_temp) coord_x = ref_width_c - 1; + + // Store source block data (with extended borders) + halfpel_src_u[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->u_recdata[coord_y + coord_x]; + halfpel_src_v[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->v_recdata[coord_y + coord_x]; + } + } + + // Filter the block to half-pel resolution + filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_u, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_u, LCU_WIDTH, abs_mv_x, abs_mv_y); + filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_v, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_v, LCU_WIDTH, abs_mv_x, abs_mv_y); + + // Assign filtered pixels to output, take every second half-pel sample with offset of abs_mv_y/x + for (halfpel_y = abs_mv_y, y = ypos>>1; y < (ypos + width)>>1; halfpel_y += 2, y++) { + for (halfpel_x = abs_mv_x, x = xpos>>1; x < (xpos + width)>>1; halfpel_x += 2, x++) { + int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); + int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); + lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_u[halfpel_y*LCU_WIDTH + halfpel_x]; + lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_v[halfpel_y*LCU_WIDTH + halfpel_x]; + } } } - if(!chroma_halfpel) { - // Copy Chroma with boundary checking - // TODO: chroma fractional pixel interpolation - for (y = ypos>>1; y < (ypos + width)>>1; y++) { - for (x = xpos>>1; x < (xpos + width)>>1; x++) { - int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); - int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); - - coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1); - coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1); + // With overflow present, more checking + if (overflow_neg_x || overflow_neg_y || overflow_pos_x || overflow_pos_y) { + // Copy Luma with boundary checking + for (y = ypos; y < ypos + width; y++) { + for (x = 
xpos; x < xpos + width; x++) { + int x_in_lcu = (x & ((LCU_WIDTH)-1)); + int y_in_lcu = (y & ((LCU_WIDTH)-1)); + coord_x = (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]; + coord_y = (y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]; overflow_neg_x = (coord_x < 0)?1:0; - overflow_neg_y = (y + (mv[1]>>1) < 0)?1:0; + overflow_neg_y = (coord_y < 0)?1:0; - overflow_pos_x = (coord_x >= ref->width>>1 )?1:0; - overflow_pos_y = (coord_y >= ref->height>>1)?1:0; + overflow_pos_x = (coord_x >= ref->width )?1:0; + overflow_pos_y = (coord_y >= ref->height)?1:0; // On x-overflow set coord_x accordingly - if(overflow_neg_x) { + if (overflow_neg_x) { coord_x = 0; - } else if(overflow_pos_x) { - coord_x = (ref->width>>1) - 1; + } else if (overflow_pos_x) { + coord_x = ref->width - 1; } // On y-overflow set coord_y accordingly - if(overflow_neg_y) { + if (overflow_neg_y) { coord_y = 0; - } else if(overflow_pos_y) { - coord_y = (ref->height>>1) - 1; + } else if (overflow_pos_y) { + coord_y = ref->height - 1; } - // set destinations to (corrected) pixel value from the reference - lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y * ref_width_c + coord_x]; - lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y * ref_width_c + coord_x]; + // set destination to (corrected) pixel value from the reference + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y*ref->width + coord_x]; } } - } - } else { //If no overflow, we can copy without checking boundaries - // Copy Luma - for (y = ypos; y < ypos + width; y++) { - int y_in_lcu = (y & ((LCU_WIDTH)-1)); - coord_y = ((y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate - for (x = xpos; x < xpos + width; x++) { - int x_in_lcu = (x & ((LCU_WIDTH)-1)); - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]]; + if(!chroma_halfpel) { + // Copy Chroma with 
boundary checking + // TODO: chroma fractional pixel interpolation + for (y = ypos>>1; y < (ypos + width)>>1; y++) { + for (x = xpos>>1; x < (xpos + width)>>1; x++) { + int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); + int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); + + coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1); + coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1); + + overflow_neg_x = (coord_x < 0)?1:0; + overflow_neg_y = (y + (mv[1]>>1) < 0)?1:0; + + overflow_pos_x = (coord_x >= ref->width>>1 )?1:0; + overflow_pos_y = (coord_y >= ref->height>>1)?1:0; + + // On x-overflow set coord_x accordingly + if(overflow_neg_x) { + coord_x = 0; + } else if(overflow_pos_x) { + coord_x = (ref->width>>1) - 1; + } + + // On y-overflow set coord_y accordingly + if(overflow_neg_y) { + coord_y = 0; + } else if(overflow_pos_y) { + coord_y = (ref->height>>1) - 1; + } + + // set destinations to (corrected) pixel value from the reference + lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y * ref_width_c + coord_x]; + lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y * ref_width_c + coord_x]; + } + } } - } + } else { //If no overflow, we can copy without checking boundaries + // Copy Luma + for (y = ypos; y < ypos + width; y++) { + int y_in_lcu = (y & ((LCU_WIDTH)-1)); + coord_y = ((y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate + for (x = xpos; x < xpos + width; x++) { + int x_in_lcu = (x & ((LCU_WIDTH)-1)); - if(!chroma_halfpel) { - // Copy Chroma - // TODO: chroma fractional pixel interpolation - for (y = ypos>>1; y < (ypos + width)>>1; y++) { - int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); - coord_y = ((y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate - for (x = xpos>>1; x < (xpos + width)>>1; x++) { - int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); - lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = 
ref->u_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)]; - lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)]; + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]]; + } + } + + if(!chroma_halfpel) { + // Copy Chroma + // TODO: chroma fractional pixel interpolation + for (y = ypos>>1; y < (ypos + width)>>1; y++) { + int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); + coord_y = ((y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate + for (x = xpos>>1; x < (xpos + width)>>1; x++) { + int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); + lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)]; + lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)]; + } } } } diff --git a/src/inter.h b/src/inter.h index d02749aa..f2c60884 100644 --- a/src/inter.h +++ b/src/inter.h @@ -37,4 +37,6 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i cu_info **b2,cu_info **a0,cu_info **a1, lcu_t *lcu); void inter_get_mv_cand(const encoder_state *encoder_state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info* cur_cu, lcu_t *lcu); uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu); +void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height, + int filterSize, int width, int height, int16_t *dst); #endif diff --git a/src/search.c b/src/search.c index a5e69242..2f3b1b75 100644 --- a/src/search.c +++ b/src/search.c @@ -72,6 +72,16 @@ const vector2d small_hexbs[5] = { { -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 } 
}; +/* Index layout of square[]: entry 4 is the { 0, 0 } centre and the top row (6 7 8) holds the y = -1 offsets. + * 6 7 8 + * 3 4 5 + * 0 1 2 + */ +const vector2d square[9] = { + { -1, 1 }, + { 0, 1 }, { 1, 1 }, { -1, 0 }, { 0, 0 }, { 1, 0 }, { -1, -1 }, + { 0, -1 }, { 1, -1 } +}; static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count) { @@ -118,7 +128,7 @@ static uint32_t get_mvd_coding_cost(vector2d *mvd) return bitcost; } -static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, +static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, int mv_shift, int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3], int16_t num_cand,int32_t ref_idx, uint32_t *bitcost) { @@ -129,8 +139,8 @@ static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y int8_t merged = 0; int8_t cur_mv_cand = 0; - x <<= 2; - y <<= 2; + x <<= mv_shift; + y <<= mv_shift; // Check every candidate to find a match for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) { @@ -205,7 +215,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, block_width, block_width); - cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); + cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; @@ -220,7 +230,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, block_width, block_width); - cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); + cost += calc_mvd_cost(encoder_state, 0, 0, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost); // If the 0,0 is better, redo the 
hexagon around that point. if (cost < best_cost) { @@ -236,7 +246,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, block_width, block_width); - cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); + cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; @@ -271,7 +281,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width); - cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); + cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost); if (cost < best_cost) { best_cost = cost; @@ -294,7 +304,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, block_width, block_width); - cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); + cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost); if (cost > 0 && cost < best_cost) { best_cost = cost; @@ -369,6 +379,158 @@ static unsigned search_mv_full(unsigned depth, } #endif
+/**
+ * Refine a luma motion vector to quarter-pel precision.
+ *
+ * The reference block around the integer-pel vector is interpolated once by
+ * filter_inter_quarterpel_luma(). The nine offsets of square[] are then tried
+ * at half-pel steps and, around the best of those, at quarter-pel steps.
+ * Every candidate costs (satd() >> 1) + calc_mvd_cost().
+ *
+ * \param mv_in_out    Start vector, read with >> 2 (presumably quarter-pel
+ *                     units - confirm against the caller). Receives the
+ *                     refined vector: the integer-pel start scaled by 4 plus
+ *                     the chosen half-pel and quarter-pel offsets.
+ * \param bitcost_out  Receives the bit cost calc_mvd_cost() reported for the
+ *                     returned vector.
+ * \return Cost of the best candidate.
+ */
+static unsigned search_frac(const encoder_state * const encoder_state,
+                            unsigned depth,
+                            const picture *pic, const picture *ref,
+                            const vector2d *orig, vector2d *mv_in_out,
+                            int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+                            int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) {
+
+  // Drop the two fractional bits: mv is in integer-pel units from here on.
+  vector2d mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
+  int block_width = CU_WIDTH_FROM_DEPTH(depth);
+  unsigned best_cost = UINT32_MAX;
+  uint32_t best_bitcost = 0, bitcost;
+  unsigned i;
+  int y, x;
+  const unsigned center_index = 4;     // square[4] is the { 0, 0 } offset
+  unsigned best_index = center_index;  // Index into square[]
+  unsigned cost = 0;
+
+  cost_16bit_nxn_func satd = get_satd_16bit_nxn_func(block_width);
+
+  vector2d halfpel_offset;
+
+  #define FILTER_SIZE 8
+  #define HALF_FILTER (FILTER_SIZE>>1)
+
+  // Source buffer: block_width+1 rows/columns plus FILTER_SIZE for the filter
+  int src_stride = block_width+FILTER_SIZE+1;
+  int16_t src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
+  int16_t* src_off = &src[HALF_FILTER+HALF_FILTER*src_stride];
+
+  // Destination buffer: four interpolated samples per source pixel and axis
+  int dst_stride = (block_width+1)*4;
+  int16_t dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16];
+  // dst_off skips one integer pixel (4 samples) in both directions so that the
+  // negative offsets used by the searches below stay inside dst.
+  int16_t* dst_off = &dst[dst_stride*4+4];
+
+  // The original block is the same for every candidate; copy it only once.
+  pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
+  pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
+
+  for (y = 0; y < block_width; ++y) {
+    for (x = 0; x < block_width; ++x) {
+      tmp_pic[y*block_width+x] = (uint8_t)pic->y_data[orig->x+x + (orig->y+y)*pic->width];
+    }
+  }
+
+  // mv-1 is passed; presumably this pairs with the one-pixel skip of dst_off.
+  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
+                 encoder_state->tile->lcu_offset_x * LCU_WIDTH,
+                 encoder_state->tile->lcu_offset_y * LCU_WIDTH,
+                 ref->y_data, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);
+
+  filter_inter_quarterpel_luma(encoder_state->encoder_control, src_off, src_stride, block_width+1,
+                               block_width+1, dst, dst_stride, 1, 1);
+
+  //Set mv to half-pixel precision (multiply: shifting a negative value is undefined)
+  mv.x *= 2;
+  mv.y *= 2;
+
+  // Search halfpel positions around best integer mv
+  for (i = 0; i < 9; ++i) {
+    const vector2d *pattern = &square[i];
+
+    for (y = 0; y < block_width; ++y) {
+      int dst_y = y*4+pattern->y*2;
+      for (x = 0; x < block_width; ++x) {
+        int dst_x = x*4+pattern->x*2;
+        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
+      }
+    }
+
+    cost = satd(tmp_pic,tmp_filtered);
+    cost = cost>>1;
+    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+
+    if (cost < best_cost) {
+      best_cost = cost;
+      best_index = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  //Set mv to best halfpel match
+  mv.x += square[best_index].x;
+  mv.y += square[best_index].y;
+
+  // Position of that match relative to dst_off, in dst samples
+  halfpel_offset.x = square[best_index].x*2;
+  halfpel_offset.y = square[best_index].y*2;
+
+  //Set mv to quarterpel precision
+  mv.x *= 2;
+  mv.y *= 2;
+
+  // The centre of the quarterpel pattern is the halfpel winner, whose cost is
+  // already stored in best_cost, so it is not expected to be strictly lower.
+  // Point best_index at the centre so that a pass without improvement leaves
+  // mv untouched instead of adding the halfpel offset a second time.
+  best_index = center_index;
+
+  //Search quarterpel points around best halfpel mv
+  for (i = 0; i < 9; ++i) {
+    const vector2d *pattern = &square[i];
+
+    for (y = 0; y < block_width; ++y) {
+      int dst_y = y*4+halfpel_offset.y+pattern->y;
+      for (x = 0; x < block_width; ++x) {
+        int dst_x = x*4+halfpel_offset.x+pattern->x;
+        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
+      }
+    }
+
+    cost = satd(tmp_pic,tmp_filtered);
+    cost = cost>>1;
+    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+
+    if (cost < best_cost) {
+      best_cost = cost;
+      best_index = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  //Set mv to the final best match
+  mv.x += square[best_index].x;
+  mv.y += square[best_index].y;
+
+  mv_in_out->x = mv.x;
+  mv_in_out->y = mv.y;
+
+  *bitcost_out = best_bitcost;
+
+  return best_cost;
+}
+
 /** * Update lcu to have best modes at this depth. * \return Cost of best mode. 
@@ -425,6 +568,8 @@ static int search_cu_inter(const encoder_state * const encoder_state, int x, int temp_cost += hexagon_search(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); #endif + temp_cost = search_frac(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); + merged = 0; // Check every candidate to find a match for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {