Added fractional pixel motion estimation

Added fractional mv support for inter recon

Added 1/8-pel chroma and 1/4-pel luma interpolation
This commit is contained in:
Tapio Katajisto 2014-05-14 01:42:02 +00:00
parent 6c7e4dbeef
commit efc43c8b3a
5 changed files with 806 additions and 128 deletions

View file

@ -515,7 +515,7 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t
* ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1
* ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2
*/ */
int i = 0;
int32_t x, y; int32_t x, y;
int32_t shift1 = encoder->bitdepth-8; int32_t shift1 = encoder->bitdepth-8;
int32_t shift2 = 6; int32_t shift2 = 6;
@ -568,4 +568,410 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t
} }
} }
} }
//Clamp values to bitdepth
for(i = 0; i < width*height*4; ++i) {
if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
if(dst[i] < 0) dst[i] = 0;
}
}
/**
 * \brief Interpolate a chroma block to 1/8-pel resolution.
 *
 * For each source pixel up to 8x8 = 64 output samples are produced into dst:
 * the integer sample itself, 7 horizontal 1/8-pel samples (hor_flag),
 * 7 vertical 1/8-pel samples (ver_flag) and 7x7 diagonal samples (both
 * flags set). Fractional samples are spaced 8 apart in dst in both
 * directions. The indexing (src_pos - 1 .. + 2, +-src_stride) means src must
 * have at least a 1 px margin above/left and 2 px below/right of the block.
 *
 * \param encoder    encoder control, provides the configured bit depth
 * \param src        source pixels (with filter margins, see above)
 * \param src_stride stride of src in samples
 * \param width      block width in integer pixels
 * \param height     block height in integer pixels
 * \param dst        output buffer, 64 samples per input pixel
 * \param dst_stride stride of dst in samples (callers pass width*8)
 * \param hor_flag   generate horizontal fractional samples
 * \param ver_flag   generate vertical fractional samples
 */
void filter_inter_octpel_chroma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
{
  int32_t x, y;
  int32_t shift1 = encoder->bitdepth-8;           // first (horizontal) filter stage shift
  int32_t shift2 = 6;                             // second (vertical) filter stage shift
  int32_t shift3 = 14-encoder->bitdepth;          // final scaling back to pixel range
  int32_t offset3 = 1 << (shift3 - 1);            // rounding offset for shift3
  int32_t offset23 = 1 << (shift2 + shift3 - 1);  // rounding offset for combined shift2+shift3
  // Coefficients for the 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions.
  int16_t c1[4], c2[4], c3[4], c4[4], c5[4], c6[4], c7[4];
  int i;
  for(i = 0; i < 4; ++i ) {
    c1[i] = g_chroma_filter[1][i];
    c2[i] = g_chroma_filter[2][i];
    c3[i] = g_chroma_filter[3][i];
    c4[i] = g_chroma_filter[4][i];
    c5[i] = g_chroma_filter[5][i];
    c6[i] = g_chroma_filter[6][i];
    c7[i] = g_chroma_filter[7][i];
  }

  // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round.
  for (y = 0; y < height; y++) {
    int dst_pos_y = (y<<3)*dst_stride;
    int src_pos_y = y*src_stride;
    for (x = 0; x < width; x++) {
      // Calculate current dst and src pixel positions.
      int dst_pos = dst_pos_y+(x<<3);
      int src_pos = src_pos_y+x;

      // Temporary horizontally interpolated positions (row 0, columns 1/8..7/8),
      // kept at the intermediate (shift1) precision for the diagonal pass.
      int32_t h_temp[7] = {0,0,0,0,0,0,0};

      // Original integer pixel.
      dst[dst_pos] = src[src_pos];

      // Horizontal 1/8-values for row 0 (not yet scaled down to pixel range).
      if (hor_flag) {
        h_temp[0] = ((c1[0]*src[src_pos - 1]
                      + c1[1]*src[src_pos]
                      + c1[2]*src[src_pos + 1]
                      + c1[3]*src[src_pos + 2]) >> shift1); // h0 (1/8 position)
        h_temp[1] = ((c2[0]*src[src_pos - 1]
                      + c2[1]*src[src_pos]
                      + c2[2]*src[src_pos + 1]
                      + c2[3]*src[src_pos + 2]) >> shift1); // h1 (2/8 position)
        h_temp[2] = ((c3[0]*src[src_pos - 1]
                      + c3[1]*src[src_pos]
                      + c3[2]*src[src_pos + 1]
                      + c3[3]*src[src_pos + 2]) >> shift1); // h2 (3/8 position)
        h_temp[3] = ((c4[0]*src[src_pos - 1]
                      + c4[1]*src[src_pos]
                      + c4[2]*src[src_pos + 1]
                      + c4[3]*src[src_pos + 2]) >> shift1); // h3 (4/8 position)
        h_temp[4] = ((c5[0]*src[src_pos - 1]
                      + c5[1]*src[src_pos]
                      + c5[2]*src[src_pos + 1]
                      + c5[3]*src[src_pos + 2]) >> shift1); // h4 (5/8 position)
        h_temp[5] = ((c6[0]*src[src_pos - 1]
                      + c6[1]*src[src_pos]
                      + c6[2]*src[src_pos + 1]
                      + c6[3]*src[src_pos + 2]) >> shift1); // h5 (6/8 position)
        h_temp[6] = ((c7[0]*src[src_pos - 1]
                      + c7[1]*src[src_pos]
                      + c7[2]*src[src_pos + 1]
                      + c7[3]*src[src_pos + 2]) >> shift1); // h6 (7/8 position)
      }

      // Vertical 1/8-values: fully scaled with rounding and written straight to dst.
      if(ver_flag) {
        dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 1*src_stride]
                      + c1[1]*src[src_pos]
                      + c1[2]*src[src_pos + 1*src_stride]
                      + c1[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 1/8
        dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 1*src_stride]
                      + c2[1]*src[src_pos]
                      + c2[2]*src[src_pos + 1*src_stride]
                      + c2[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 2/8
        dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 1*src_stride]
                      + c3[1]*src[src_pos]
                      + c3[2]*src[src_pos + 1*src_stride]
                      + c3[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 3/8
        dst[dst_pos + 4*dst_stride] = (((c4[0]*src[src_pos - 1*src_stride]
                      + c4[1]*src[src_pos]
                      + c4[2]*src[src_pos + 1*src_stride]
                      + c4[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 4/8
        dst[dst_pos + 5*dst_stride] = (((c5[0]*src[src_pos - 1*src_stride]
                      + c5[1]*src[src_pos]
                      + c5[2]*src[src_pos + 1*src_stride]
                      + c5[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 5/8
        dst[dst_pos + 6*dst_stride] = (((c6[0]*src[src_pos - 1*src_stride]
                      + c6[1]*src[src_pos]
                      + c6[2]*src[src_pos + 1*src_stride]
                      + c6[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 6/8
        dst[dst_pos + 7*dst_stride] = (((c7[0]*src[src_pos - 1*src_stride]
                      + c7[1]*src[src_pos]
                      + c7[2]*src[src_pos + 1*src_stride]
                      + c7[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 7/8
      }

      // When both flags are set, interpolate the 7x7 diagonal values by
      // vertically filtering horizontally interpolated rows.
      if (hor_flag && ver_flag) {
        // Horizontal 1/8 values for rows -1, +1 and +2 (row 0 is already in h_temp).
        int32_t temp[3][7];
        // Calculate temporary values: move to row -1 first.
        src_pos -= 1*src_stride;
        for(i = 0; i < 3; ++i) {
          temp[i][0] = ((c1[0]*src[src_pos - 1] + c1[1]*src[src_pos]
                         + c1[2]*src[src_pos + 1] + c1[3]*src[src_pos + 2])
                        >> shift1); // h0 of current row
          temp[i][1] = ((c2[0]*src[src_pos - 1] + c2[1]*src[src_pos]
                         + c2[2]*src[src_pos + 1] + c2[3]*src[src_pos + 2])
                        >> shift1); // h1 of current row
          temp[i][2] = ((c3[0]*src[src_pos - 1] + c3[1]*src[src_pos]
                         + c3[2]*src[src_pos + 1] + c3[3]*src[src_pos + 2])
                        >> shift1); // h2 of current row
          temp[i][3] = ((c4[0]*src[src_pos - 1] + c4[1]*src[src_pos]
                         + c4[2]*src[src_pos + 1] + c4[3]*src[src_pos + 2])
                        >> shift1); // h3 of current row
          temp[i][4] = ((c5[0]*src[src_pos - 1] + c5[1]*src[src_pos]
                         + c5[2]*src[src_pos + 1] + c5[3]*src[src_pos + 2])
                        >> shift1); // h4 of current row
          temp[i][5] = ((c6[0]*src[src_pos - 1] + c6[1]*src[src_pos]
                         + c6[2]*src[src_pos + 1] + c6[3]*src[src_pos + 2])
                        >> shift1); // h5 of current row
          temp[i][6] = ((c7[0]*src[src_pos - 1] + c7[1]*src[src_pos]
                         + c7[2]*src[src_pos + 1] + c7[3]*src[src_pos + 2])
                        >> shift1); // h6 of current row
          if(i == 0) {
            // Skip row 0: its horizontal values are already in h_temp.
            src_pos += 2*src_stride;
          } else {
            src_pos += src_stride;
          }
        }
        // Vertically filter rows (-1, 0, +1, +2) = (temp[0], h_temp, temp[1],
        // temp[2]) into the 7x7 diagonal samples at columns/rows 1..7.
        for(i=0;i<7;++i){
          dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*h_temp[i]
                                                 + c1[2]*temp[1][i] + c1[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 1/8
          dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*h_temp[i]
                                                 + c2[2]*temp[1][i] + c2[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 2/8
          dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*h_temp[i]
                                                 + c3[2]*temp[1][i] + c3[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 3/8
          dst[dst_pos + 4*dst_stride + i+1] = (((c4[0]*temp[0][i] + c4[1]*h_temp[i]
                                                 + c4[2]*temp[1][i] + c4[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 4/8
          dst[dst_pos + 5*dst_stride + i+1] = (((c5[0]*temp[0][i] + c5[1]*h_temp[i]
                                                 + c5[2]*temp[1][i] + c5[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 5/8
          dst[dst_pos + 6*dst_stride + i+1] = (((c6[0]*temp[0][i] + c6[1]*h_temp[i]
                                                 + c6[2]*temp[1][i] + c6[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 6/8
          dst[dst_pos + 7*dst_stride + i+1] = (((c7[0]*temp[0][i] + c7[1]*h_temp[i]
                                                 + c7[2]*temp[1][i] + c7[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 7/8
        }
      }

      // Finally scale the horizontal row-0 values down to pixel range.
      if(hor_flag) {
        dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3;
        dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3;
        dst[dst_pos + 3] = (h_temp[2] + offset3) >> shift3;
        dst[dst_pos + 4] = (h_temp[3] + offset3) >> shift3;
        dst[dst_pos + 5] = (h_temp[4] + offset3) >> shift3;
        dst[dst_pos + 6] = (h_temp[5] + offset3) >> shift3;
        dst[dst_pos + 7] = (h_temp[6] + offset3) >> shift3;
      }
    }
  }

  // Clamp all 64 samples per pixel to the valid range [0, 2^bitdepth - 1].
  for(i = 0; i < width*height*64; ++i) {
    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
    if(dst[i] < 0) dst[i] = 0;
  }
}
/**
 * \brief Interpolate a luma block to 1/4-pel resolution with the 8-tap filter.
 *
 * For each source pixel up to 4x4 = 16 output samples are produced into dst:
 * the integer sample itself, 3 horizontal 1/4-pel samples (hor_flag),
 * 3 vertical 1/4-pel samples (ver_flag) and 3x3 diagonal samples (both
 * flags set). The indexing (src_pos - 3 .. + 4, +-3..4 rows) means src must
 * have at least a 3 px margin above/left and 4 px below/right of the block.
 *
 * \param encoder    encoder control, provides the configured bit depth
 * \param src        source pixels (with filter margins, see above)
 * \param src_stride stride of src in samples
 * \param width      block width in integer pixels
 * \param height     block height in integer pixels
 * \param dst        output buffer, 16 samples per input pixel
 * \param dst_stride stride of dst in samples (callers pass width*4)
 * \param hor_flag   generate horizontal fractional samples
 * \param ver_flag   generate vertical fractional samples
 */
void filter_inter_quarterpel_luma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
{
  int32_t x, y;
  int32_t shift1 = encoder->bitdepth-8;           // first (horizontal) filter stage shift
  int32_t shift2 = 6;                             // second (vertical) filter stage shift
  int32_t shift3 = 14-encoder->bitdepth;          // final scaling back to pixel range
  int32_t offset3 = 1 << (shift3 - 1);            // rounding offset for shift3
  int32_t offset23 = 1 << (shift2 + shift3 - 1);  // rounding offset for combined shift2+shift3
  // Coefficients for the 1/4, 2/4 and 3/4 positions.
  int16_t c1[8], c2[8], c3[8];
  int i;
  for(i = 0; i < 8; ++i ) {
    c1[i] = g_luma_filter[1][i];
    c2[i] = g_luma_filter[2][i];
    c3[i] = g_luma_filter[3][i];
  }

  // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round.
  for (y = 0; y < height; y++) {
    int dst_pos_y = (y<<2)*dst_stride;
    int src_pos_y = y*src_stride;
    for (x = 0; x < width; x++) {
      // Calculate current dst and src pixel positions.
      int dst_pos = dst_pos_y+(x<<2);
      int src_pos = src_pos_y+x;

      // Temporary horizontally interpolated values of row 0 (columns
      // 1/4..3/4), kept at the intermediate (shift1) precision.
      int32_t h_temp[3] = {0,0,0};

      // Original integer pixel.
      dst[dst_pos] = src[src_pos];

      // Horizontal 1/4-values for row 0 (not yet scaled down to pixel range).
      if (hor_flag) {
        h_temp[0] = ((c1[0]*src[src_pos - 3]
                      + c1[1]*src[src_pos - 2]
                      + c1[2]*src[src_pos - 1]
                      + c1[3]*src[src_pos]
                      + c1[4]*src[src_pos + 1]
                      + c1[5]*src[src_pos + 2]
                      + c1[6]*src[src_pos + 3]
                      + c1[7]*src[src_pos + 4]) >> shift1); // h0 (1/4 position)
        h_temp[1] = ((c2[0]*src[src_pos - 3]
                      + c2[1]*src[src_pos - 2]
                      + c2[2]*src[src_pos - 1]
                      + c2[3]*src[src_pos]
                      + c2[4]*src[src_pos + 1]
                      + c2[5]*src[src_pos + 2]
                      + c2[6]*src[src_pos + 3]
                      + c2[7]*src[src_pos + 4]) >> shift1); // h1 (2/4 position)
        h_temp[2] = ((c3[0]*src[src_pos - 3]
                      + c3[1]*src[src_pos - 2]
                      + c3[2]*src[src_pos - 1]
                      + c3[3]*src[src_pos]
                      + c3[4]*src[src_pos + 1]
                      + c3[5]*src[src_pos + 2]
                      + c3[6]*src[src_pos + 3]
                      + c3[7]*src[src_pos + 4]) >> shift1); // h2 (3/4 position)
      }

      // Vertical 1/4-values: fully scaled with rounding and written straight to dst.
      if(ver_flag) {
        dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 3*src_stride]
                      + c1[1]*src[src_pos - 2*src_stride]
                      + c1[2]*src[src_pos - 1*src_stride]
                      + c1[3]*src[src_pos]
                      + c1[4]*src[src_pos + 1*src_stride]
                      + c1[5]*src[src_pos + 2*src_stride]
                      + c1[6]*src[src_pos + 3*src_stride]
                      + c1[7]*src[src_pos + 4*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 1/4
        dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 3*src_stride]
                      + c2[1]*src[src_pos - 2*src_stride]
                      + c2[2]*src[src_pos - 1*src_stride]
                      + c2[3]*src[src_pos]
                      + c2[4]*src[src_pos + 1*src_stride]
                      + c2[5]*src[src_pos + 2*src_stride]
                      + c2[6]*src[src_pos + 3*src_stride]
                      + c2[7]*src[src_pos + 4*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 2/4
        dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 3*src_stride]
                      + c3[1]*src[src_pos - 2*src_stride]
                      + c3[2]*src[src_pos - 1*src_stride]
                      + c3[3]*src[src_pos]
                      + c3[4]*src[src_pos + 1*src_stride]
                      + c3[5]*src[src_pos + 2*src_stride]
                      + c3[6]*src[src_pos + 3*src_stride]
                      + c3[7]*src[src_pos + 4*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 3/4
      }

      // When both flags are set, interpolate the 3x3 diagonal values by
      // vertically filtering horizontally interpolated rows.
      if (hor_flag && ver_flag) {
        // Horizontal 1/4 values for rows -3..-1 and +1..+4 (row 0 is in h_temp).
        int32_t temp[7][3];
        // Calculate temporary values: move to row -3 first.
        src_pos -= 3*src_stride; //0,-3
        for(i = 0; i < 7; ++i) {
          temp[i][0] = ((c1[0]*src[src_pos - 3] + c1[1]*src[src_pos - 2]
                         + c1[2]*src[src_pos - 1] + c1[3]*src[src_pos]
                         + c1[4]*src[src_pos + 1] + c1[5]*src[src_pos + 2]
                         + c1[6]*src[src_pos + 3] + c1[7]*src[src_pos + 4])
                        >> shift1); // h0 of current row
          temp[i][1] = ((c2[0]*src[src_pos - 3] + c2[1]*src[src_pos - 2]
                         + c2[2]*src[src_pos - 1] + c2[3]*src[src_pos]
                         + c2[4]*src[src_pos + 1] + c2[5]*src[src_pos + 2]
                         + c2[6]*src[src_pos + 3] + c2[7]*src[src_pos + 4])
                        >> shift1); // h1 of current row
          temp[i][2] = ((c3[0]*src[src_pos - 3] + c3[1]*src[src_pos - 2]
                         + c3[2]*src[src_pos - 1] + c3[3]*src[src_pos]
                         + c3[4]*src[src_pos + 1] + c3[5]*src[src_pos + 2]
                         + c3[6]*src[src_pos + 3] + c3[7]*src[src_pos + 4])
                        >> shift1); // h2 of current row
          if(i == 2) {
            // Skip row 0: its horizontal values are already in h_temp.
            src_pos += 2*src_stride;
          } else {
            src_pos += src_stride;
          }
        }
        // Vertically filter rows (-3..-1, 0, +1..+4) = (temp[0..2], h_temp,
        // temp[3..6]) into the 3x3 diagonal samples at columns/rows 1..3.
        for(i=0;i<3;++i){
          dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*temp[1][i]
                                                 + c1[2]*temp[2][i] + c1[3]*h_temp[i]
                                                 + c1[4]*temp[3][i] + c1[5]*temp[4][i]
                                                 + c1[6]*temp[5][i] + c1[7]*temp[6][i])
                                                + offset23) >> shift2) >> shift3; // row 1/4
          dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*temp[1][i]
                                                 + c2[2]*temp[2][i] + c2[3]*h_temp[i]
                                                 + c2[4]*temp[3][i] + c2[5]*temp[4][i]
                                                 + c2[6]*temp[5][i] + c2[7]*temp[6][i])
                                                + offset23) >> shift2) >> shift3; // row 2/4
          dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*temp[1][i]
                                                 + c3[2]*temp[2][i] + c3[3]*h_temp[i]
                                                 + c3[4]*temp[3][i] + c3[5]*temp[4][i]
                                                 + c3[6]*temp[5][i] + c3[7]*temp[6][i])
                                                + offset23) >> shift2) >> shift3; // row 3/4
        }
      }

      // Finally scale the horizontal row-0 values down to pixel range.
      if(hor_flag) {
        dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3;
        dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3;
        dst[dst_pos + 3] = (h_temp[2] + offset3) >> shift3;
      }
    }
  }

  // Clamp all 16 samples per pixel to the valid range [0, 2^bitdepth - 1].
  for(i = 0; i < width*height*16; ++i) {
    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
    if(dst[i] < 0) dst[i] = 0;
  }
}

View file

@ -54,6 +54,12 @@ void filter_inter_halfpel_chroma(const encoder_control * encoder,
int16_t *src, int16_t src_stride, int width, int height, int16_t *src, int16_t src_stride, int width, int height,
int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
void filter_inter_octpel_chroma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst,
int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
void filter_inter_quarterpel_luma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst,
int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
// SAO // SAO
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////

View file

@ -64,6 +64,45 @@ void inter_set_block(picture* pic, uint32_t x_cu, uint32_t y_cu, uint8_t depth,
} }
} }
/**
 * \brief Copy a block plus filter margins from a reference frame, clamping
 *        out-of-frame coordinates to the nearest edge pixel.
 *
 * Copies a (width + filterSize) x (height + filterSize) region centred on the
 * motion-compensated block position into dst, extending halfFilterSize pixels
 * in every direction. Coordinates that fall outside the reference frame are
 * replaced by the nearest border pixel (edge replication).
 *
 * \param xpos, ypos   block position inside the tile/picture
 * \param mv_x, mv_y   integer-pel motion vector
 * \param off_x, off_y tile offset added to the source coordinates
 * \param ref          reference frame pixels
 * \param ref_width    reference frame width
 * \param ref_height   reference frame height
 * \param filterSize   number of extra rows/columns needed by the filter
 * \param width        block width
 * \param height       block height
 * \param dst          output, stride is width + filterSize
 */
void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height,
                    int filterSize, int width, int height, int16_t *dst) {
  int half = filterSize >> 1;
  int dst_stride = width + filterSize;
  int row, col;

  for (row = 0; row < height + filterSize; row++) {
    // Motion-compensated source row, clamped to the frame.
    int src_y = ypos - half + row + off_y + mv_y;
    if (src_y < 0) src_y = 0;
    if (src_y > ref_height - 1) src_y = ref_height - 1;

    for (col = 0; col < width + filterSize; col++) {
      // Motion-compensated source column, clamped to the frame.
      int src_x = xpos - half + col + off_x + mv_x;
      if (src_x < 0) src_x = 0;
      if (src_x > ref_width - 1) src_x = ref_width - 1;

      // Store source block data (with extended borders).
      dst[row * dst_stride + col] = ref[src_y * ref_width + src_x];
    }
  }
}
/** /**
* \brief Reconstruct inter block * \brief Reconstruct inter block
* \param ref picture to copy the data from * \param ref picture to copy the data from
@ -100,12 +139,91 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
int16_t halfpel_u[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (u) int16_t halfpel_u[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (u)
int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v) int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v)
// TODO: Fractional pixel support // Luma quarter-pel
int8_t fractional_mv = (mv[0]&1) || (mv[1]&1) || (mv[0]&2) || (mv[1]&2); // 2 lowest bits of mv set -> mv is fractional
if(fractional_mv) {
int y_off_x = (mv[0]&3);
int y_off_y = (mv[1]&3);
int c_off_x = (mv[0]&7);
int c_off_y = (mv[1]&7);
int y,x;
#define FILTER_SIZE_Y 8
#define FILTER_SIZE_C 4
//vector2d orig = {xpos, ypos};
//vector2d orig_c = {xpos>>1, ypos>>1};
// Fractional luma 1/4-pel
int16_t qpel_src_y[(LCU_WIDTH+FILTER_SIZE_Y) * (LCU_WIDTH+FILTER_SIZE_Y)];
int16_t* qpel_src_off_y = &qpel_src_y[(width+FILTER_SIZE_Y)*(FILTER_SIZE_Y>>1)+(FILTER_SIZE_Y>>1)];
int16_t qpel_dst_y[LCU_WIDTH*LCU_WIDTH*16];
// Fractional chroma 1/8-pel
int width_c = width>>1;
int16_t octpel_src_u[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)];
int16_t* octpel_src_off_u = &octpel_src_u[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)];
int16_t octpel_dst_u[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64];
int16_t octpel_src_v[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)];
int16_t* octpel_src_off_v = &octpel_src_v[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)];
int16_t octpel_dst_v[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64];
// Fractional luma
extend_borders(xpos, ypos, mv[0]>>2, mv[1]>>2, encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->lcu_offset_y * LCU_WIDTH,
ref->y_recdata, ref->width, ref->height, FILTER_SIZE_Y, width, width, qpel_src_y);
filter_inter_quarterpel_luma(encoder_state->encoder_control, qpel_src_off_y, width+FILTER_SIZE_Y, width,
width, qpel_dst_y, width*4, y_off_x, y_off_y);
//Fractional chroma U
extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1),
ref->u_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_u);
filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_u, width_c+FILTER_SIZE_C, width_c,
width_c, octpel_dst_u, width_c*8, c_off_x, c_off_y);
//Fractional chroma V
extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1),
ref->v_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_v);
filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_v, width_c+FILTER_SIZE_C, width_c,
width_c, octpel_dst_v, width_c*8, c_off_x, c_off_y);
//Luma
for(y = 0; y < width; ++y) {
int y_in_lcu = ((y+ypos) & ((LCU_WIDTH)-1));
int qpel_y = y*4+y_off_y;
for(x = 0; x < width; ++x) {
int x_in_lcu = ((x+xpos) & ((LCU_WIDTH)-1));
int qpel_x = x*4+y_off_x;
//printf("x: %d, y: %d\n", off_x, off_y);
lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)qpel_dst_y[qpel_y*(width*4)+qpel_x];
//printf("i: %d", qpel_y*(width*4)+qpel_x);
}
}
//Chroma
for(y = 0; y < width_c; ++y) {
int y_in_lcu = ((y+(ypos>>1)) & ((LCU_WIDTH>>1)-1));
int qpel_y = y*8+c_off_y;
for(x = 0; x < width_c; ++x) {
int x_in_lcu = ((x+(xpos>>1)) & ((LCU_WIDTH>>1)-1));
int qpel_x = x*8+c_off_x;
lcu->rec.u[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_u[qpel_y*(width_c*8)+qpel_x];
lcu->rec.v[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_v[qpel_y*(width_c*8)+qpel_x];
}
}
}
mv[0] >>= 2; mv[0] >>= 2;
mv[1] >>= 2; mv[1] >>= 2;
// Chroma half-pel // Chroma half-pel
// get half-pel interpolated block and push it to output // get half-pel interpolated block and push it to output
if(!fractional_mv) {
if(chroma_halfpel) { if(chroma_halfpel) {
int halfpel_y, halfpel_x; int halfpel_y, halfpel_x;
int abs_mv_x = mv[0]&1; int abs_mv_x = mv[0]&1;
@ -251,6 +369,7 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
} }
} }
} }
}
} }
/** /**

View file

@ -37,4 +37,6 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i
cu_info **b2,cu_info **a0,cu_info **a1, lcu_t *lcu); cu_info **b2,cu_info **a0,cu_info **a1, lcu_t *lcu);
void inter_get_mv_cand(const encoder_state *encoder_state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info* cur_cu, lcu_t *lcu); void inter_get_mv_cand(const encoder_state *encoder_state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info* cur_cu, lcu_t *lcu);
uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu); uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu);
void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height,
int filterSize, int width, int height, int16_t *dst);
#endif #endif

View file

@ -72,6 +72,16 @@ const vector2d small_hexbs[5] = {
{ -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 } { -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 }
}; };
/*
 * Square search pattern: the 9 offsets of a 3x3 neighbourhood around (0,0).
 * Index 4 is the centre offset {0,0}. Index layout (x grows right):
 * 6 7 8
 * 3 4 5
 * 0 1 2
 */
const vector2d square[9] = {
{ -1, 1 },
{ 0, 1 }, { 1, 1 }, { -1, 0 }, { 0, 0 }, { 1, 0 }, { -1, -1 },
{ 0, -1 }, { 1, -1 }
};
static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count) static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
{ {
@ -118,7 +128,7 @@ static uint32_t get_mvd_coding_cost(vector2d *mvd)
return bitcost; return bitcost;
} }
static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, int mv_shift,
int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3], int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
int16_t num_cand,int32_t ref_idx, uint32_t *bitcost) int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
{ {
@ -129,8 +139,8 @@ static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y
int8_t merged = 0; int8_t merged = 0;
int8_t cur_mv_cand = 0; int8_t cur_mv_cand = 0;
x <<= 2; x <<= mv_shift;
y <<= 2; y <<= mv_shift;
// Check every candidate to find a match // Check every candidate to find a match
for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) { for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
@ -205,7 +215,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost < best_cost) { if (cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -220,7 +230,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, 0, 0, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
// If the 0,0 is better, redo the hexagon around that point. // If the 0,0 is better, redo the hexagon around that point.
if (cost < best_cost) { if (cost < best_cost) {
@ -236,7 +246,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost < best_cost) { if (cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -271,7 +281,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost < best_cost) { if (cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -294,7 +304,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost > 0 && cost < best_cost) { if (cost > 0 && cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -369,6 +379,139 @@ static unsigned search_mv_full(unsigned depth,
} }
#endif #endif
/**
 * \brief Refine a motion vector to quarter-pel precision around its current
 *        integer-pel position.
 *
 * Interpolates a (block_width+1)^2 area to quarter-pel resolution, then runs
 * two 3x3 searches with the square[] pattern: first the half-pel positions
 * around the integer mv, then the quarter-pel positions around the best
 * half-pel mv. Costs are half the SATD of the filtered block plus the mvd
 * bit cost.
 *
 * \param mv_in_out   in: quarter-pel mv to refine; out: refined quarter-pel mv
 * \param bitcost_out bit cost of the chosen mv
 * \return cost of the best fractional position
 */
static unsigned search_frac( const encoder_state * const encoder_state,
                             unsigned depth,
                             const picture *pic, const picture *ref,
                             const vector2d *orig, vector2d *mv_in_out,
                             int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
                             int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) {

  // Drop mv_in_out (quarter-pel units) to integer-pel precision.
  vector2d mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };

  int block_width = CU_WIDTH_FROM_DEPTH(depth);

  unsigned best_cost = UINT32_MAX;
  uint32_t best_bitcost = 0, bitcost;
  unsigned i;
  // Index into square[]; 4 is the centre offset {0,0}, so an un-improved
  // search round leaves the mv unchanged.
  unsigned best_index = 4;
  unsigned cost = 0;

  cost_16bit_nxn_func satd = get_satd_16bit_nxn_func(block_width);

  vector2d halfpel_offset;

 #define FILTER_SIZE 8
 #define HALF_FILTER (FILTER_SIZE>>1)

  // Source buffer for the block plus the extra margin needed by the 8-tap filter.
  int src_stride = block_width+FILTER_SIZE+1;
  int16_t src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
  int16_t* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];

  // Destination buffer for the quarter-pel interpolated block.
  int dst_stride = (block_width+1)*4;
  int16_t dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16];
  // dst_off points at the sample for the integer mv; fractional candidates
  // around it index forwards and backwards from here.
  int16_t* dst_off = &dst[dst_stride*4+4];

  // Fetch source pixels one integer pixel up-left of the current mv so every
  // fractional candidate around it is covered, then interpolate.
  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
                 encoder_state->tile->lcu_offset_x * LCU_WIDTH,
                 encoder_state->tile->lcu_offset_y * LCU_WIDTH,
                 ref->y_data, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);

  filter_inter_quarterpel_luma(encoder_state->encoder_control, src_off, src_stride, block_width+1,
                               block_width+1, dst, dst_stride, 1, 1);

  // Move mv to half-pel precision.
  mv.x <<= 1;
  mv.y <<= 1;

  // Search halfpel positions around best integer mv.
  for (i = 0; i < 9; ++i) {
    const vector2d *pattern = &square[i];

    pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
    pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];

    int y,x;
    for(y = 0; y < block_width; ++y) {
      int dst_y = y*4+pattern->y*2;
      for(x = 0; x < block_width; ++x) {
        int dst_x = x*4+pattern->x*2;
        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
        tmp_pic[y*block_width+x] = (uint8_t)pic->y_data[orig->x+x + (orig->y+y)*pic->width];
      }
    }

    cost = satd(tmp_pic,tmp_filtered);
    cost = cost>>1;
    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);

    if (cost < best_cost) {
      best_cost    = cost;
      best_index   = i;
      best_bitcost = bitcost;
    }
  }

  // Apply the best half-pel offset.
  mv.x += square[best_index].x;
  mv.y += square[best_index].y;
  halfpel_offset.x = square[best_index].x*2;
  halfpel_offset.y = square[best_index].y*2;

  // Move mv to quarter-pel precision.
  mv.x <<= 1;
  mv.y <<= 1;

  // Reset to the centre offset: if no quarter-pel candidate improves on the
  // half-pel best_cost, the stale half-pel index must not leak through and
  // add a bogus quarter-pel offset to the mv below.
  best_index = 4;

  // Search quarterpel points around best halfpel mv.
  for (i = 0; i < 9; ++i) {
    const vector2d *pattern = &square[i];

    pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
    pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];

    int y,x;
    for(y = 0; y < block_width; ++y) {
      int dst_y = y*4+halfpel_offset.y+pattern->y;
      for(x = 0; x < block_width; ++x) {
        int dst_x = x*4+halfpel_offset.x+pattern->x;
        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
        tmp_pic[y*block_width+x] = (uint8_t)pic->y_data[orig->x+x + (orig->y+y)*pic->width];
      }
    }

    cost = satd(tmp_pic,tmp_filtered);
    cost = cost>>1;
    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);

    if (cost < best_cost) {
      best_cost    = cost;
      best_index   = i;
      best_bitcost = bitcost;
    }
  }

  // Apply the best quarter-pel offset; mv is now the final quarter-pel mv.
  mv.x += square[best_index].x;
  mv.y += square[best_index].y;

  mv_in_out->x = mv.x;
  mv_in_out->y = mv.y;

  *bitcost_out = best_bitcost;

  return best_cost;
}
/** /**
* Update lcu to have best modes at this depth. * Update lcu to have best modes at this depth.
* \return Cost of best mode. * \return Cost of best mode.
@ -425,6 +568,8 @@ static int search_cu_inter(const encoder_state * const encoder_state, int x, int
temp_cost += hexagon_search(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); temp_cost += hexagon_search(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
#endif #endif
temp_cost = search_frac(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
merged = 0; merged = 0;
// Check every candidate to find a match // Check every candidate to find a match
for(merge_idx = 0; merge_idx < num_cand; merge_idx++) { for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {