Added fractional pixel motion estimation

Added fractional mv support for inter recon

Added 1/8-pel chroma and 1/4-pel luma interpolation
This commit is contained in:
Tapio Katajisto 2014-05-14 01:42:02 +00:00
parent 6c7e4dbeef
commit efc43c8b3a
5 changed files with 806 additions and 128 deletions

View file

@ -515,7 +515,7 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t
* ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1
* ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2
*/ */
int i = 0;
int32_t x, y; int32_t x, y;
int32_t shift1 = encoder->bitdepth-8; int32_t shift1 = encoder->bitdepth-8;
int32_t shift2 = 6; int32_t shift2 = 6;
@ -568,4 +568,410 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t
} }
} }
} }
//Clamp values to bitdepth
for(i = 0; i < width*height*4; ++i) {
if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
if(dst[i] < 0) dst[i] = 0;
}
}
/**
 * \brief Interpolate a chroma block to 1/8-pel resolution.
 *
 * For each source pixel up to 8x8 = 64 output samples are produced into dst:
 * the integer sample itself, 7 horizontal 1/8-pel samples (hor_flag),
 * 7 vertical 1/8-pel samples (ver_flag) and 7x7 diagonal samples (both
 * flags set). Fractional samples are spaced 8 apart in dst in both
 * directions. The indexing (src_pos - 1 .. + 2, +-src_stride) means src must
 * have at least a 1 px margin above/left and 2 px below/right of the block.
 *
 * \param encoder    encoder control, provides the configured bit depth
 * \param src        source pixels (with filter margins, see above)
 * \param src_stride stride of src in samples
 * \param width      block width in integer pixels
 * \param height     block height in integer pixels
 * \param dst        output buffer, 64 samples per input pixel
 * \param dst_stride stride of dst in samples (callers pass width*8)
 * \param hor_flag   generate horizontal fractional samples
 * \param ver_flag   generate vertical fractional samples
 */
void filter_inter_octpel_chroma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
{
  int32_t x, y;
  int32_t shift1 = encoder->bitdepth-8;           // first (horizontal) filter stage shift
  int32_t shift2 = 6;                             // second (vertical) filter stage shift
  int32_t shift3 = 14-encoder->bitdepth;          // final scaling back to pixel range
  int32_t offset3 = 1 << (shift3 - 1);            // rounding offset for shift3
  int32_t offset23 = 1 << (shift2 + shift3 - 1);  // rounding offset for combined shift2+shift3
  // Coefficients for the 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions.
  int16_t c1[4], c2[4], c3[4], c4[4], c5[4], c6[4], c7[4];
  int i;
  for(i = 0; i < 4; ++i ) {
    c1[i] = g_chroma_filter[1][i];
    c2[i] = g_chroma_filter[2][i];
    c3[i] = g_chroma_filter[3][i];
    c4[i] = g_chroma_filter[4][i];
    c5[i] = g_chroma_filter[5][i];
    c6[i] = g_chroma_filter[6][i];
    c7[i] = g_chroma_filter[7][i];
  }

  // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round.
  for (y = 0; y < height; y++) {
    int dst_pos_y = (y<<3)*dst_stride;
    int src_pos_y = y*src_stride;
    for (x = 0; x < width; x++) {
      // Calculate current dst and src pixel positions.
      int dst_pos = dst_pos_y+(x<<3);
      int src_pos = src_pos_y+x;

      // Temporary horizontally interpolated positions (row 0, columns 1/8..7/8),
      // kept at the intermediate (shift1) precision for the diagonal pass.
      int32_t h_temp[7] = {0,0,0,0,0,0,0};

      // Original integer pixel.
      dst[dst_pos] = src[src_pos];

      // Horizontal 1/8-values for row 0 (not yet scaled down to pixel range).
      if (hor_flag) {
        h_temp[0] = ((c1[0]*src[src_pos - 1]
                      + c1[1]*src[src_pos]
                      + c1[2]*src[src_pos + 1]
                      + c1[3]*src[src_pos + 2]) >> shift1); // h0 (1/8 position)
        h_temp[1] = ((c2[0]*src[src_pos - 1]
                      + c2[1]*src[src_pos]
                      + c2[2]*src[src_pos + 1]
                      + c2[3]*src[src_pos + 2]) >> shift1); // h1 (2/8 position)
        h_temp[2] = ((c3[0]*src[src_pos - 1]
                      + c3[1]*src[src_pos]
                      + c3[2]*src[src_pos + 1]
                      + c3[3]*src[src_pos + 2]) >> shift1); // h2 (3/8 position)
        h_temp[3] = ((c4[0]*src[src_pos - 1]
                      + c4[1]*src[src_pos]
                      + c4[2]*src[src_pos + 1]
                      + c4[3]*src[src_pos + 2]) >> shift1); // h3 (4/8 position)
        h_temp[4] = ((c5[0]*src[src_pos - 1]
                      + c5[1]*src[src_pos]
                      + c5[2]*src[src_pos + 1]
                      + c5[3]*src[src_pos + 2]) >> shift1); // h4 (5/8 position)
        h_temp[5] = ((c6[0]*src[src_pos - 1]
                      + c6[1]*src[src_pos]
                      + c6[2]*src[src_pos + 1]
                      + c6[3]*src[src_pos + 2]) >> shift1); // h5 (6/8 position)
        h_temp[6] = ((c7[0]*src[src_pos - 1]
                      + c7[1]*src[src_pos]
                      + c7[2]*src[src_pos + 1]
                      + c7[3]*src[src_pos + 2]) >> shift1); // h6 (7/8 position)
      }

      // Vertical 1/8-values: fully scaled with rounding and written straight to dst.
      if(ver_flag) {
        dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 1*src_stride]
                      + c1[1]*src[src_pos]
                      + c1[2]*src[src_pos + 1*src_stride]
                      + c1[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 1/8
        dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 1*src_stride]
                      + c2[1]*src[src_pos]
                      + c2[2]*src[src_pos + 1*src_stride]
                      + c2[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 2/8
        dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 1*src_stride]
                      + c3[1]*src[src_pos]
                      + c3[2]*src[src_pos + 1*src_stride]
                      + c3[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 3/8
        dst[dst_pos + 4*dst_stride] = (((c4[0]*src[src_pos - 1*src_stride]
                      + c4[1]*src[src_pos]
                      + c4[2]*src[src_pos + 1*src_stride]
                      + c4[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 4/8
        dst[dst_pos + 5*dst_stride] = (((c5[0]*src[src_pos - 1*src_stride]
                      + c5[1]*src[src_pos]
                      + c5[2]*src[src_pos + 1*src_stride]
                      + c5[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 5/8
        dst[dst_pos + 6*dst_stride] = (((c6[0]*src[src_pos - 1*src_stride]
                      + c6[1]*src[src_pos]
                      + c6[2]*src[src_pos + 1*src_stride]
                      + c6[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 6/8
        dst[dst_pos + 7*dst_stride] = (((c7[0]*src[src_pos - 1*src_stride]
                      + c7[1]*src[src_pos]
                      + c7[2]*src[src_pos + 1*src_stride]
                      + c7[3]*src[src_pos + 2*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 7/8
      }

      // When both flags are set, interpolate the 7x7 diagonal values by
      // vertically filtering horizontally interpolated rows.
      if (hor_flag && ver_flag) {
        // Horizontal 1/8 values for rows -1, +1 and +2 (row 0 is already in h_temp).
        int32_t temp[3][7];
        // Calculate temporary values: move to row -1 first.
        src_pos -= 1*src_stride;
        for(i = 0; i < 3; ++i) {
          temp[i][0] = ((c1[0]*src[src_pos - 1] + c1[1]*src[src_pos]
                         + c1[2]*src[src_pos + 1] + c1[3]*src[src_pos + 2])
                        >> shift1); // h0 of current row
          temp[i][1] = ((c2[0]*src[src_pos - 1] + c2[1]*src[src_pos]
                         + c2[2]*src[src_pos + 1] + c2[3]*src[src_pos + 2])
                        >> shift1); // h1 of current row
          temp[i][2] = ((c3[0]*src[src_pos - 1] + c3[1]*src[src_pos]
                         + c3[2]*src[src_pos + 1] + c3[3]*src[src_pos + 2])
                        >> shift1); // h2 of current row
          temp[i][3] = ((c4[0]*src[src_pos - 1] + c4[1]*src[src_pos]
                         + c4[2]*src[src_pos + 1] + c4[3]*src[src_pos + 2])
                        >> shift1); // h3 of current row
          temp[i][4] = ((c5[0]*src[src_pos - 1] + c5[1]*src[src_pos]
                         + c5[2]*src[src_pos + 1] + c5[3]*src[src_pos + 2])
                        >> shift1); // h4 of current row
          temp[i][5] = ((c6[0]*src[src_pos - 1] + c6[1]*src[src_pos]
                         + c6[2]*src[src_pos + 1] + c6[3]*src[src_pos + 2])
                        >> shift1); // h5 of current row
          temp[i][6] = ((c7[0]*src[src_pos - 1] + c7[1]*src[src_pos]
                         + c7[2]*src[src_pos + 1] + c7[3]*src[src_pos + 2])
                        >> shift1); // h6 of current row
          if(i == 0) {
            // Skip row 0: its horizontal values are already in h_temp.
            src_pos += 2*src_stride;
          } else {
            src_pos += src_stride;
          }
        }
        // Vertically filter rows (-1, 0, +1, +2) = (temp[0], h_temp, temp[1],
        // temp[2]) into the 7x7 diagonal samples at columns/rows 1..7.
        for(i=0;i<7;++i){
          dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*h_temp[i]
                                                 + c1[2]*temp[1][i] + c1[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 1/8
          dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*h_temp[i]
                                                 + c2[2]*temp[1][i] + c2[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 2/8
          dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*h_temp[i]
                                                 + c3[2]*temp[1][i] + c3[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 3/8
          dst[dst_pos + 4*dst_stride + i+1] = (((c4[0]*temp[0][i] + c4[1]*h_temp[i]
                                                 + c4[2]*temp[1][i] + c4[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 4/8
          dst[dst_pos + 5*dst_stride + i+1] = (((c5[0]*temp[0][i] + c5[1]*h_temp[i]
                                                 + c5[2]*temp[1][i] + c5[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 5/8
          dst[dst_pos + 6*dst_stride + i+1] = (((c6[0]*temp[0][i] + c6[1]*h_temp[i]
                                                 + c6[2]*temp[1][i] + c6[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 6/8
          dst[dst_pos + 7*dst_stride + i+1] = (((c7[0]*temp[0][i] + c7[1]*h_temp[i]
                                                 + c7[2]*temp[1][i] + c7[3]*temp[2][i])
                                                + offset23) >> shift2) >> shift3; // row 7/8
        }
      }

      // Finally scale the horizontal row-0 values down to pixel range.
      if(hor_flag) {
        dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3;
        dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3;
        dst[dst_pos + 3] = (h_temp[2] + offset3) >> shift3;
        dst[dst_pos + 4] = (h_temp[3] + offset3) >> shift3;
        dst[dst_pos + 5] = (h_temp[4] + offset3) >> shift3;
        dst[dst_pos + 6] = (h_temp[5] + offset3) >> shift3;
        dst[dst_pos + 7] = (h_temp[6] + offset3) >> shift3;
      }
    }
  }

  // Clamp all 64 samples per pixel to the valid range [0, 2^bitdepth - 1].
  for(i = 0; i < width*height*64; ++i) {
    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
    if(dst[i] < 0) dst[i] = 0;
  }
}
/**
 * \brief Interpolate a luma block to 1/4-pel resolution with the 8-tap filter.
 *
 * For each source pixel up to 4x4 = 16 output samples are produced into dst:
 * the integer sample itself, 3 horizontal 1/4-pel samples (hor_flag),
 * 3 vertical 1/4-pel samples (ver_flag) and 3x3 diagonal samples (both
 * flags set). The indexing (src_pos - 3 .. + 4, +-3..4 rows) means src must
 * have at least a 3 px margin above/left and 4 px below/right of the block.
 *
 * \param encoder    encoder control, provides the configured bit depth
 * \param src        source pixels (with filter margins, see above)
 * \param src_stride stride of src in samples
 * \param width      block width in integer pixels
 * \param height     block height in integer pixels
 * \param dst        output buffer, 16 samples per input pixel
 * \param dst_stride stride of dst in samples (callers pass width*4)
 * \param hor_flag   generate horizontal fractional samples
 * \param ver_flag   generate vertical fractional samples
 */
void filter_inter_quarterpel_luma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
{
  int32_t x, y;
  int32_t shift1 = encoder->bitdepth-8;           // first (horizontal) filter stage shift
  int32_t shift2 = 6;                             // second (vertical) filter stage shift
  int32_t shift3 = 14-encoder->bitdepth;          // final scaling back to pixel range
  int32_t offset3 = 1 << (shift3 - 1);            // rounding offset for shift3
  int32_t offset23 = 1 << (shift2 + shift3 - 1);  // rounding offset for combined shift2+shift3
  // Coefficients for the 1/4, 2/4 and 3/4 positions.
  int16_t c1[8], c2[8], c3[8];
  int i;
  for(i = 0; i < 8; ++i ) {
    c1[i] = g_luma_filter[1][i];
    c2[i] = g_luma_filter[2][i];
    c3[i] = g_luma_filter[3][i];
  }

  // Loop source pixels and generate sixteen filtered quarter-pel pixels on each round.
  for (y = 0; y < height; y++) {
    int dst_pos_y = (y<<2)*dst_stride;
    int src_pos_y = y*src_stride;
    for (x = 0; x < width; x++) {
      // Calculate current dst and src pixel positions.
      int dst_pos = dst_pos_y+(x<<2);
      int src_pos = src_pos_y+x;

      // Temporary horizontally interpolated values of row 0 (columns
      // 1/4..3/4), kept at the intermediate (shift1) precision.
      int32_t h_temp[3] = {0,0,0};

      // Original integer pixel.
      dst[dst_pos] = src[src_pos];

      // Horizontal 1/4-values for row 0 (not yet scaled down to pixel range).
      if (hor_flag) {
        h_temp[0] = ((c1[0]*src[src_pos - 3]
                      + c1[1]*src[src_pos - 2]
                      + c1[2]*src[src_pos - 1]
                      + c1[3]*src[src_pos]
                      + c1[4]*src[src_pos + 1]
                      + c1[5]*src[src_pos + 2]
                      + c1[6]*src[src_pos + 3]
                      + c1[7]*src[src_pos + 4]) >> shift1); // h0 (1/4 position)
        h_temp[1] = ((c2[0]*src[src_pos - 3]
                      + c2[1]*src[src_pos - 2]
                      + c2[2]*src[src_pos - 1]
                      + c2[3]*src[src_pos]
                      + c2[4]*src[src_pos + 1]
                      + c2[5]*src[src_pos + 2]
                      + c2[6]*src[src_pos + 3]
                      + c2[7]*src[src_pos + 4]) >> shift1); // h1 (2/4 position)
        h_temp[2] = ((c3[0]*src[src_pos - 3]
                      + c3[1]*src[src_pos - 2]
                      + c3[2]*src[src_pos - 1]
                      + c3[3]*src[src_pos]
                      + c3[4]*src[src_pos + 1]
                      + c3[5]*src[src_pos + 2]
                      + c3[6]*src[src_pos + 3]
                      + c3[7]*src[src_pos + 4]) >> shift1); // h2 (3/4 position)
      }

      // Vertical 1/4-values: fully scaled with rounding and written straight to dst.
      if(ver_flag) {
        dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 3*src_stride]
                      + c1[1]*src[src_pos - 2*src_stride]
                      + c1[2]*src[src_pos - 1*src_stride]
                      + c1[3]*src[src_pos]
                      + c1[4]*src[src_pos + 1*src_stride]
                      + c1[5]*src[src_pos + 2*src_stride]
                      + c1[6]*src[src_pos + 3*src_stride]
                      + c1[7]*src[src_pos + 4*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 1/4
        dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 3*src_stride]
                      + c2[1]*src[src_pos - 2*src_stride]
                      + c2[2]*src[src_pos - 1*src_stride]
                      + c2[3]*src[src_pos]
                      + c2[4]*src[src_pos + 1*src_stride]
                      + c2[5]*src[src_pos + 2*src_stride]
                      + c2[6]*src[src_pos + 3*src_stride]
                      + c2[7]*src[src_pos + 4*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 2/4
        dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 3*src_stride]
                      + c3[1]*src[src_pos - 2*src_stride]
                      + c3[2]*src[src_pos - 1*src_stride]
                      + c3[3]*src[src_pos]
                      + c3[4]*src[src_pos + 1*src_stride]
                      + c3[5]*src[src_pos + 2*src_stride]
                      + c3[6]*src[src_pos + 3*src_stride]
                      + c3[7]*src[src_pos + 4*src_stride]) >> shift1)
                      + (1<<(shift3-1))) >> shift3; // v 3/4
      }

      // When both flags are set, interpolate the 3x3 diagonal values by
      // vertically filtering horizontally interpolated rows.
      if (hor_flag && ver_flag) {
        // Horizontal 1/4 values for rows -3..-1 and +1..+4 (row 0 is in h_temp).
        int32_t temp[7][3];
        // Calculate temporary values: move to row -3 first.
        src_pos -= 3*src_stride; //0,-3
        for(i = 0; i < 7; ++i) {
          temp[i][0] = ((c1[0]*src[src_pos - 3] + c1[1]*src[src_pos - 2]
                         + c1[2]*src[src_pos - 1] + c1[3]*src[src_pos]
                         + c1[4]*src[src_pos + 1] + c1[5]*src[src_pos + 2]
                         + c1[6]*src[src_pos + 3] + c1[7]*src[src_pos + 4])
                        >> shift1); // h0 of current row
          temp[i][1] = ((c2[0]*src[src_pos - 3] + c2[1]*src[src_pos - 2]
                         + c2[2]*src[src_pos - 1] + c2[3]*src[src_pos]
                         + c2[4]*src[src_pos + 1] + c2[5]*src[src_pos + 2]
                         + c2[6]*src[src_pos + 3] + c2[7]*src[src_pos + 4])
                        >> shift1); // h1 of current row
          temp[i][2] = ((c3[0]*src[src_pos - 3] + c3[1]*src[src_pos - 2]
                         + c3[2]*src[src_pos - 1] + c3[3]*src[src_pos]
                         + c3[4]*src[src_pos + 1] + c3[5]*src[src_pos + 2]
                         + c3[6]*src[src_pos + 3] + c3[7]*src[src_pos + 4])
                        >> shift1); // h2 of current row
          if(i == 2) {
            // Skip row 0: its horizontal values are already in h_temp.
            src_pos += 2*src_stride;
          } else {
            src_pos += src_stride;
          }
        }
        // Vertically filter rows (-3..-1, 0, +1..+4) = (temp[0..2], h_temp,
        // temp[3..6]) into the 3x3 diagonal samples at columns/rows 1..3.
        for(i=0;i<3;++i){
          dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*temp[1][i]
                                                 + c1[2]*temp[2][i] + c1[3]*h_temp[i]
                                                 + c1[4]*temp[3][i] + c1[5]*temp[4][i]
                                                 + c1[6]*temp[5][i] + c1[7]*temp[6][i])
                                                + offset23) >> shift2) >> shift3; // row 1/4
          dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*temp[1][i]
                                                 + c2[2]*temp[2][i] + c2[3]*h_temp[i]
                                                 + c2[4]*temp[3][i] + c2[5]*temp[4][i]
                                                 + c2[6]*temp[5][i] + c2[7]*temp[6][i])
                                                + offset23) >> shift2) >> shift3; // row 2/4
          dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*temp[1][i]
                                                 + c3[2]*temp[2][i] + c3[3]*h_temp[i]
                                                 + c3[4]*temp[3][i] + c3[5]*temp[4][i]
                                                 + c3[6]*temp[5][i] + c3[7]*temp[6][i])
                                                + offset23) >> shift2) >> shift3; // row 3/4
        }
      }

      // Finally scale the horizontal row-0 values down to pixel range.
      if(hor_flag) {
        dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3;
        dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3;
        dst[dst_pos + 3] = (h_temp[2] + offset3) >> shift3;
      }
    }
  }

  // Clamp all 16 samples per pixel to the valid range [0, 2^bitdepth - 1].
  for(i = 0; i < width*height*16; ++i) {
    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
    if(dst[i] < 0) dst[i] = 0;
  }
}

View file

@ -54,6 +54,12 @@ void filter_inter_halfpel_chroma(const encoder_control * encoder,
int16_t *src, int16_t src_stride, int width, int height, int16_t *src, int16_t src_stride, int width, int height,
int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
void filter_inter_octpel_chroma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst,
int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
void filter_inter_quarterpel_luma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst,
int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
// SAO // SAO
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////

View file

@ -64,6 +64,45 @@ void inter_set_block(picture* pic, uint32_t x_cu, uint32_t y_cu, uint8_t depth,
} }
} }
/**
 * \brief Copy a block plus filter margins from a reference frame, clamping
 *        out-of-frame coordinates to the nearest edge pixel.
 *
 * Copies a (width + filterSize) x (height + filterSize) region centred on the
 * motion-compensated block position into dst, extending halfFilterSize pixels
 * in every direction. Coordinates that fall outside the reference frame are
 * replaced by the nearest border pixel (edge replication).
 *
 * \param xpos, ypos   block position inside the tile/picture
 * \param mv_x, mv_y   integer-pel motion vector
 * \param off_x, off_y tile offset added to the source coordinates
 * \param ref          reference frame pixels
 * \param ref_width    reference frame width
 * \param ref_height   reference frame height
 * \param filterSize   number of extra rows/columns needed by the filter
 * \param width        block width
 * \param height       block height
 * \param dst          output, stride is width + filterSize
 */
void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height,
                    int filterSize, int width, int height, int16_t *dst) {
  int half = filterSize >> 1;
  int dst_stride = width + filterSize;
  int row, col;

  for (row = 0; row < height + filterSize; row++) {
    // Motion-compensated source row, clamped to the frame.
    int src_y = ypos - half + row + off_y + mv_y;
    if (src_y < 0) src_y = 0;
    if (src_y > ref_height - 1) src_y = ref_height - 1;

    for (col = 0; col < width + filterSize; col++) {
      // Motion-compensated source column, clamped to the frame.
      int src_x = xpos - half + col + off_x + mv_x;
      if (src_x < 0) src_x = 0;
      if (src_x > ref_width - 1) src_x = ref_width - 1;

      // Store source block data (with extended borders).
      dst[row * dst_stride + col] = ref[src_y * ref_width + src_x];
    }
  }
}
/** /**
* \brief Reconstruct inter block * \brief Reconstruct inter block
* \param ref picture to copy the data from * \param ref picture to copy the data from
@ -100,12 +139,91 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
int16_t halfpel_u[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (u) int16_t halfpel_u[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (u)
int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v) int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v)
// TODO: Fractional pixel support // Luma quarter-pel
int8_t fractional_mv = (mv[0]&1) || (mv[1]&1) || (mv[0]&2) || (mv[1]&2); // 2 lowest bits of mv set -> mv is fractional
if(fractional_mv) {
int y_off_x = (mv[0]&3);
int y_off_y = (mv[1]&3);
int c_off_x = (mv[0]&7);
int c_off_y = (mv[1]&7);
int y,x;
#define FILTER_SIZE_Y 8
#define FILTER_SIZE_C 4
//vector2d orig = {xpos, ypos};
//vector2d orig_c = {xpos>>1, ypos>>1};
// Fractional luma 1/4-pel
int16_t qpel_src_y[(LCU_WIDTH+FILTER_SIZE_Y) * (LCU_WIDTH+FILTER_SIZE_Y)];
int16_t* qpel_src_off_y = &qpel_src_y[(width+FILTER_SIZE_Y)*(FILTER_SIZE_Y>>1)+(FILTER_SIZE_Y>>1)];
int16_t qpel_dst_y[LCU_WIDTH*LCU_WIDTH*16];
// Fractional chroma 1/8-pel
int width_c = width>>1;
int16_t octpel_src_u[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)];
int16_t* octpel_src_off_u = &octpel_src_u[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)];
int16_t octpel_dst_u[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64];
int16_t octpel_src_v[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)];
int16_t* octpel_src_off_v = &octpel_src_v[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)];
int16_t octpel_dst_v[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64];
// Fractional luma
extend_borders(xpos, ypos, mv[0]>>2, mv[1]>>2, encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->lcu_offset_y * LCU_WIDTH,
ref->y_recdata, ref->width, ref->height, FILTER_SIZE_Y, width, width, qpel_src_y);
filter_inter_quarterpel_luma(encoder_state->encoder_control, qpel_src_off_y, width+FILTER_SIZE_Y, width,
width, qpel_dst_y, width*4, y_off_x, y_off_y);
//Fractional chroma U
extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1),
ref->u_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_u);
filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_u, width_c+FILTER_SIZE_C, width_c,
width_c, octpel_dst_u, width_c*8, c_off_x, c_off_y);
//Fractional chroma V
extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1),
ref->v_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_v);
filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_v, width_c+FILTER_SIZE_C, width_c,
width_c, octpel_dst_v, width_c*8, c_off_x, c_off_y);
//Luma
for(y = 0; y < width; ++y) {
int y_in_lcu = ((y+ypos) & ((LCU_WIDTH)-1));
int qpel_y = y*4+y_off_y;
for(x = 0; x < width; ++x) {
int x_in_lcu = ((x+xpos) & ((LCU_WIDTH)-1));
int qpel_x = x*4+y_off_x;
//printf("x: %d, y: %d\n", off_x, off_y);
lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)qpel_dst_y[qpel_y*(width*4)+qpel_x];
//printf("i: %d", qpel_y*(width*4)+qpel_x);
}
}
//Chroma
for(y = 0; y < width_c; ++y) {
int y_in_lcu = ((y+(ypos>>1)) & ((LCU_WIDTH>>1)-1));
int qpel_y = y*8+c_off_y;
for(x = 0; x < width_c; ++x) {
int x_in_lcu = ((x+(xpos>>1)) & ((LCU_WIDTH>>1)-1));
int qpel_x = x*8+c_off_x;
lcu->rec.u[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_u[qpel_y*(width_c*8)+qpel_x];
lcu->rec.v[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_v[qpel_y*(width_c*8)+qpel_x];
}
}
}
mv[0] >>= 2; mv[0] >>= 2;
mv[1] >>= 2; mv[1] >>= 2;
// Chroma half-pel // Chroma half-pel
// get half-pel interpolated block and push it to output // get half-pel interpolated block and push it to output
if(!fractional_mv) {
if(chroma_halfpel) { if(chroma_halfpel) {
int halfpel_y, halfpel_x; int halfpel_y, halfpel_x;
int abs_mv_x = mv[0]&1; int abs_mv_x = mv[0]&1;
@ -251,6 +369,7 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
} }
} }
} }
}
} }
/** /**

View file

@ -37,4 +37,6 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i
cu_info **b2,cu_info **a0,cu_info **a1, lcu_t *lcu); cu_info **b2,cu_info **a0,cu_info **a1, lcu_t *lcu);
void inter_get_mv_cand(const encoder_state *encoder_state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info* cur_cu, lcu_t *lcu); void inter_get_mv_cand(const encoder_state *encoder_state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info* cur_cu, lcu_t *lcu);
uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu); uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu);
void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height,
int filterSize, int width, int height, int16_t *dst);
#endif #endif

View file

@ -72,6 +72,16 @@ const vector2d small_hexbs[5] = {
{ -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 } { -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 }
}; };
/*
 * Square search pattern: the 9 offsets of a 3x3 neighbourhood around (0,0).
 * Index 4 is the centre offset {0,0}. Index layout (x grows right):
 * 6 7 8
 * 3 4 5
 * 0 1 2
 */
const vector2d square[9] = {
{ -1, 1 },
{ 0, 1 }, { 1, 1 }, { -1, 0 }, { 0, 0 }, { 1, 0 }, { -1, -1 },
{ 0, -1 }, { 1, -1 }
};
static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count) static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
{ {
@ -118,7 +128,7 @@ static uint32_t get_mvd_coding_cost(vector2d *mvd)
return bitcost; return bitcost;
} }
static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, int mv_shift,
int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3], int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
int16_t num_cand,int32_t ref_idx, uint32_t *bitcost) int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
{ {
@ -129,8 +139,8 @@ static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y
int8_t merged = 0; int8_t merged = 0;
int8_t cur_mv_cand = 0; int8_t cur_mv_cand = 0;
x <<= 2; x <<= mv_shift;
y <<= 2; y <<= mv_shift;
// Check every candidate to find a match // Check every candidate to find a match
for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) { for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
@ -205,7 +215,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost < best_cost) { if (cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -220,7 +230,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, 0, 0, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
// If the 0,0 is better, redo the hexagon around that point. // If the 0,0 is better, redo the hexagon around that point.
if (cost < best_cost) { if (cost < best_cost) {
@ -236,7 +246,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost < best_cost) { if (cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -271,7 +281,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost < best_cost) { if (cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -294,7 +304,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
(encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
(encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
block_width, block_width); block_width, block_width);
cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
if (cost > 0 && cost < best_cost) { if (cost > 0 && cost < best_cost) {
best_cost = cost; best_cost = cost;
@ -369,6 +379,139 @@ static unsigned search_mv_full(unsigned depth,
} }
#endif #endif
/**
 * \brief Refine a motion vector to quarter-pel precision around its current
 *        integer-pel position.
 *
 * Interpolates a (block_width+1)^2 area to quarter-pel resolution, then runs
 * two 3x3 searches with the square[] pattern: first the half-pel positions
 * around the integer mv, then the quarter-pel positions around the best
 * half-pel mv. Costs are half the SATD of the filtered block plus the mvd
 * bit cost.
 *
 * \param mv_in_out   in: quarter-pel mv to refine; out: refined quarter-pel mv
 * \param bitcost_out bit cost of the chosen mv
 * \return cost of the best fractional position
 */
static unsigned search_frac( const encoder_state * const encoder_state,
                             unsigned depth,
                             const picture *pic, const picture *ref,
                             const vector2d *orig, vector2d *mv_in_out,
                             int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
                             int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) {

  // Drop mv_in_out (quarter-pel units) to integer-pel precision.
  vector2d mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };

  int block_width = CU_WIDTH_FROM_DEPTH(depth);

  unsigned best_cost = UINT32_MAX;
  uint32_t best_bitcost = 0, bitcost;
  unsigned i;
  // Index into square[]; 4 is the centre offset {0,0}, so an un-improved
  // search round leaves the mv unchanged.
  unsigned best_index = 4;
  unsigned cost = 0;

  cost_16bit_nxn_func satd = get_satd_16bit_nxn_func(block_width);

  vector2d halfpel_offset;

 #define FILTER_SIZE 8
 #define HALF_FILTER (FILTER_SIZE>>1)

  // Source buffer for the block plus the extra margin needed by the 8-tap filter.
  int src_stride = block_width+FILTER_SIZE+1;
  int16_t src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
  int16_t* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];

  // Destination buffer for the quarter-pel interpolated block.
  int dst_stride = (block_width+1)*4;
  int16_t dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16];
  // dst_off points at the sample for the integer mv; fractional candidates
  // around it index forwards and backwards from here.
  int16_t* dst_off = &dst[dst_stride*4+4];

  // Fetch source pixels one integer pixel up-left of the current mv so every
  // fractional candidate around it is covered, then interpolate.
  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
                 encoder_state->tile->lcu_offset_x * LCU_WIDTH,
                 encoder_state->tile->lcu_offset_y * LCU_WIDTH,
                 ref->y_data, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);

  filter_inter_quarterpel_luma(encoder_state->encoder_control, src_off, src_stride, block_width+1,
                               block_width+1, dst, dst_stride, 1, 1);

  // Move mv to half-pel precision.
  mv.x <<= 1;
  mv.y <<= 1;

  // Search halfpel positions around best integer mv.
  for (i = 0; i < 9; ++i) {
    const vector2d *pattern = &square[i];

    pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
    pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];

    int y,x;
    for(y = 0; y < block_width; ++y) {
      int dst_y = y*4+pattern->y*2;
      for(x = 0; x < block_width; ++x) {
        int dst_x = x*4+pattern->x*2;
        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
        tmp_pic[y*block_width+x] = (uint8_t)pic->y_data[orig->x+x + (orig->y+y)*pic->width];
      }
    }

    cost = satd(tmp_pic,tmp_filtered);
    cost = cost>>1;
    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);

    if (cost < best_cost) {
      best_cost    = cost;
      best_index   = i;
      best_bitcost = bitcost;
    }
  }

  // Apply the best half-pel offset.
  mv.x += square[best_index].x;
  mv.y += square[best_index].y;
  halfpel_offset.x = square[best_index].x*2;
  halfpel_offset.y = square[best_index].y*2;

  // Move mv to quarter-pel precision.
  mv.x <<= 1;
  mv.y <<= 1;

  // Reset to the centre offset: if no quarter-pel candidate improves on the
  // half-pel best_cost, the stale half-pel index must not leak through and
  // add a bogus quarter-pel offset to the mv below.
  best_index = 4;

  // Search quarterpel points around best halfpel mv.
  for (i = 0; i < 9; ++i) {
    const vector2d *pattern = &square[i];

    pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
    pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];

    int y,x;
    for(y = 0; y < block_width; ++y) {
      int dst_y = y*4+halfpel_offset.y+pattern->y;
      for(x = 0; x < block_width; ++x) {
        int dst_x = x*4+halfpel_offset.x+pattern->x;
        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
        tmp_pic[y*block_width+x] = (uint8_t)pic->y_data[orig->x+x + (orig->y+y)*pic->width];
      }
    }

    cost = satd(tmp_pic,tmp_filtered);
    cost = cost>>1;
    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);

    if (cost < best_cost) {
      best_cost    = cost;
      best_index   = i;
      best_bitcost = bitcost;
    }
  }

  // Apply the best quarter-pel offset; mv is now the final quarter-pel mv.
  mv.x += square[best_index].x;
  mv.y += square[best_index].y;

  mv_in_out->x = mv.x;
  mv_in_out->y = mv.y;

  *bitcost_out = best_bitcost;

  return best_cost;
}
/** /**
* Update lcu to have best modes at this depth. * Update lcu to have best modes at this depth.
* \return Cost of best mode. * \return Cost of best mode.
@ -425,6 +568,8 @@ static int search_cu_inter(const encoder_state * const encoder_state, int x, int
temp_cost += hexagon_search(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost); temp_cost += hexagon_search(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
#endif #endif
temp_cost = search_frac(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
merged = 0; merged = 0;
// Check every candidate to find a match // Check every candidate to find a match
for(merge_idx = 0; merge_idx < num_cand; merge_idx++) { for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {