From efc43c8b3a7e2d6e0886e81f43aedc5a7f8b9f87 Mon Sep 17 00:00:00 2001
From: Tapio Katajisto <tapio.katajisto@gmail.com>
Date: Wed, 14 May 2014 01:42:02 +0000
Subject: [PATCH] Added fractional pixel motion estimation

Added fractional mv support for inter recon

Added 1/8-pel chroma and 1/4-pel luma interpolation
---
 src/filter.c | 408 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/filter.h |   6 +
 src/inter.c  | 357 +++++++++++++++++++++++++++++---------------
 src/inter.h  |   2 +
 src/search.c | 161 +++++++++++++++++++-
 5 files changed, 806 insertions(+), 128 deletions(-)

diff --git a/src/filter.c b/src/filter.c
index bee1629d..debba2c5 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -515,7 +515,7 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t
    * ea0,0 = (-4*B0,-1  + 36*B0,0  + 36*B0,1  - 4*B0,2)  >> shift1
    * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2
    */
-
+  int i = 0;
   int32_t x, y;
   int32_t shift1 = encoder->bitdepth-8;
   int32_t shift2 = 6;
@@ -568,4 +568,410 @@ void filter_inter_halfpel_chroma(const encoder_control * const encoder, int16_t
       }
     }
   }
+  //Clamp values to bitdepth
+  for(i = 0; i < width*height*4; ++i) {
+    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
+    if(dst[i] < 0) dst[i] = 0;
+  }
+}
+
+void filter_inter_octpel_chroma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
+{
+  // Expands src to 1/8-pel resolution: source pixel (x,y) owns the 8x8 cell of dst starting at (8x,8y); non-zero hor_flag/ver_flag enable the horizontal/vertical sub-positions.
+  int32_t x, y;
+  int32_t shift1 = encoder->bitdepth-8; // applied to every sum filtered directly from src
+  int32_t shift2 = 6; // applied to the second (vertical) pass of the 2D case
+  int32_t shift3 = 14-encoder->bitdepth; // last shift before a filtered value is stored in dst
+  int32_t offset3 = 1 << (shift3 - 1); // rounding term for >> shift3
+  int32_t offset23 = 1 << (shift2 + shift3 - 1); // rounding term for >> shift2 followed by >> shift3
+
+  // 4-tap coefficients for the 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions (rows 1..7 of g_chroma_filter)
+  int16_t c1[4], c2[4], c3[4], c4[4], c5[4], c6[4], c7[4];
+
+  int i;
+  for(i = 0; i < 4; ++i ) {
+    c1[i] = g_chroma_filter[1][i];
+    c2[i] = g_chroma_filter[2][i];
+    c3[i] = g_chroma_filter[3][i];
+    c4[i] = g_chroma_filter[4][i];
+    c5[i] = g_chroma_filter[5][i];
+    c6[i] = g_chroma_filter[6][i];
+    c7[i] = g_chroma_filter[7][i];
+  }
+
+  // Visit every source pixel; with both flags set all 64 samples of its 8x8 dst cell are written
+  for (y = 0; y < height; y++) {
+    int dst_pos_y = (y<<3)*dst_stride;
+    int src_pos_y = y*src_stride;
+    for (x = 0; x < width; x++) {
+      // Calculate current dst and src pixel positions
+      int dst_pos = dst_pos_y+(x<<3);
+      int src_pos = src_pos_y+x;
+
+      // Horizontally filtered values of the current row, before the final shift (index k holds the (k+1)/8 position)
+      int32_t h_temp[7] = {0,0,0,0,0,0,0};
+
+      // Cell origin: the integer pixel is copied unfiltered
+      dst[dst_pos] = src[src_pos];
+
+      // Horizontal fractional values, taps at src_pos-1 .. src_pos+2
+      if (hor_flag) {
+
+        h_temp[0] = ((c1[0]*src[src_pos - 1]
+                    + c1[1]*src[src_pos]
+                    + c1[2]*src[src_pos + 1]
+                    + c1[3]*src[src_pos + 2]) >> shift1); // h0: 1/8 position
+
+        h_temp[1] = ((c2[0]*src[src_pos - 1]
+                    + c2[1]*src[src_pos]
+                    + c2[2]*src[src_pos + 1]
+                    + c2[3]*src[src_pos + 2]) >> shift1); // h1: 2/8 position
+
+        h_temp[2] = ((c3[0]*src[src_pos - 1]
+                    + c3[1]*src[src_pos]
+                    + c3[2]*src[src_pos + 1]
+                    + c3[3]*src[src_pos + 2]) >> shift1); // h2: 3/8 position
+
+        h_temp[3] = ((c4[0]*src[src_pos - 1]
+                    + c4[1]*src[src_pos]
+                    + c4[2]*src[src_pos + 1]
+                    + c4[3]*src[src_pos + 2]) >> shift1); // h3: 4/8 position
+
+        h_temp[4] = ((c5[0]*src[src_pos - 1]
+                    + c5[1]*src[src_pos]
+                    + c5[2]*src[src_pos + 1]
+                    + c5[3]*src[src_pos + 2]) >> shift1); // h4: 5/8 position
+
+        h_temp[5] = ((c6[0]*src[src_pos - 1]
+                    + c6[1]*src[src_pos]
+                    + c6[2]*src[src_pos + 1]
+                    + c6[3]*src[src_pos + 2]) >> shift1); // h5: 6/8 position
+
+        h_temp[6] = ((c7[0]*src[src_pos - 1]
+                    + c7[1]*src[src_pos]
+                    + c7[2]*src[src_pos + 1]
+                    + c7[3]*src[src_pos + 2]) >> shift1); // h6: 7/8 position
+      }
+
+      // Vertical fractional values (first column of the cell), taps at rows -1 .. +2, rounded and shifted in place
+      if(ver_flag) {
+        dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 1*src_stride]
+                                       + c1[1]*src[src_pos]
+                                       + c1[2]*src[src_pos + 1*src_stride]
+                                       + c1[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 1 (1/8)
+
+        dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 1*src_stride]
+                                       + c2[1]*src[src_pos]
+                                       + c2[2]*src[src_pos + 1*src_stride]
+                                       + c2[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 2 (2/8)
+
+        dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 1*src_stride]
+                                       + c3[1]*src[src_pos]
+                                       + c3[2]*src[src_pos + 1*src_stride]
+                                       + c3[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 3 (3/8)
+
+        dst[dst_pos + 4*dst_stride] = (((c4[0]*src[src_pos - 1*src_stride]
+                                       + c4[1]*src[src_pos]
+                                       + c4[2]*src[src_pos + 1*src_stride]
+                                       + c4[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 4 (4/8)
+
+        dst[dst_pos + 5*dst_stride] = (((c5[0]*src[src_pos - 1*src_stride]
+                                       + c5[1]*src[src_pos]
+                                       + c5[2]*src[src_pos + 1*src_stride]
+                                       + c5[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 5 (5/8)
+
+        dst[dst_pos + 6*dst_stride] = (((c6[0]*src[src_pos - 1*src_stride]
+                                       + c6[1]*src[src_pos]
+                                       + c6[2]*src[src_pos + 1*src_stride]
+                                       + c6[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 6 (6/8)
+
+        dst[dst_pos + 7*dst_stride] = (((c7[0]*src[src_pos - 1*src_stride]
+                                       + c7[1]*src[src_pos]
+                                       + c7[2]*src[src_pos + 1*src_stride]
+                                       + c7[3]*src[src_pos + 2*src_stride]) >> shift1)
+                                       + (1<<(shift3-1))) >> shift3; // cell row 7 (7/8)
+      }
+
+      // With both flags, fill the remaining 7x7 samples by filtering vertically over horizontally filtered rows
+      if (hor_flag && ver_flag) {
+
+        int32_t temp[3][7]; // Horizontally filtered rows -1, +1 and +2 (row 0 is already in h_temp)
+
+        // Calculate temporary values
+        src_pos -= 1*src_stride;  // start at row -1
+        for(i = 0; i < 3; ++i) {
+
+          temp[i][0] = ((c1[0]*src[src_pos - 1] + c1[1]*src[src_pos]
+                       + c1[2]*src[src_pos + 1] + c1[3]*src[src_pos + 2])
+                      >> shift1); // 1/8 position of this row
+
+          temp[i][1] = ((c2[0]*src[src_pos - 1] + c2[1]*src[src_pos]
+                       + c2[2]*src[src_pos + 1] + c2[3]*src[src_pos + 2])
+                      >> shift1); // 2/8 position of this row
+
+          temp[i][2] = ((c3[0]*src[src_pos - 1] + c3[1]*src[src_pos]
+                       + c3[2]*src[src_pos + 1] + c3[3]*src[src_pos + 2])
+                      >> shift1); // 3/8 position of this row
+
+          temp[i][3] = ((c4[0]*src[src_pos - 1] + c4[1]*src[src_pos]
+                       + c4[2]*src[src_pos + 1] + c4[3]*src[src_pos + 2])
+                       >> shift1); // 4/8 position of this row
+
+          temp[i][4] = ((c5[0]*src[src_pos - 1] + c5[1]*src[src_pos]
+                       + c5[2]*src[src_pos + 1] + c5[3]*src[src_pos + 2])
+                       >> shift1); // 5/8 position of this row
+
+          temp[i][5] = ((c6[0]*src[src_pos - 1] + c6[1]*src[src_pos]
+                       + c6[2]*src[src_pos + 1] + c6[3]*src[src_pos + 2])
+                       >> shift1); // 6/8 position of this row
+
+          temp[i][6] = ((c7[0]*src[src_pos - 1] + c7[1]*src[src_pos]
+                       + c7[2]*src[src_pos + 1] + c7[3]*src[src_pos + 2])
+                       >> shift1); // 7/8 position of this row
+
+          if(i == 0) {
+            // Row 0 already exists as h_temp: jump from row -1 straight to row +1
+            src_pos += 2*src_stride;
+          } else {
+            src_pos += src_stride;
+          }
+        }
+
+
+        // Vertical pass over the horizontal values: taps are temp[0] (row -1), h_temp (row 0), temp[1] (row +1), temp[2] (row +2)
+        for(i=0;i<7;++i){
+          dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*h_temp[i]
+                                               + c1[2]*temp[1][i] + c1[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 1
+
+          dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*h_temp[i]
+                                               + c2[2]*temp[1][i] + c2[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 2
+
+          dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*h_temp[i]
+                                               + c3[2]*temp[1][i] + c3[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 3
+
+          dst[dst_pos + 4*dst_stride + i+1] = (((c4[0]*temp[0][i] + c4[1]*h_temp[i]
+                                               + c4[2]*temp[1][i] + c4[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 4
+
+          dst[dst_pos + 5*dst_stride + i+1] = (((c5[0]*temp[0][i] + c5[1]*h_temp[i]
+                                               + c5[2]*temp[1][i] + c5[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 5
+
+          dst[dst_pos + 6*dst_stride + i+1] = (((c6[0]*temp[0][i] + c6[1]*h_temp[i]
+                                               + c6[2]*temp[1][i] + c6[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 6
+
+          dst[dst_pos + 7*dst_stride + i+1] = (((c7[0]*temp[0][i] + c7[1]*h_temp[i]
+                                               + c7[2]*temp[1][i] + c7[3]*temp[2][i])
+                                               + offset23) >> shift2) >> shift3; // cell row 7
+
+        }
+
+      }
+      // Stored last on purpose: h_temp had to stay unrounded while the 2D case above was using it
+      if(hor_flag) {
+        dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3;
+        dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3;
+        dst[dst_pos + 3] = (h_temp[2] + offset3) >> shift3;
+        dst[dst_pos + 4] = (h_temp[3] + offset3) >> shift3;
+        dst[dst_pos + 5] = (h_temp[4] + offset3) >> shift3;
+        dst[dst_pos + 6] = (h_temp[5] + offset3) >> shift3;
+        dst[dst_pos + 7] = (h_temp[6] + offset3) >> shift3;
+      }
+
+
+    }
+  }
+
+  // Clamp to [0, 2^bitdepth - 1]; dst is indexed linearly here, so this matches the written area only when dst_stride == width*8
+  for(i = 0; i < width*height*64; ++i) {
+    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
+    if(dst[i] < 0) dst[i] = 0;
+  }
+}
+
+void filter_inter_quarterpel_luma(const encoder_control * const encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag)
+{
+  // Expands src to 1/4-pel resolution: source pixel (x,y) owns the 4x4 cell of dst starting at (4x,4y); non-zero hor_flag/ver_flag enable the horizontal/vertical sub-positions.
+  int32_t x, y;
+  int32_t shift1 = encoder->bitdepth-8; // applied to every sum filtered directly from src
+  int32_t shift2 = 6; // applied to the second (vertical) pass of the 2D case
+  int32_t shift3 = 14-encoder->bitdepth; // last shift before a filtered value is stored in dst
+  int32_t offset3 = 1 << (shift3 - 1); // rounding term for >> shift3
+  int32_t offset23 = 1 << (shift2 + shift3 - 1); // rounding term for >> shift2 followed by >> shift3
+
+  // 8-tap coefficients for the 1/4, 2/4 and 3/4 positions (rows 1..3 of g_luma_filter)
+  int16_t c1[8], c2[8], c3[8];
+
+  int i;
+  for(i = 0; i < 8; ++i ) {
+    c1[i] = g_luma_filter[1][i];
+    c2[i] = g_luma_filter[2][i];
+    c3[i] = g_luma_filter[3][i];
+  }
+
+  // Visit every source pixel; with both flags set all sixteen samples of its 4x4 dst cell are written
+  for (y = 0; y < height; y++) {
+    int dst_pos_y = (y<<2)*dst_stride;
+    int src_pos_y = y*src_stride;
+    for (x = 0; x < width; x++) {
+      // Calculate current dst and src pixel positions
+      int dst_pos = dst_pos_y+(x<<2);
+      int src_pos = src_pos_y+x;
+
+      // Horizontally filtered values of the current row, before the final shift (index k holds the (k+1)/4 position)
+      int32_t h_temp[3] = {0,0,0};
+
+      // Cell origin: the integer pixel is copied unfiltered
+      dst[dst_pos] = src[src_pos];
+
+      // Horizontal fractional values, taps at src_pos-3 .. src_pos+4
+      if (hor_flag) {
+
+        h_temp[0] = ((c1[0]*src[src_pos - 3]
+                    + c1[1]*src[src_pos - 2]
+                    + c1[2]*src[src_pos - 1]
+                    + c1[3]*src[src_pos]
+                    + c1[4]*src[src_pos + 1]
+                    + c1[5]*src[src_pos + 2]
+                    + c1[6]*src[src_pos + 3]
+                    + c1[7]*src[src_pos + 4]) >> shift1); // 1/4 position
+
+
+
+        h_temp[1] = ((c2[0]*src[src_pos - 3]
+                    + c2[1]*src[src_pos - 2]
+                    + c2[2]*src[src_pos - 1]
+                    + c2[3]*src[src_pos]
+                    + c2[4]*src[src_pos + 1]
+                    + c2[5]*src[src_pos + 2]
+                    + c2[6]*src[src_pos + 3]
+                    + c2[7]*src[src_pos + 4]) >> shift1); // 2/4 position
+
+        h_temp[2] = ((c3[0]*src[src_pos - 3]
+                    + c3[1]*src[src_pos - 2]
+                    + c3[2]*src[src_pos - 1]
+                    + c3[3]*src[src_pos]
+                    + c3[4]*src[src_pos + 1]
+                    + c3[5]*src[src_pos + 2]
+                    + c3[6]*src[src_pos + 3]
+                    + c3[7]*src[src_pos + 4]) >> shift1); // 3/4 position
+      }
+      // Vertical fractional values (first column of the cell), taps at rows -3 .. +4, rounded and shifted in place
+      if(ver_flag) {
+        dst[dst_pos + 1*dst_stride] = (((c1[0]*src[src_pos - 3*src_stride]
+                                       + c1[1]*src[src_pos - 2*src_stride]
+                                       + c1[2]*src[src_pos - 1*src_stride]
+                                       + c1[3]*src[src_pos]
+                                       + c1[4]*src[src_pos + 1*src_stride]
+                                       + c1[5]*src[src_pos + 2*src_stride]
+                                       + c1[6]*src[src_pos + 3*src_stride]
+                                       + c1[7]*src[src_pos + 4*src_stride]) >> shift1)
+                                        + (1<<(shift3-1))) >> shift3; // cell row 1 (1/4)
+
+        dst[dst_pos + 2*dst_stride] = (((c2[0]*src[src_pos - 3*src_stride]
+                                       + c2[1]*src[src_pos - 2*src_stride]
+                                       + c2[2]*src[src_pos - 1*src_stride]
+                                       + c2[3]*src[src_pos]
+                                       + c2[4]*src[src_pos + 1*src_stride]
+                                       + c2[5]*src[src_pos + 2*src_stride]
+                                       + c2[6]*src[src_pos + 3*src_stride]
+                                       + c2[7]*src[src_pos + 4*src_stride]) >> shift1)
+                                        + (1<<(shift3-1))) >> shift3; // cell row 2 (2/4)
+
+        dst[dst_pos + 3*dst_stride] = (((c3[0]*src[src_pos - 3*src_stride]
+                                       + c3[1]*src[src_pos - 2*src_stride]
+                                       + c3[2]*src[src_pos - 1*src_stride]
+                                       + c3[3]*src[src_pos]
+                                       + c3[4]*src[src_pos + 1*src_stride]
+                                       + c3[5]*src[src_pos + 2*src_stride]
+                                       + c3[6]*src[src_pos + 3*src_stride]
+                                       + c3[7]*src[src_pos + 4*src_stride]) >> shift1)
+                                        + (1<<(shift3-1))) >> shift3; // cell row 3 (3/4)
+      }
+
+      // With both flags, fill the remaining 3x3 samples by filtering vertically over horizontally filtered rows
+      if (hor_flag && ver_flag) {
+
+        int32_t temp[7][3]; // Horizontally filtered rows -3..-1 and +1..+4 (row 0 is already in h_temp)
+
+        // Calculate temporary values..
+        src_pos -= 3*src_stride;  // start at row -3
+        for(i = 0; i < 7; ++i) {
+
+          temp[i][0] = ((c1[0]*src[src_pos - 3] + c1[1]*src[src_pos - 2]
+                       + c1[2]*src[src_pos - 1] + c1[3]*src[src_pos]
+                       + c1[4]*src[src_pos + 1] + c1[5]*src[src_pos + 2]
+                       + c1[6]*src[src_pos + 3] + c1[7]*src[src_pos + 4])
+              >> shift1); // 1/4 position of this row
+
+          temp[i][1] = ((c2[0]*src[src_pos - 3] + c2[1]*src[src_pos - 2]
+                       + c2[2]*src[src_pos - 1] + c2[3]*src[src_pos]
+                       + c2[4]*src[src_pos + 1] + c2[5]*src[src_pos + 2]
+                       + c2[6]*src[src_pos + 3] + c2[7]*src[src_pos + 4])
+                        >> shift1); // 2/4 position of this row
+
+          temp[i][2] = ((c3[0]*src[src_pos - 3] + c3[1]*src[src_pos - 2]
+                       + c3[2]*src[src_pos - 1] + c3[3]*src[src_pos]
+                       + c3[4]*src[src_pos + 1] + c3[5]*src[src_pos + 2]
+                       + c3[6]*src[src_pos + 3] + c3[7]*src[src_pos + 4])
+                        >> shift1); // 3/4 position of this row
+
+          if(i == 2) {
+            // Row 0 already exists as h_temp: jump from row -1 straight to row +1
+            src_pos += 2*src_stride;
+          } else {
+            src_pos += src_stride;
+          }
+        }
+
+
+        // Vertical pass over the horizontal values: temp[0..2] are rows -3..-1, h_temp is row 0, temp[3..6] are rows +1..+4
+        for(i=0;i<3;++i){
+          dst[dst_pos + 1*dst_stride + i+1] = (((c1[0]*temp[0][i] + c1[1]*temp[1][i]
+                                               + c1[2]*temp[2][i] + c1[3]*h_temp[i]
+                                               + c1[4]*temp[3][i] + c1[5]*temp[4][i]
+                                               + c1[6]*temp[5][i] + c1[7]*temp[6][i])
+                                                + offset23) >> shift2) >> shift3; // cell row 1
+
+          dst[dst_pos + 2*dst_stride + i+1] = (((c2[0]*temp[0][i] + c2[1]*temp[1][i]
+                                               + c2[2]*temp[2][i] + c2[3]*h_temp[i]
+                                               + c2[4]*temp[3][i] + c2[5]*temp[4][i]
+                                               + c2[6]*temp[5][i] + c2[7]*temp[6][i])
+                                                + offset23) >> shift2) >> shift3; // cell row 2
+
+          dst[dst_pos + 3*dst_stride + i+1] = (((c3[0]*temp[0][i] + c3[1]*temp[1][i]
+                                               + c3[2]*temp[2][i] + c3[3]*h_temp[i]
+                                               + c3[4]*temp[3][i] + c3[5]*temp[4][i]
+                                               + c3[6]*temp[5][i] + c3[7]*temp[6][i])
+                                                + offset23) >> shift2) >> shift3; // cell row 3
+
+        }
+
+      }
+      // Stored last on purpose: h_temp had to stay unrounded while the 2D case above was using it
+      if(hor_flag) {
+        dst[dst_pos + 1] = (h_temp[0] + offset3) >> shift3;
+        dst[dst_pos + 2] = (h_temp[1] + offset3) >> shift3;
+        dst[dst_pos + 3] = (h_temp[2] + offset3) >> shift3;
+      }
+
+
+    }
+  }
+
+  // Clamp to [0, 2^bitdepth - 1]; dst is indexed linearly here, so this matches the written area only when dst_stride == width*4
+  for(i = 0; i < width*height*16; ++i) {
+    if(dst[i] > ((1 << encoder->bitdepth)-1)) dst[i] = (int16_t)((1 << encoder->bitdepth)-1);
+    if(dst[i] < 0) dst[i] = 0;
+  }
 }
diff --git a/src/filter.h b/src/filter.h
index b7a51fa5..9abb3032 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -54,6 +54,12 @@ void filter_inter_halfpel_chroma(const encoder_control * encoder,
                                  int16_t *src, int16_t src_stride, int width, int height,
                                  int16_t *dst, int16_t dst_stride,  int8_t hor_flag, int8_t ver_flag);
 
+void filter_inter_octpel_chroma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst,
+                                int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
+// octpel_chroma writes an 8x8 dst cell per source pixel, quarterpel_luma a 4x4 cell; non-zero hor_flag/ver_flag enable the horizontal/vertical sub-positions
+void filter_inter_quarterpel_luma(const encoder_control * encoder, int16_t *src, int16_t src_stride, int width, int height, int16_t *dst,
+                                  int16_t dst_stride, int8_t hor_flag, int8_t ver_flag);
+
 // SAO
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/inter.c b/src/inter.c
index ef19219c..9a4f0b6e 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -64,6 +64,45 @@ void inter_set_block(picture* pic, uint32_t x_cu, uint32_t y_cu, uint8_t depth,
   }
 }
 
+void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height,
+    int filterSize, int width, int height, int16_t *dst) {
+  // Copies a window of ref into dst, extended by filterSize>>1 samples on every side of the width x height block; coordinates outside ref are clamped to its nearest edge sample.
+  int16_t mv[2] = {mv_x, mv_y}; // note: the int arguments are narrowed to int16_t here
+  int halfFilterSize = filterSize>>1;
+
+  int dst_y; int y; int dst_x; int x; int coord_x; int coord_y; int ref_width_c;
+  int8_t overflow_neg_y_temp,overflow_pos_y_temp,overflow_neg_x_temp,overflow_pos_x_temp;
+
+  ref_width_c = ref_width;
+  // ref_width_c is a plain alias of ref_width; it is used below both as the row stride of ref and as its valid width
+
+  for (dst_y = 0, y = ypos - halfFilterSize; y < ((ypos + height)) + halfFilterSize; dst_y++, y++) {
+
+    // Row in the reference: block row plus the caller-supplied offset and motion vector
+    coord_y = y + off_y + mv[1];
+
+    // On y-overflow clamp coord_y to the first/last row, then turn it into a row start index
+    overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
+    overflow_pos_y_temp = (coord_y >= ref_height) ? 1 : 0;
+    if (overflow_neg_y_temp)      coord_y = 0;
+    else if (overflow_pos_y_temp) coord_y = (ref_height) - 1;
+    coord_y *= ref_width_c;
+
+    for (dst_x = 0, x = (xpos) - halfFilterSize; x < ((xpos + width)) + halfFilterSize; dst_x++, x++) {
+      coord_x = x + off_x + mv[0];
+
+      // On x-overflow clamp coord_x to the first/last column
+      overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
+      overflow_pos_x_temp = (coord_x >= ref_width_c) ? 1 : 0;
+      if (overflow_neg_x_temp)      coord_x = 0;
+      else if (overflow_pos_x_temp) coord_x = ref_width_c - 1;
+
+      // Store source block data (with extended borders); dst rows are width+filterSize samples apart
+      dst[dst_y*(width+filterSize) + dst_x] = ref[coord_y + coord_x];
+    }
+  }
+}
+
 /**
  * \brief Reconstruct inter block
  * \param ref picture to copy the data from
@@ -100,153 +139,233 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
   int16_t halfpel_u[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (u)
   int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v)
 
-  // TODO: Fractional pixel support
+  // Luma quarter-pel
+    int8_t fractional_mv = (mv[0]&1) || (mv[1]&1) || (mv[0]&2) || (mv[1]&2); // 2 lowest bits of mv set -> mv is fractional
+
+    if(fractional_mv) {
+      int y_off_x = (mv[0]&3);
+      int y_off_y = (mv[1]&3);
+
+      int c_off_x = (mv[0]&7);
+      int c_off_y = (mv[1]&7);
+
+      int y,x;
+
+      #define FILTER_SIZE_Y 8
+      #define FILTER_SIZE_C 4
+
+      //vector2d orig = {xpos, ypos};
+      //vector2d orig_c = {xpos>>1, ypos>>1};
+
+      // Fractional luma 1/4-pel
+      int16_t qpel_src_y[(LCU_WIDTH+FILTER_SIZE_Y) * (LCU_WIDTH+FILTER_SIZE_Y)];
+      int16_t* qpel_src_off_y = &qpel_src_y[(width+FILTER_SIZE_Y)*(FILTER_SIZE_Y>>1)+(FILTER_SIZE_Y>>1)];
+      int16_t qpel_dst_y[LCU_WIDTH*LCU_WIDTH*16];
+
+      // Fractional chroma 1/8-pel
+      int width_c = width>>1;
+      int16_t octpel_src_u[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)];
+      int16_t* octpel_src_off_u = &octpel_src_u[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)];
+      int16_t octpel_dst_u[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64];
+
+      int16_t octpel_src_v[((LCU_WIDTH>>1)+FILTER_SIZE_C) * ((LCU_WIDTH>>1)+FILTER_SIZE_C)];
+      int16_t* octpel_src_off_v = &octpel_src_v[(width_c+FILTER_SIZE_C)*(FILTER_SIZE_C>>1)+(FILTER_SIZE_C>>1)];
+      int16_t octpel_dst_v[(LCU_WIDTH>>1)*(LCU_WIDTH>>1)*64];
+
+      // Fractional luma
+      extend_borders(xpos, ypos, mv[0]>>2, mv[1]>>2, encoder_state->tile->lcu_offset_x * LCU_WIDTH, encoder_state->tile->lcu_offset_y * LCU_WIDTH,
+          ref->y_recdata, ref->width, ref->height, FILTER_SIZE_Y, width, width, qpel_src_y);
+
+      filter_inter_quarterpel_luma(encoder_state->encoder_control, qpel_src_off_y, width+FILTER_SIZE_Y, width,
+                                   width, qpel_dst_y, width*4, y_off_x, y_off_y);
+
+      //Fractional chroma U
+      extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1),
+          ref->u_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_u);
+
+      filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_u, width_c+FILTER_SIZE_C, width_c,
+                                 width_c, octpel_dst_u, width_c*8, c_off_x, c_off_y);
+
+      //Fractional chroma V
+      extend_borders(xpos>>1, ypos>>1, (mv[0]>>2)>>1, (mv[1]>>2)>>1, encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1), encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1),
+          ref->v_recdata, ref->width>>1, ref->height>>1, FILTER_SIZE_C, width_c, width_c, octpel_src_v);
+
+      filter_inter_octpel_chroma(encoder_state->encoder_control, octpel_src_off_v, width_c+FILTER_SIZE_C, width_c,
+                   width_c, octpel_dst_v, width_c*8, c_off_x, c_off_y);
+
+      //Luma
+      for(y = 0; y < width; ++y) {
+        int y_in_lcu = ((y+ypos) & ((LCU_WIDTH)-1));
+        int qpel_y = y*4+y_off_y;
+        for(x = 0; x < width; ++x) {
+          int x_in_lcu = ((x+xpos) & ((LCU_WIDTH)-1));
+          int qpel_x = x*4+y_off_x;
+          //printf("x: %d, y: %d\n", off_x, off_y);
+          lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)qpel_dst_y[qpel_y*(width*4)+qpel_x];
+          //printf("i: %d", qpel_y*(width*4)+qpel_x);
+        }
+      }
+      //Chroma
+      for(y = 0; y < width_c; ++y) {
+        int y_in_lcu = ((y+(ypos>>1)) & ((LCU_WIDTH>>1)-1));
+        int qpel_y = y*8+c_off_y;
+        for(x = 0; x < width_c; ++x) {
+          int x_in_lcu = ((x+(xpos>>1)) & ((LCU_WIDTH>>1)-1));
+          int qpel_x = x*8+c_off_x;
+          lcu->rec.u[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_u[qpel_y*(width_c*8)+qpel_x];
+          lcu->rec.v[y_in_lcu * dst_width_c + x_in_lcu] = (uint8_t)octpel_dst_v[qpel_y*(width_c*8)+qpel_x];
+        }
+      }
+    }
+
   mv[0] >>= 2;
   mv[1] >>= 2;
 
   // Chroma half-pel
   // get half-pel interpolated block and push it to output
-  if(chroma_halfpel) {
-    int halfpel_y, halfpel_x;
-    int abs_mv_x = mv[0]&1;
-    int abs_mv_y = mv[1]&1;
-    int8_t overflow_neg_y_temp,overflow_pos_y_temp,overflow_neg_x_temp,overflow_pos_x_temp;
-    // Fill source blocks with data from reference, -4...width+4
-    for (halfpel_y = 0, y = (ypos>>1) - 4; y < ((ypos + width)>>1) + 4; halfpel_y++, y++) {
-      // calculate y-pixel offset
-      coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1);
-
-      // On y-overflow set coord_y accordingly
-      overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
-      overflow_pos_y_temp = (coord_y >= ref->height>>1) ? 1 : 0;
-      if (overflow_neg_y_temp)      coord_y = 0;
-      else if (overflow_pos_y_temp) coord_y = (ref->height>>1) - 1;
-      coord_y *= ref_width_c;
-
-      for (halfpel_x = 0, x = (xpos>>1) - 4; x < ((xpos + width)>>1) + 4; halfpel_x++, x++) {
-        coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1);
-
-        // On x-overflow set coord_x accordingly
-        overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
-        overflow_pos_x_temp = (coord_x >= ref_width_c) ? 1 : 0;
-        if (overflow_neg_x_temp)      coord_x = 0;
-        else if (overflow_pos_x_temp) coord_x = ref_width_c - 1;
-
-        // Store source block data (with extended borders)
-        halfpel_src_u[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->u_recdata[coord_y + coord_x];
-        halfpel_src_v[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->v_recdata[coord_y + coord_x];
-      }
-    }
-
-    // Filter the block to half-pel resolution
-    filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_u, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_u, LCU_WIDTH, abs_mv_x, abs_mv_y);
-    filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_v, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_v, LCU_WIDTH, abs_mv_x, abs_mv_y);
-
-    // Assign filtered pixels to output, take every second half-pel sample with offset of abs_mv_y/x
-    for (halfpel_y = abs_mv_y, y = ypos>>1; y < (ypos + width)>>1; halfpel_y += 2, y++) {
-      for (halfpel_x = abs_mv_x, x = xpos>>1; x < (xpos + width)>>1; halfpel_x += 2, x++) {
-        int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
-        int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
-        lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_u[halfpel_y*LCU_WIDTH + halfpel_x];
-        lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_v[halfpel_y*LCU_WIDTH + halfpel_x];
-      }
-    }
-  }
-
-  // With overflow present, more checking
-  if (overflow_neg_x || overflow_neg_y || overflow_pos_x || overflow_pos_y) {
-    // Copy Luma with boundary checking
-    for (y = ypos; y < ypos + width; y++) {
-      for (x = xpos; x < xpos + width; x++) {
-        int x_in_lcu = (x & ((LCU_WIDTH)-1));
-        int y_in_lcu = (y & ((LCU_WIDTH)-1));
-
-        coord_x = (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0];
-        coord_y = (y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1];
-        overflow_neg_x = (coord_x < 0)?1:0;
-        overflow_neg_y = (coord_y < 0)?1:0;
-
-        overflow_pos_x = (coord_x >= ref->width )?1:0;
-        overflow_pos_y = (coord_y >= ref->height)?1:0;
-
-        // On x-overflow set coord_x accordingly
-        if (overflow_neg_x) {
-          coord_x = 0;
-        } else if (overflow_pos_x) {
-          coord_x = ref->width - 1;
-        }
+  if(!fractional_mv) {
+    if(chroma_halfpel) {
+      int halfpel_y, halfpel_x;
+      int abs_mv_x = mv[0]&1;
+      int abs_mv_y = mv[1]&1;
+      int8_t overflow_neg_y_temp,overflow_pos_y_temp,overflow_neg_x_temp,overflow_pos_x_temp;
+      // Fill source blocks with data from reference, -4...width+4
+      for (halfpel_y = 0, y = (ypos>>1) - 4; y < ((ypos + width)>>1) + 4; halfpel_y++, y++) {
+        // calculate y-pixel offset
+        coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1);
 
         // On y-overflow set coord_y accordingly
-        if (overflow_neg_y) {
-          coord_y = 0;
-        } else if (overflow_pos_y) {
-          coord_y = ref->height - 1;
-        }
+        overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
+        overflow_pos_y_temp = (coord_y >= ref->height>>1) ? 1 : 0;
+        if (overflow_neg_y_temp)      coord_y = 0;
+        else if (overflow_pos_y_temp) coord_y = (ref->height>>1) - 1;
+        coord_y *= ref_width_c;
 
-        // set destination to (corrected) pixel value from the reference
-        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y*ref->width + coord_x];
+        for (halfpel_x = 0, x = (xpos>>1) - 4; x < ((xpos + width)>>1) + 4; halfpel_x++, x++) {
+          coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1);
+
+          // On x-overflow set coord_x accordingly
+          overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
+          overflow_pos_x_temp = (coord_x >= ref_width_c) ? 1 : 0;
+          if (overflow_neg_x_temp)      coord_x = 0;
+          else if (overflow_pos_x_temp) coord_x = ref_width_c - 1;
+
+          // Store source block data (with extended borders)
+          halfpel_src_u[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->u_recdata[coord_y + coord_x];
+          halfpel_src_v[halfpel_y*HALFPEL_CHROMA_WIDTH + halfpel_x] = ref->v_recdata[coord_y + coord_x];
+        }
+      }
+
+      // Filter the block to half-pel resolution
+      filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_u, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_u, LCU_WIDTH, abs_mv_x, abs_mv_y);
+      filter_inter_halfpel_chroma(encoder_state->encoder_control, halfpel_src_off_v, HALFPEL_CHROMA_WIDTH, width>>1, width>>1, halfpel_v, LCU_WIDTH, abs_mv_x, abs_mv_y);
+
+      // Assign filtered pixels to output, take every second half-pel sample with offset of abs_mv_y/x
+      for (halfpel_y = abs_mv_y, y = ypos>>1; y < (ypos + width)>>1; halfpel_y += 2, y++) {
+        for (halfpel_x = abs_mv_x, x = xpos>>1; x < (xpos + width)>>1; halfpel_x += 2, x++) {
+          int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
+          int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
+          lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_u[halfpel_y*LCU_WIDTH + halfpel_x];
+          lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = (uint8_t)halfpel_v[halfpel_y*LCU_WIDTH + halfpel_x];
+        }
       }
     }
 
-    if(!chroma_halfpel) {
-      // Copy Chroma with boundary checking
-      // TODO: chroma fractional pixel interpolation
-      for (y = ypos>>1; y < (ypos + width)>>1; y++) {
-        for (x = xpos>>1; x < (xpos + width)>>1; x++) {
-          int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
-          int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
-
-          coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1);
-          coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1);
+    // With overflow present, more checking
+    if (overflow_neg_x || overflow_neg_y || overflow_pos_x || overflow_pos_y) {
+      // Copy Luma with boundary checking
+      for (y = ypos; y < ypos + width; y++) {
+        for (x = xpos; x < xpos + width; x++) {
+          int x_in_lcu = (x & ((LCU_WIDTH)-1));
+          int y_in_lcu = (y & ((LCU_WIDTH)-1));
 
+          coord_x = (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0];
+          coord_y = (y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1];
           overflow_neg_x = (coord_x < 0)?1:0;
-          overflow_neg_y = (y + (mv[1]>>1) < 0)?1:0;
+          overflow_neg_y = (coord_y < 0)?1:0;
 
-          overflow_pos_x = (coord_x >= ref->width>>1 )?1:0;
-          overflow_pos_y = (coord_y >= ref->height>>1)?1:0;
+          overflow_pos_x = (coord_x >= ref->width )?1:0;
+          overflow_pos_y = (coord_y >= ref->height)?1:0;
 
           // On x-overflow set coord_x accordingly
-          if(overflow_neg_x) {
+          if (overflow_neg_x) {
             coord_x = 0;
-          } else if(overflow_pos_x) {
-            coord_x = (ref->width>>1) - 1;
+          } else if (overflow_pos_x) {
+            coord_x = ref->width - 1;
           }
 
           // On y-overflow set coord_y accordingly
-          if(overflow_neg_y) {
+          if (overflow_neg_y) {
             coord_y = 0;
-          } else if(overflow_pos_y) {
-            coord_y = (ref->height>>1) - 1;
+          } else if (overflow_pos_y) {
+            coord_y = ref->height - 1;
           }
 
-          // set destinations to (corrected) pixel value from the reference
-          lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y * ref_width_c + coord_x];
-          lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y * ref_width_c + coord_x];
+          // set destination to (corrected) pixel value from the reference
+          lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y*ref->width + coord_x];
         }
       }
-    }
-  } else { //If no overflow, we can copy without checking boundaries
-    // Copy Luma
-    for (y = ypos; y < ypos + width; y++) {
-      int y_in_lcu = (y & ((LCU_WIDTH)-1));
-      coord_y = ((y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate
-      for (x = xpos; x < xpos + width; x++) {
-        int x_in_lcu = (x & ((LCU_WIDTH)-1));
 
-        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]];
+      if(!chroma_halfpel) {
+        // Copy Chroma with boundary checking
+        // TODO: chroma fractional pixel interpolation
+        for (y = ypos>>1; y < (ypos + width)>>1; y++) {
+          for (x = xpos>>1; x < (xpos + width)>>1; x++) {
+            int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
+            int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
+
+            coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1);
+            coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1);
+
+            overflow_neg_x = (coord_x < 0)?1:0;
+            overflow_neg_y = (y + (mv[1]>>1) < 0)?1:0;
+
+            overflow_pos_x = (coord_x >= ref->width>>1 )?1:0;
+            overflow_pos_y = (coord_y >= ref->height>>1)?1:0;
+
+            // On x-overflow set coord_x accordingly
+            if(overflow_neg_x) {
+              coord_x = 0;
+            } else if(overflow_pos_x) {
+              coord_x = (ref->width>>1) - 1;
+            }
+
+            // On y-overflow set coord_y accordingly
+            if(overflow_neg_y) {
+              coord_y = 0;
+            } else if(overflow_pos_y) {
+              coord_y = (ref->height>>1) - 1;
+            }
+
+            // set destinations to (corrected) pixel value from the reference
+            lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y * ref_width_c + coord_x];
+            lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y * ref_width_c + coord_x];
+          }
+        }
       }
-    }
+    } else { //If no overflow, we can copy without checking boundaries
+      // Copy Luma
+      for (y = ypos; y < ypos + width; y++) {
+        int y_in_lcu = (y & ((LCU_WIDTH)-1));
+        coord_y = ((y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate
+        for (x = xpos; x < xpos + width; x++) {
+          int x_in_lcu = (x & ((LCU_WIDTH)-1));
 
-    if(!chroma_halfpel) {
-      // Copy Chroma
-      // TODO: chroma fractional pixel interpolation
-      for (y = ypos>>1; y < (ypos + width)>>1; y++) {
-        int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
-        coord_y = ((y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate
-        for (x = xpos>>1; x < (xpos + width)>>1; x++) {
-          int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
-          lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
-          lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
+          lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]];
+        }
+      }
+
+      if(!chroma_halfpel) {
+        // Copy Chroma
+        // TODO: chroma fractional pixel interpolation
+        for (y = ypos>>1; y < (ypos + width)>>1; y++) {
+          int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
+          coord_y = ((y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate
+          for (x = xpos>>1; x < (xpos + width)>>1; x++) {
+            int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
+            lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
+            lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
+          }
         }
       }
     }
diff --git a/src/inter.h b/src/inter.h
index d02749aa..f2c60884 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -37,4 +37,6 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i
                                         cu_info **b2,cu_info **a0,cu_info **a1, lcu_t *lcu);
 void inter_get_mv_cand(const encoder_state *encoder_state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info* cur_cu, lcu_t *lcu);
 uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu);
+void extend_borders(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, pixel *ref, int ref_width, int ref_height,
+                    int filterSize, int width, int height, int16_t *dst);
 #endif
diff --git a/src/search.c b/src/search.c
index a5e69242..2f3b1b75 100644
--- a/src/search.c
+++ b/src/search.c
@@ -72,6 +72,16 @@ const vector2d small_hexbs[5] = {
   { -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 }
 };
 
+/*
+ *  6 7 8   Index layout of the 3x3 pattern below;
+ *  3 4 5   index 4 is the centre offset { 0, 0 }.
+ *  0 1 2
+ */
+const vector2d square[9] = {
+  { -1,  1 }, { 0,  1 }, { 1,  1 },
+  { -1,  0 }, { 0,  0 }, { 1,  0 },
+  { -1, -1 }, { 0, -1 }, { 1, -1 }
+};
 
 static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
 {
@@ -118,7 +128,7 @@ static uint32_t get_mvd_coding_cost(vector2d *mvd)
   return bitcost;
 }
 
-static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y,
+static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y, int mv_shift,
                          int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
                          int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
 {
@@ -129,8 +139,8 @@ static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y
   int8_t merged      = 0;
   int8_t cur_mv_cand = 0;
 
-  x <<= 2;
-  y <<= 2;
+  x <<= mv_shift;
+  y <<= mv_shift;
 
   // Check every candidate to find a match
   for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
@@ -205,7 +215,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
                              (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
                              (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
                              block_width, block_width);
-    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
     if (cost < best_cost) {
       best_cost    = cost;
@@ -220,7 +230,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
                              (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, 
                              (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
                              block_width, block_width);
-    cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+    cost += calc_mvd_cost(encoder_state, 0, 0, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
     // If the 0,0 is better, redo the hexagon around that point.
     if (cost < best_cost) {
@@ -236,7 +246,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
                                  (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
                                  (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
                                  block_width, block_width);
-        cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+        cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
         if (cost < best_cost) {
           best_cost    = cost;
@@ -271,7 +281,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
                                (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
                                (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
                                block_width, block_width);
-      cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+      cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
       if (cost < best_cost) {
         best_cost    = cost;
@@ -294,7 +304,7 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
                              (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
                              (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
                              block_width, block_width);
-    cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+    cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, 2,mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
     if (cost > 0 && cost < best_cost) {
       best_cost    = cost;
@@ -369,6 +379,139 @@ static unsigned search_mv_full(unsigned depth,
 }
 #endif
 
+/**
+ * \brief Refine an integer-pel motion vector to quarter-pel precision.
+ *
+ * Interpolates the reference block around the vector, then evaluates the
+ * 3x3 half-pel neighbourhood and, around the best half-pel position, the
+ * 3x3 quarter-pel neighbourhood. Cost is SATD/2 plus the mvd cost.
+ * \param mv_in_out    in: start vector (its two lowest bits are dropped),
+ *                     out: refined vector in quarter-pel units
+ * \param bitcost_out  out: bit cost from calc_mvd_cost for the result
+ * \return cost of the vector written to mv_in_out
+ */
+static unsigned search_frac( const encoder_state * const encoder_state,
+        unsigned depth,
+        const picture *pic, const picture *ref,
+        const vector2d *orig, vector2d *mv_in_out,
+        int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+        int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) {
+
+  //Set mv to integer-pel precision (input is treated as quarter-pel)
+  vector2d mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
+  int block_width = CU_WIDTH_FROM_DEPTH(depth);
+  unsigned best_cost = UINT32_MAX;
+  uint32_t best_bitcost = 0, bitcost;
+  unsigned i;
+  int y,x;
+  unsigned best_index = 0; // Index into square[] of the best candidate.
+  unsigned cost = 0;
+  cost_16bit_nxn_func satd = get_satd_16bit_nxn_func(block_width);
+  vector2d halfpel_offset;
+
+  #define FILTER_SIZE 8
+  #define HALF_FILTER (FILTER_SIZE>>1)
+
+  //create buffer for block + extra for filter
+  int src_stride = block_width+FILTER_SIZE+1;
+  int16_t src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
+  int16_t* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];
+
+  //destination buffer for interpolation
+  int dst_stride = (block_width+1)*4;
+  int16_t dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16];
+  //dst_off skips 4 rows and 4 columns of dst, so offsets of -2..2 stay inside
+  int16_t* dst_off = &dst[dst_stride*4+4];
+
+  //Original block and the candidate prediction, both block_width wide
+  pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
+  pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
+
+  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
+                encoder_state->tile->lcu_offset_x * LCU_WIDTH,
+                encoder_state->tile->lcu_offset_y * LCU_WIDTH,
+                ref->y_data, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);
+
+  filter_inter_quarterpel_luma(encoder_state->encoder_control, src_off, src_stride, block_width+1,
+      block_width+1, dst, dst_stride, 1, 1);
+
+  //The original block is the same for every candidate, so copy it once
+  for(y = 0; y < block_width; ++y) {
+    for(x = 0; x < block_width; ++x) {
+      tmp_pic[y*block_width+x] = (uint8_t)pic->y_data[orig->x+x + (orig->y+y)*pic->width];
+    }
+  }
+
+  //Set mv to half-pixel precision
+  mv.x <<= 1;
+  mv.y <<= 1;
+
+  // Search halfpel positions around best integer mv
+  for (i = 0; i < 9; ++i) {
+    const vector2d *pattern = &square[i];
+    for(y = 0; y < block_width; ++y) {
+      int dst_y = y*4+pattern->y*2;
+      for(x = 0; x < block_width; ++x) {
+        int dst_x = x*4+pattern->x*2;
+        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
+      }
+    }
+    cost = satd(tmp_pic,tmp_filtered);
+    cost = cost>>1;
+    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+    if (cost < best_cost) {
+      best_cost    = cost;
+      best_index   = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  //Set mv to best match
+  mv.x += square[best_index].x;
+  mv.y += square[best_index].y;
+
+  halfpel_offset.x = square[best_index].x*2;
+  halfpel_offset.y = square[best_index].y*2;
+
+  //Set mv to quarterpel precision
+  mv.x <<= 1;
+  mv.y <<= 1;
+
+  //Restart from the centre of square[] ({0,0}, index 4): the loop below only
+  //updates best_index on a strictly lower cost, so a stale half-pel index
+  //would otherwise be applied again as a quarter-pel offset.
+  best_index = 4;
+
+  //Search quarterpel points around best halfpel mv
+  for (i = 0; i < 9; ++i) {
+    const vector2d *pattern = &square[i];
+    for(y = 0; y < block_width; ++y) {
+      int dst_y = y*4+halfpel_offset.y+pattern->y;
+      for(x = 0; x < block_width; ++x) {
+        int dst_x = x*4+halfpel_offset.x+pattern->x;
+        tmp_filtered[y*block_width+x] = (uint8_t)dst_off[dst_y*dst_stride+dst_x];
+      }
+    }
+    cost = satd(tmp_pic,tmp_filtered);
+    cost = cost>>1;
+    cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+    if (cost < best_cost) {
+      best_cost    = cost;
+      best_index   = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  //Set mv to the final best match
+  mv.x += square[best_index].x;
+  mv.y += square[best_index].y;
+  mv_in_out->x = mv.x;
+  mv_in_out->y = mv.y;
+  *bitcost_out = best_bitcost;
+
+  return best_cost;
+}
+
 /**
  * Update lcu to have best modes at this depth.
  * \return Cost of best mode.
@@ -425,6 +568,8 @@ static int search_cu_inter(const encoder_state * const encoder_state, int x, int
     temp_cost += hexagon_search(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
 #endif
 
+    temp_cost = search_frac(encoder_state, depth, cur_pic, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+
     merged = 0;
     // Check every candidate to find a match
     for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {