From bf7542c35de0a52a48dd80281685efc372a8d94e Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 21 Jul 2015 12:02:54 +0300
Subject: [PATCH] Move functions from search to search_inter.

---
 src/search.c       | 1160 +------------------------------------------
 src/search_inter.c | 1174 ++++++++++++++++++++++++++++++++++++++++++++
 src/search_inter.h |    4 +
 3 files changed, 1179 insertions(+), 1159 deletions(-)

diff --git a/src/search.c b/src/search.c
index 0bacbf80..6c481c29 100644
--- a/src/search.c
+++ b/src/search.c
@@ -40,9 +40,7 @@
 #include "rdo.h"
 #include "transform.h"
 #include "encoder.h"
-
-// Temporarily for debugging.
-#define SEARCH_MV_FULL_RADIUS 0
+#include "search_inter.h"
 
 #define IN_FRAME(x, y, width, height, block_width, block_height) \
   ((x) >= 0 && (y) >= 0 \
@@ -78,1162 +76,6 @@
 # define TRSKIP_RATIO 1.7
 #endif
 
-/**
- * This is used in the hexagon_search to select 3 points to search.
- *
- * The start of the hexagonal pattern has been repeated at the end so that
- * the indices between 1-6 can be used as the start of a 3-point list of new
- * points to search.
- *
- *   6 o-o 1 / 7
- *    /   \
- * 5 o  0  o 2 / 8
- *    \   /
- *   4 o-o 3
- */
-const vector2d_t large_hexbs[10] = {
-  { 0, 0 },
-  { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 }, { -1, -2 },
-  { 1, -2 }, { 2, 0 }
-};
-
-/**
- * This is used as the last step of the hexagon search.
- */
-const vector2d_t small_hexbs[5] = {
-  { 0, 0 },
-  { -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 }
-};
-
-/*
- *  6 7 8
- *  3 4 5
- *  0 1 2
- */
-const vector2d_t square[9] = {
-  { -1, 1 },
-  { 0, 1 }, { 1, 1 }, { -1, 0 }, { 0, 0 }, { 1, 0 }, { -1, -1 },
-  { 0, -1 }, { 1, -1 }
-};
-
-static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
-{
-  int32_t num_bins = 0;
-  while (symbol >= (uint32_t)(1 << count)) {
-    ++num_bins;
-    symbol -= 1 << count;
-    ++count;
-  }
-  num_bins ++;
-
-  return num_bins;
-}
-
-static uint32_t get_mvd_coding_cost(vector2d_t *mvd)
-{
-  uint32_t bitcost = 0;
-  const int32_t mvd_hor = mvd->x;
-  const int32_t mvd_ver = mvd->y;
-  const int8_t hor_abs_gr0 = mvd_hor != 0;
-  const int8_t ver_abs_gr0 = mvd_ver != 0;
-  const uint32_t mvd_hor_abs = abs(mvd_hor);
-  const uint32_t mvd_ver_abs = abs(mvd_ver);
-
-  // Greater than 0 for x/y
-  bitcost += 2;
-
-  if (hor_abs_gr0) {
-    if (mvd_hor_abs > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs-2, 1) - 2; // TODO: tune the costs
-    }
-    // Greater than 1 + sign
-    bitcost += 2;
-  }
-
-  if (ver_abs_gr0) {
-    if (mvd_ver_abs > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs-2, 1) - 2; // TODO: tune the costs
-    }
-    // Greater than 1 + sign
-    bitcost += 2;
-  }
-
-  return bitcost;
-}
-
-static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int mv_shift,
-                         int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                         int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
-{
-  uint32_t temp_bitcost = 0;
-  uint32_t merge_idx;
-  int cand1_cost,cand2_cost;
-  vector2d_t mvd_temp1, mvd_temp2;
-  int8_t merged      = 0;
-  int8_t cur_mv_cand = 0;
-
-  x <<= mv_shift;
-  y <<= mv_shift;
-
-  // Check every candidate to find a match
-  for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
-    if (merge_cand[merge_idx].dir == 3) continue;
-    if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x &&
-        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y &&
-        merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
-      temp_bitcost += merge_idx;
-      merged = 1;
-      break;
-    }
-  }
-
-  // Check mvd cost only if mv is not merged
-  if(!merged) {
-    mvd_temp1.x = x - mv_cand[0][0];
-    mvd_temp1.y = y - mv_cand[0][1];
-    cand1_cost = get_mvd_coding_cost(&mvd_temp1);
-
-    mvd_temp2.x = x - mv_cand[1][0];
-    mvd_temp2.y = y - mv_cand[1][1];
-    cand2_cost = get_mvd_coding_cost(&mvd_temp2);
-
-    // Select candidate 1 if it has lower cost
-    if (cand2_cost < cand1_cost) {
-      cur_mv_cand = 1;
-    }
-    temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost;
-  }
-  *bitcost = temp_bitcost;
-  return temp_bitcost*(int32_t)(state->global->cur_lambda_cost_sqrt+0.5);
-}
-
-unsigned tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
-                           const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
-                           int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                           int block_width, int max_lcu_below)
-{
-  int n_points;
-  int best_index = -1;
-  int i;
-  
-  vector2d_t mv_best = { 0, 0 };
-
-  assert(pattern_type < 4);
-
-  //implemented search patterns
-  vector2d_t pattern[4][8] = {
-      //diamond (8 points)
-      //[ ][ ][ ][ ][1][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][8][ ][ ][ ][5][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[4][ ][ ][ ][o][ ][ ][ ][2]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][7][ ][ ][ ][6][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][3][ ][ ][ ][ ]
-      {
-        { 0, iDist }, { iDist, 0 }, { 0, -iDist }, { -iDist, 0 },
-        { iDist / 2, iDist / 2 }, { iDist / 2, -iDist / 2 }, { -iDist / 2, -iDist / 2 }, { -iDist / 2, iDist / 2 }
-      },
-
-      //square (8 points)
-      //[8][ ][ ][ ][1][ ][ ][ ][2]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[7][ ][ ][ ][o][ ][ ][ ][3]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[6][ ][ ][ ][5][ ][ ][ ][4]
-      {
-        { 0, iDist }, { iDist, iDist }, { iDist, 0 }, { iDist, -iDist }, { 0, -iDist },
-        { -iDist, -iDist }, { -iDist, 0 }, { -iDist, iDist }
-      },
-
-      //octagon (8 points)
-      //[ ][ ][5][ ][ ][ ][1][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][2]
-      //[4][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][o][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[8][ ][ ][ ][ ][ ][ ][ ][6]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][7][ ][ ][ ][3][ ][ ]
-      {
-        { iDist / 2, iDist }, { iDist, iDist / 2 }, { iDist / 2, -iDist }, { -iDist, iDist / 2 },
-        { -iDist / 2, iDist }, { iDist, -iDist / 2 }, { -iDist / 2, -iDist }, { -iDist, -iDist / 2 }
-      },
-
-      //hexagon (6 points)
-      //[ ][ ][5][ ][ ][ ][1][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[4][ ][ ][ ][o][ ][ ][ ][2]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
-      //[ ][ ][6][ ][ ][ ][3][ ][ ]
-      {
-        { iDist / 2, iDist }, { iDist, 0 }, { iDist / 2, -iDist }, { -iDist, 0 },
-        { iDist / 2, iDist }, { -iDist / 2, -iDist }, { 0, 0 }, { 0, 0 }
-      }
-
-  };
-
-  //set the number of points to be checked
-  if (iDist == 1)
-  {
-    switch (pattern_type)
-    {
-      case 0:
-        n_points = 4;
-        break;
-      case 2:
-        n_points = 4;
-        break;
-      case 3:
-        n_points = 4;
-        break;
-      default:
-        n_points = 8;
-        break;
-    };
-  }
-  else
-  {
-    switch (pattern_type)
-    {
-      case 3:
-        n_points = 6;
-        break;
-      default:
-        n_points = 8;
-        break;
-    };
-  }
-
-  //compute SAD values for all chosen points
-  for (i = 0; i < n_points; i++)
-  {
-    vector2d_t *current = &pattern[pattern_type][i];
-    unsigned cost;
-    uint32_t bitcost;
-
-    {
-      PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-      cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
-                            block_width, block_width, max_lcu_below);
-      cost += calc_mvd_cost(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-      PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
-        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
-        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x + block_width,
-        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
-        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y + block_width);
-    }
-
-    if (cost < best_cost)
-    {
-      best_cost = cost;
-      *best_bitcost = bitcost;
-      best_index = i;
-    }
-
-  }
-
-  if (best_index >= 0)
-  {
-    mv_best = pattern[pattern_type][best_index];
-    *best_dist = iDist;
-  }
-  
-  mv->x += mv_best.x;
-  mv->y += mv_best.y;
-
-  return best_cost;
-
-}
-
-unsigned tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
-                          const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
-                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                          int block_width, int iSearchRange, int iRaster, int max_lcu_below)
-{
-  int i;
-  int k;
-
-  vector2d_t mv_best = { 0, 0 };
-  
-  //compute SAD values for every point in the iRaster downsampled version of the current search area
-  for (i = iSearchRange; i >= -iSearchRange; i -= iRaster)
-  {
-    for (k = -iSearchRange; k <= iSearchRange; k += iRaster)
-    {
-      vector2d_t current = { k, i };
-      unsigned cost;
-      uint32_t bitcost;
-
-      {
-        PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-        cost = image_calc_sad(pic, ref, orig->x, orig->y,
-          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
-          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
-          block_width, block_width, max_lcu_below);
-        cost += calc_mvd_cost(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-        PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
-          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
-          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k + block_width,
-          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
-          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i + block_width);
-      }
-
-      if (cost < best_cost)
-      {
-        best_cost = cost;
-        *best_bitcost = bitcost;
-        mv_best = current;
-      }
-
-    }
-  }
-  
-  mv->x += mv_best.x;
-  mv->y += mv_best.y;
-
-  return best_cost;
-
-}
-
-static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
-                          const kvz_picture *pic, const kvz_picture *ref,
-                          const vector2d_t *orig, vector2d_t *mv_in_out,
-                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                          int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
-{
-
-  //TZ parameters
-  const int iSearchRange = 96;  // search range for each stage
-  const int iRaster = 5;  // search distance limit and downsampling factor for step 3                   
-  const unsigned step2_type = 0;  // search patterns for steps 2 and 4
-  const unsigned step4_type = 0;
-  const bool bRasterRefinementEnable = true;  // enable step 4 mode 1
-  const bool bStarRefinementEnable = false;   // enable step 4 mode 2 (only one mode will be executed)
-  
-  const int block_width = CU_WIDTH_FROM_DEPTH(depth);
-
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0;
-  int iDist;
-  int best_dist = 0;
-  unsigned best_index = num_cand;
-  int max_lcu_below = -1;
-
-  if (state->encoder_control->owf) {
-    max_lcu_below = 1;
-  }
-
-  //step 1, compare (0,0) vector to predicted vectors
-  
-  // Check whatever input vector we got, unless its (0, 0) which will be checked later.
-  if (mv.x || mv.y) 
-  {
-    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-
-    best_cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                        block_width, block_width, max_lcu_below);
-    best_cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);
-
-    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
-  }
-
-  int i;
-  // Select starting point from among merge candidates. These should include
-  // both mv_cand vectors and (0, 0).
-  for (i = 0; i < num_cand; ++i) 
-  {
-    if (merge_cand[i].dir == 3) continue;
-    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
-    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
-
-    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-
-	  uint32_t bitcost;
-    unsigned cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   block_width, block_width, max_lcu_below);
-    cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
-
-    if (cost < best_cost) {
-      best_cost = cost;
-      best_index = i;
-      best_bitcost = bitcost;
-    }
-  }
-  
-  if (best_index < (unsigned)num_cand) {
-    mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2;
-    mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2;
-  } else {
-    mv.x = mv_in_out->x >> 2;
-    mv.y = mv_in_out->y >> 2;
-  }
-
-  //step 2, grid search
-  for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
-  {
-    best_cost = tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
-                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
-  }
-
-  //step 3, raster scan
-  if (best_dist > iRaster)
-  {
-    best_dist = iRaster;
-
-    best_cost = tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand, 
-                                 num_cand, ref_idx, &best_bitcost, block_width, iSearchRange, iRaster, max_lcu_below);
-  }
-
-  //step 4
-
-  //raster refinement
-  if (bRasterRefinementEnable && best_dist > 0)
-  {
-    iDist = best_dist >> 1;
-    while (iDist > 0)
-    {
-      best_cost = tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
-
-      iDist = iDist >> 1;
-    }
-  }
-
-  //star refinement (repeat step 2 for the current starting point)
-  if (bStarRefinementEnable && best_dist > 0)
-  {
-    for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
-    {
-      best_cost = tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
-    }
-  }
-
-  mv.x = mv.x << 2;
-  mv.y = mv.y << 2;
-
-  *mv_in_out = mv;
-  *bitcost_out = best_bitcost;
-
-  return best_cost;
-}
-
-/**
- * \brief Do motion search using the HEXBS algorithm.
- *
- * \param depth      log2 depth of the search
- * \param pic        Picture motion vector is searched for.
- * \param ref        Picture motion vector is searched from.
- * \param orig       Top left corner of the searched for block.
- * \param mv_in_out  Predicted mv in and best out. Quarter pixel precision.
- *
- * \returns  Cost of the motion vector.
- *
- * Motion vector is searched by first searching iteratively with the large
- * hexagon pattern until the best match is at the center of the hexagon.
- * As a final step a smaller hexagon is used to check the adjacent pixels.
- *
- * If a non 0,0 predicted motion vector predictor is given as mv_in_out,
- * the 0,0 vector is also tried. This is hoped to help in the case where
- * the predicted motion vector is way off. In the future even more additional
- * points like 0,0 might be used, such as vectors from top or left.
- */
-static unsigned hexagon_search(const encoder_state_t * const state, unsigned depth,
-                               const kvz_picture *pic, const kvz_picture *ref,
-                               const vector2d_t *orig, vector2d_t *mv_in_out,
-                               int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                               int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
-{
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-  int block_width = CU_WIDTH_FROM_DEPTH(depth);
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0, bitcost;
-  unsigned i;
-  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
-  int max_lcu_below = -1;
-  
-  if (state->encoder_control->owf) {
-    max_lcu_below = 1;
-  }
-
-  // Check mv_in, if it's not in merge candidates.
-  bool mv_in_merge_cand = false;
-  for (int i = 0; i < num_cand; ++i) {
-    if (merge_cand[i].dir == 3) continue;
-    if (merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2 == mv.x &&
-        merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2 == mv.y) {
-      mv_in_merge_cand = true;
-      break;
-    }
-  }
-
-  if (!mv_in_merge_cand) {
-    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-
-    best_cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                        block_width, block_width, max_lcu_below);
-    best_cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-    best_bitcost = bitcost;
-    best_index = num_cand; 
-
-    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
-  }
-
-  // Select starting point from among merge candidates. These should include
-  // both mv_cand vectors and (0, 0).
-  for (i = 0; i < num_cand; ++i) {
-    if (merge_cand[i].dir == 3) continue;
-    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
-    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
-
-    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-
-    unsigned cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   block_width, block_width, max_lcu_below);
-    cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
-
-    if (cost < best_cost) {
-      best_cost = cost;
-      best_index = i;
-      best_bitcost = bitcost;
-    }
-  }
-  if (best_index < num_cand) {
-    mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2;
-    mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2;
-  } else {
-    mv.x = mv_in_out->x >> 2;
-    mv.y = mv_in_out->y >> 2;
-  }
-  
-  // Search the initial 7 points of the hexagon.
-  best_index = 0;
-  for (i = 0; i < 7; ++i) {
-    const vector2d_t *pattern = &large_hexbs[i];
-    unsigned cost;
-    {
-      PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-      cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
-                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
-                             block_width, block_width, max_lcu_below);
-      cost += calc_mvd_cost(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
-
-      PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, 
-                              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
-                              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x + block_width, 
-                              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, 
-                              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y + block_width);
-    }
-
-    if (cost < best_cost) {
-      best_cost    = cost;
-      best_index   = i;
-      best_bitcost = bitcost;
-    }
-  }
-
-  // Iteratively search the 3 new points around the best match, until the best
-  // match is in the center.
-  while (best_index != 0) {
-    unsigned start; // Starting point of the 3 offsets to be searched.
-    if (best_index == 1) {
-      start = 6;
-    } else if (best_index == 8) {
-      start = 1;
-    } else {
-      start = best_index - 1;
-    }
-
-    // Move the center to the best match.
-    mv.x += large_hexbs[best_index].x;
-    mv.y += large_hexbs[best_index].y;
-    best_index = 0;
-
-    // Iterate through the next 3 points.
-    for (i = 0; i < 3; ++i) {
-      const vector2d_t *offset = &large_hexbs[start + i];
-      unsigned cost;
-      {
-        PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-        cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                               (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
-                               (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                               block_width, block_width, max_lcu_below);
-        cost += calc_mvd_cost(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
-        PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, 
-              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
-              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x + block_width, 
-              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, 
-              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y + block_width);
-      }
-
-      if (cost < best_cost) {
-        best_cost    = cost;
-        best_index   = start + i;
-        best_bitcost = bitcost;
-      }
-      ++offset;
-    }
-  }
-
-  // Move the center to the best match.
-  mv.x += large_hexbs[best_index].x;
-  mv.y += large_hexbs[best_index].y;
-  best_index = 0;
-
-  // Do the final step of the search with a small pattern.
-  for (i = 1; i < 5; ++i) {
-    const vector2d_t *offset = &small_hexbs[i];
-    unsigned cost;
-    {
-      PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
-      cost = image_calc_sad(pic, ref, orig->x, orig->y,
-                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
-                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                             block_width, block_width, max_lcu_below);
-      cost += calc_mvd_cost(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
-      PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, 
-            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
-            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x + block_width, 
-            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, 
-            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y + block_width);
-    }
-
-    if (cost > 0 && cost < best_cost) {
-      best_cost    = cost;
-      best_index   = i;
-      best_bitcost = bitcost;
-    }
-  }
-
-  // Adjust the movement vector according to the final best match.
-  mv.x += small_hexbs[best_index].x;
-  mv.y += small_hexbs[best_index].y;
-
-  // Return final movement vector in quarter-pixel precision.
-  mv_in_out->x = mv.x << 2;
-  mv_in_out->y = mv.y << 2;
-
-  *bitcost_out = best_bitcost;
-
-  return best_cost;
-}
-
-
-#if SEARCH_MV_FULL_RADIUS
-static unsigned search_mv_full(unsigned depth,
-                               const picture *pic, const picture *ref,
-                               const vector2d *orig, vector2d *mv_in_out,
-                               int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
-                               int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
-{
-  vector2d mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-  int block_width = CU_WIDTH_FROM_DEPTH(depth);
-  unsigned best_cost = UINT32_MAX;
-  int x, y;
-  uint32_t best_bitcost = 0, bitcost;
-  vector2d min_mv, max_mv;
-
-  /*if (abs(mv.x) > SEARCH_MV_FULL_RADIUS || abs(mv.y) > SEARCH_MV_FULL_RADIUS) {
-    best_cost = calc_sad(pic, ref, orig->x, orig->y,
-                         orig->x, orig->y,
-                         block_width, block_width);
-    mv.x = 0;
-    mv.y = 0;
-  }*/
-
-  min_mv.x = mv.x - SEARCH_MV_FULL_RADIUS;
-  min_mv.y = mv.y - SEARCH_MV_FULL_RADIUS;
-  max_mv.x = mv.x + SEARCH_MV_FULL_RADIUS;
-  max_mv.y = mv.y + SEARCH_MV_FULL_RADIUS;
-
-  for (y = min_mv.y; y < max_mv.y; ++y) {
-    for (x = min_mv.x; x < max_mv.x; ++x) {
-      unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
-                               orig->x + x,
-                               orig->y + y,
-                               block_width, block_width);
-      cost += calc_mvd_cost(x, y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
-      if (cost < best_cost) {
-        best_cost    = cost;
-        best_bitcost = bitcost;
-        mv.x = x;
-        mv.y = y;
-      }
-    }
-  }
-
-  mv_in_out->x = mv.x << 2;
-  mv_in_out->y = mv.y << 2;
-
-  *bitcost_out = best_bitcost;
-
-  return best_cost;
-}
-#endif
-
-/**
- * \brief Do fractional motion estimation
- *
- * \param depth      log2 depth of the search
- * \param pic        Picture motion vector is searched for.
- * \param ref        Picture motion vector is searched from.
- * \param orig       Top left corner of the searched for block.
- * \param mv_in_out  Predicted mv in and best out. Quarter pixel precision.
- *
- * \returns  Cost of the motion vector.
- *
- * Algoritm first searches 1/2-pel positions around integer mv and after best match is found,
- * refines the search by searching best 1/4-pel postion around best 1/2-pel position.
- */
-static unsigned search_frac(const encoder_state_t * const state,
-                            unsigned depth,
-                            const kvz_picture *pic, const kvz_picture *ref,
-                            const vector2d_t *orig, vector2d_t *mv_in_out,
-                            int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                            int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
-{
-
-  //Set mv to halfpel precision
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-  int block_width = CU_WIDTH_FROM_DEPTH(depth);
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0, bitcost;
-  unsigned i;
-  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
-
-  unsigned cost = 0;
-
-  cost_pixel_nxn_func *satd = pixels_get_satd_func(block_width);
-
-  vector2d_t halfpel_offset;
-
-  #define FILTER_SIZE 8
-  #define HALF_FILTER (FILTER_SIZE>>1)
-
-  //create buffer for block + extra for filter
-  int src_stride = block_width+FILTER_SIZE+1;
-  kvz_pixel src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
-  kvz_pixel* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];
-
-  //destination buffer for interpolation
-  int dst_stride = (block_width+1)*4;
-  kvz_pixel dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16];
-  kvz_pixel* dst_off = &dst[dst_stride*4+4];
-
-  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
-                state->tile->lcu_offset_x * LCU_WIDTH,
-                state->tile->lcu_offset_y * LCU_WIDTH,
-                ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);
-
-  filter_inter_quarterpel_luma(state->encoder_control, src_off, src_stride, block_width+1,
-      block_width+1, dst, dst_stride, 1, 1);
-
-
-  //Set mv to half-pixel precision
-  mv.x <<= 1;
-  mv.y <<= 1;
-
-  // Search halfpel positions around best integer mv
-  for (i = 0; i < 9; ++i) {
-    const vector2d_t *pattern = &square[i];
-
-    kvz_pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
-    kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
-
-    int y,x;
-    for(y = 0; y < block_width; ++y) {
-      int dst_y = y*4+pattern->y*2;
-      for(x = 0; x < block_width; ++x) {
-        int dst_x = x*4+pattern->x*2;
-        tmp_filtered[y*block_width+x] = dst_off[dst_y*dst_stride+dst_x];
-        tmp_pic[y*block_width+x] = pic->y[orig->x+x + (orig->y+y)*pic->width];
-      }
-    }
-
-    cost = satd(tmp_pic,tmp_filtered);
-
-    cost += calc_mvd_cost(state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
-
-    if (cost < best_cost) {
-      best_cost    = cost;
-      best_index   = i;
-      best_bitcost = bitcost;
-
-    }
-  }
-
-  //Set mv to best match
-  mv.x += square[best_index].x;
-  mv.y += square[best_index].y;
-
-  halfpel_offset.x = square[best_index].x*2;
-  halfpel_offset.y = square[best_index].y*2;
-
-  //Set mv to quarterpel precision
-  mv.x <<= 1;
-  mv.y <<= 1;
-
-  //Search quarterpel points around best halfpel mv
-  for (i = 0; i < 9; ++i) {
-    const vector2d_t *pattern = &square[i];
-
-    kvz_pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
-    kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
-
-    int y,x;
-    for(y = 0; y < block_width; ++y) {
-      int dst_y = y*4+halfpel_offset.y+pattern->y;
-      for(x = 0; x < block_width; ++x) {
-        int dst_x = x*4+halfpel_offset.x+pattern->x;
-        tmp_filtered[y*block_width+x] = dst_off[dst_y*dst_stride+dst_x];
-        tmp_pic[y*block_width+x] = pic->y[orig->x+x + (orig->y+y)*pic->width];
-      }
-    }
-
-    cost = satd(tmp_pic,tmp_filtered);
-
-    cost += calc_mvd_cost(state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
-
-    if (cost < best_cost) {
-      best_cost    = cost;
-      best_index   = i;
-      best_bitcost = bitcost;
-    }
-  }
-
-  //Set mv to best final best match
-  mv.x += square[best_index].x;
-  mv.y += square[best_index].y;
-
-  mv_in_out->x = mv.x;
-  mv_in_out->y = mv.y;
-
-  *bitcost_out = best_bitcost;
-
-
-  return best_cost;
-
-}
-
-/**
- * Update lcu to have best modes at this depth.
- * \return Cost of best mode.
- */
-static int search_cu_inter(const encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu)
-{
-  const videoframe_t * const frame = state->tile->frame;
-  uint32_t ref_idx = 0;
-  int x_local = (x&0x3f), y_local = (y&0x3f);
-  int x_cu = x>>3;
-  int y_cu = y>>3;
-  int cu_pos = LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH;
-
-  cu_info_t *cur_cu = &lcu->cu[cu_pos];
-
-  int16_t mv_cand[2][2];
-  // Search for merge mode candidate
-  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS];
-  // Get list of candidates
-  int16_t num_cand = inter_get_merge_cand(state, x, y, depth, merge_cand, lcu);
-
-  int max_lcu_below = -1;
-  
-  if (state->encoder_control->owf) {
-    max_lcu_below = 1;
-  }
-
-  // Default to candidate 0
-  cur_cu->inter.mv_cand[0] = 0;
-  cur_cu->inter.mv_cand[1] = 0;
-
-  cur_cu->inter.cost = UINT_MAX;
-
-  for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
-    kvz_picture *ref_image = state->global->ref->images[ref_idx];
-    uint32_t temp_bitcost = 0;
-    uint32_t temp_cost = 0;
-    vector2d_t orig, mvd;
-    int32_t merged = 0;
-    uint8_t cu_mv_cand = 0;
-    int8_t merge_idx = 0;
-    int8_t ref_list = state->global->refmap[ref_idx].list-1;
-    int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
-    orig.x = x_cu * CU_MIN_SIZE_PIXELS;
-    orig.y = y_cu * CU_MIN_SIZE_PIXELS;
-    // Get MV candidates
-    cur_cu->inter.mv_ref[ref_list] = ref_idx;
-    inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, ref_list);
-    cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
-
-    vector2d_t mv = { 0, 0 };
-    {
-      // Take starting point for MV search from previous frame.
-      // When temporal motion vector candidates are added, there is probably
-      // no point to this anymore, but for now it helps.
-      int mid_x_cu = (x + (LCU_WIDTH >> (depth+1))) / 8;
-      int mid_y_cu = (y + (LCU_WIDTH >> (depth+1))) / 8;
-      cu_info_t *ref_cu = &state->global->ref->cu_arrays[ref_idx]->data[mid_x_cu + mid_y_cu * (frame->width_in_lcu << MAX_DEPTH)];
-      if (ref_cu->type == CU_INTER) {
-        if (ref_cu->inter.mv_dir & 1) {
-          mv.x = ref_cu->inter.mv[0][0];
-          mv.y = ref_cu->inter.mv[0][1];
-        } else {
-          mv.x = ref_cu->inter.mv[1][0];
-          mv.y = ref_cu->inter.mv[1][1];
-        }
-      }
-    }
-
-#if SEARCH_MV_FULL_RADIUS
-    temp_cost += search_mv_full(depth, frame, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-#else
-    switch (state->encoder_control->cfg->ime_algorithm) {
-      case KVZ_IME_TZ:
-        temp_cost += tz_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-        break;
-
-      default:
-        temp_cost += hexagon_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-        break;
-      }
-#endif
-    if (state->encoder_control->cfg->fme_level > 0) {
-      temp_cost = search_frac(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-    }
-
-    merged = 0;
-    // Check every candidate to find a match
-    for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
-      if (merge_cand[merge_idx].dir != 3 &&
-          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x &&
-          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y &&          
-          (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
-        merged = 1;
-        break;
-      }
-    }
-
-    // Only check when candidates are different
-    if (!merged && (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
-      vector2d_t mvd_temp1, mvd_temp2;
-      int cand1_cost,cand2_cost;
-
-      mvd_temp1.x = mv.x - mv_cand[0][0];
-      mvd_temp1.y = mv.y - mv_cand[0][1];
-      cand1_cost = get_mvd_coding_cost(&mvd_temp1);
-
-      mvd_temp2.x = mv.x - mv_cand[1][0];
-      mvd_temp2.y = mv.y - mv_cand[1][1];
-      cand2_cost = get_mvd_coding_cost(&mvd_temp2);
-
-      // Select candidate 1 if it has lower cost
-      if (cand2_cost < cand1_cost) {
-        cu_mv_cand = 1;
-      }
-    }
-    mvd.x = mv.x - mv_cand[cu_mv_cand][0];
-    mvd.y = mv.y - mv_cand[cu_mv_cand][1];
-
-    if(temp_cost < cur_cu->inter.cost) {
-
-      // Map reference index to L0/L1 pictures
-      cur_cu->inter.mv_dir = ref_list+1;
-      cur_cu->inter.mv_ref_coded[ref_list] = state->global->refmap[ref_idx].idx;
-
-      cur_cu->merged        = merged;
-      cur_cu->merge_idx     = merge_idx;
-      cur_cu->inter.mv_ref[ref_list] = ref_idx;
-      cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
-      cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
-      cur_cu->inter.mvd[ref_list][0] = (int16_t)mvd.x;
-      cur_cu->inter.mvd[ref_list][1] = (int16_t)mvd.y;
-      cur_cu->inter.cost    = temp_cost;
-      cur_cu->inter.bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[ref_list];
-      cur_cu->inter.mv_cand[ref_list] = cu_mv_cand;
-    }
-  }
-
-  // Search bi-pred positions
-  if (state->global->slicetype == SLICE_B && state->encoder_control->cfg->bipred) {
-    lcu_t *templcu = MALLOC(lcu_t, 1);
-    cost_pixel_nxn_func *satd = pixels_get_satd_func(LCU_WIDTH >> depth);
-    #define NUM_PRIORITY_LIST 12;
-    static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
-    static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
-    uint8_t cutoff = num_cand;
-    for (int32_t idx = 0; idx<cutoff*(cutoff - 1); idx++) {
-      uint8_t i = priorityList0[idx];
-      uint8_t j = priorityList1[idx];
-      if (i >= num_cand || j >= num_cand) break;
-
-      // Find one L0 and L1 candidate according to the priority list
-      if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) {
-        if (merge_cand[i].ref[0] != merge_cand[j].ref[1] ||
-          merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] ||
-          merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) {
-          uint32_t bitcost[2];
-          uint32_t cost = 0;
-          int8_t cu_mv_cand = 0;
-          int16_t mv[2][2];
-          kvz_pixel tmp_block[64 * 64];
-          kvz_pixel tmp_pic[64 * 64];
-          // Force L0 and L1 references
-          if (state->global->refmap[merge_cand[i].ref[0]].list == 2 || state->global->refmap[merge_cand[j].ref[1]].list == 1) continue;
-
-          // TODO: enable fractional pixel bipred search
-          mv[0][0] = merge_cand[i].mv[0][0] & 0xfff8;
-          mv[0][1] = merge_cand[i].mv[0][1] & 0xfff8;
-          mv[1][0] = merge_cand[j].mv[1][0] & 0xfff8;
-          mv[1][1] = merge_cand[j].mv[1][1] & 0xfff8;
-
-          // Check boundaries when using owf to process multiple frames at the same time
-          if (max_lcu_below >= 0) {
-            // When SAO is off, row is considered reconstructed when the last LCU
-            // is done, although the bottom 2 pixels might still need deblocking.
-            // To work around this, add 2 luma pixels to the reach of the mv
-            // in order to avoid referencing those possibly non-deblocked pixels.
-            int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-            int cur_lcu_row = y / LCU_WIDTH;
-            if (mv_lcu_row_reach_1 > cur_lcu_row + max_lcu_below || mv_lcu_row_reach_2 > cur_lcu_row + max_lcu_below) {
-              continue;
-            }
-          }
-
-          inter_recon_lcu_bipred(state, state->global->ref->images[merge_cand[i].ref[0]], state->global->ref->images[merge_cand[j].ref[1]], x, y, LCU_WIDTH >> depth, mv, templcu);
-
-          for (int ypos = 0; ypos < LCU_WIDTH >> depth; ++ypos) {
-            int dst_y = ypos*(LCU_WIDTH >> depth);
-            for (int xpos = 0; xpos < (LCU_WIDTH >> depth); ++xpos) {
-              tmp_block[dst_y + xpos] = templcu->rec.y[((y + ypos)&(LCU_WIDTH - 1))*LCU_WIDTH + ((x + xpos)&(LCU_WIDTH - 1))];              
-              tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width];
-            }
-          }
-
-          cost = satd(tmp_pic, tmp_block);
-
-          // TODO: enable fractional pixel bipred search
-          cost += calc_mvd_cost(state, merge_cand[i].mv[0][0] & 0xfff8, merge_cand[i].mv[0][1] & 0xfff8, 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[0]);
-          cost += calc_mvd_cost(state, merge_cand[i].mv[1][0] & 0xfff8, merge_cand[i].mv[1][1] & 0xfff8, 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[1]);
-
-          if (cost < cur_cu->inter.cost) {
-
-            cur_cu->inter.mv_dir = 3;
-            cur_cu->inter.mv_ref_coded[0] = state->global->refmap[merge_cand[i].ref[0]].idx;
-            cur_cu->inter.mv_ref_coded[1] = state->global->refmap[merge_cand[j].ref[1]].idx;
-
-
-
-            cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
-            cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
-
-            // TODO: enable fractional pixel bipred search
-            cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0] & 0xfff8;
-            cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1] & 0xfff8;
-            cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0] & 0xfff8;
-            cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1] & 0xfff8;
-            cur_cu->merged = 0;
-                        
-            // Check every candidate to find a match
-            for(int merge_idx = 0; merge_idx < num_cand; merge_idx++) {
-              if (
-                  merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
-                  merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&     
-                  merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
-                  merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&    
-                  merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && 
-                  merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) {
-                cur_cu->merged = 1;
-                cur_cu->merge_idx = merge_idx;
-                break;
-              }
-            }
-
-            // Each motion vector has its own candidate
-            for (int reflist = 0; reflist < 2; reflist++) {
-              cu_mv_cand = 0;
-              inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, reflist);
-              if ((mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
-                vector2d_t mvd_temp1, mvd_temp2;
-                int cand1_cost, cand2_cost;
-
-                mvd_temp1.x = cur_cu->inter.mv[reflist][0] - mv_cand[0][0];
-                mvd_temp1.y = cur_cu->inter.mv[reflist][1] - mv_cand[0][1];
-                cand1_cost = get_mvd_coding_cost(&mvd_temp1);
-
-                mvd_temp2.x = cur_cu->inter.mv[reflist][0] - mv_cand[1][0];
-                mvd_temp2.y = cur_cu->inter.mv[reflist][1] - mv_cand[1][1];
-                cand2_cost = get_mvd_coding_cost(&mvd_temp2);
-
-                // Select candidate 1 if it has lower cost
-                if (cand2_cost < cand1_cost) {
-                  cu_mv_cand = 1;                  
-                }
-              }
-              cur_cu->inter.mvd[reflist][0] = cur_cu->inter.mv[reflist][0] - mv_cand[cu_mv_cand][0];
-              cur_cu->inter.mvd[reflist][1] = cur_cu->inter.mv[reflist][1] - mv_cand[cu_mv_cand][1];
-              cur_cu->inter.mv_cand[reflist] = cu_mv_cand;
-            }
-            cur_cu->inter.cost = cost;
-            cur_cu->inter.bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[0] + cur_cu->inter.mv_ref_coded[1];
-          }
-        }
-      }
-    }
-    FREE_POINTER(templcu);
-  }
-
-  return cur_cu->inter.cost;
-}
-
 
 /**
  * Copy all non-reference CU data from depth+1 to depth.
diff --git a/src/search_inter.c b/src/search_inter.c
index d38999d9..bbf21c91 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -23,3 +23,1177 @@
 */
 
 #include "search_inter.h"
+
+#include <stdlib.h>
+
+#include "inter.h"
+#include "strategies/strategies-picture.h"
+#include "strategies/strategies-ipol.h"
+
+
+// Temporarily for debugging.
+#define SEARCH_MV_FULL_RADIUS 0
+
+
+/**
+ * This is used in the hexagon_search to select 3 points to search.
+ *
+ * The start of the hexagonal pattern has been repeated at the end so that
+ * the indices between 1-6 can be used as the start of a 3-point list of new
+ * points to search. Index 9 has no initializer, so it is {0, 0}.
+ *
+ *   6 o-o 1 / 7
+ *    /   \
+ * 5 o  0  o 2 / 8
+ *    \   /
+ *   4 o-o 3
+ */
+const vector2d_t large_hexbs[10] = {
+  { 0, 0 }, // 0: center
+  { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 }, { -1, -2 }, // 1-6: hexagon corners
+  { 1, -2 }, { 2, 0 } // 7-8: repeat of corners 1-2
+};
+
+/**
+ * This is used as the last step of the hexagon search: the center followed by four neighbours.
+ */
+const vector2d_t small_hexbs[5] = {
+  { 0, 0 }, // center
+  { -1, -1 }, { -1, 0 }, { 1, 0 }, { 1, 1 } // NOTE(review): two diagonal neighbours, no (0, -1) / (0, 1) -- confirm intended
+};
+
+/*
+ *  6 7 8    Index layout of square[]: index 4 is the center { 0, 0 }.
+ *  3 4 5    x grows to the right; y grows downwards in this picture,
+ *  0 1 2    so index 0 is { -1, 1 } and index 8 is { 1, -1 }.
+ */
+const vector2d_t square[9] = {
+  { -1, 1 }, // 0
+  { 0, 1 }, { 1, 1 }, { -1, 0 }, { 0, 0 }, { 1, 0 }, { -1, -1 }, // 1-6
+  { 0, -1 }, { 1, -1 } // 7-8
+};
+
+/** \brief Returns 1 + the number of steps needed to bring symbol below 2^count (each step subtracts 2^count, then increments count). */
+static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count)
+{
+  int32_t num_bins = 0;
+  while (symbol >= (uint32_t)(1 << count)) { // symbol does not fit in the current range of 2^count values
+    ++num_bins;
+    symbol -= 1 << count;
+    ++count; // the next range is twice as large
+  }
+  num_bins ++; // one more bin for the step that ended the loop
+  // NOTE(review): 1 << count is a signed shift; it overflows if count reaches 31 (symbol >= 2^31 - 2 when count starts at 1).
+  return num_bins;
+}
+
+/** \brief Estimate the number of bits needed to code the motion vector difference *mvd (read only, not modified). */
+static uint32_t get_mvd_coding_cost(vector2d_t *mvd)
+{
+  uint32_t bitcost = 0;
+  const int32_t mvd_hor = mvd->x;
+  const int32_t mvd_ver = mvd->y;
+  const int8_t hor_abs_gr0 = mvd_hor != 0;
+  const int8_t ver_abs_gr0 = mvd_ver != 0;
+  const uint32_t mvd_hor_abs = abs(mvd_hor);
+  const uint32_t mvd_ver_abs = abs(mvd_ver);
+
+  // Greater than 0 for x/y: always paid, one flag per component
+  bitcost += 2;
+
+  if (hor_abs_gr0) {
+    if (mvd_hor_abs > 1) {
+      bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs-2, 1) - 2; // TODO: tune the costs (term is -1 when the absolute value is 2)
+    }
+    // Greater than 1 + sign
+    bitcost += 2;
+  }
+
+  if (ver_abs_gr0) {
+    if (mvd_ver_abs > 1) {
+      bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs-2, 1) - 2; // TODO: tune the costs (term is -1 when the absolute value is 2)
+    }
+    // Greater than 1 + sign
+    bitcost += 2;
+  }
+
+  return bitcost; // 2 for a zero vector difference
+}
+
+/** \brief Bit cost of motion vector (x, y): merge index if it equals a merge candidate, else the cheaper MVD against mv_cand[0]/[1]. */
+static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int mv_shift,
+                         int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                         int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
+{
+  uint32_t temp_bitcost = 0;
+  uint32_t merge_idx;
+  int cand1_cost,cand2_cost;
+  vector2d_t mvd_temp1, mvd_temp2;
+  int8_t merged      = 0;
+  int8_t cur_mv_cand = 0;
+  // Scale x/y by 2^mv_shift before comparing; multiply because left-shifting a negative value is undefined.
+  x *= 1 << mv_shift;
+  y *= 1 << mv_shift;
+
+  // Check every candidate to find a match (bi-directional candidates, dir == 3, are skipped)
+  for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
+    if (merge_cand[merge_idx].dir == 3) continue;
+    if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x &&
+        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y &&
+        merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
+      temp_bitcost += merge_idx; // a merged vector costs only its candidate index
+      merged = 1;
+      break;
+    }
+  }
+
+  // Check mvd cost only if mv is not merged
+  if(!merged) {
+    mvd_temp1.x = x - mv_cand[0][0];
+    mvd_temp1.y = y - mv_cand[0][1];
+    cand1_cost = get_mvd_coding_cost(&mvd_temp1);
+
+    mvd_temp2.x = x - mv_cand[1][0];
+    mvd_temp2.y = y - mv_cand[1][1];
+    cand2_cost = get_mvd_coding_cost(&mvd_temp2);
+
+    // Select candidate 1 if it has lower cost
+    if (cand2_cost < cand1_cost) {
+      cur_mv_cand = 1;
+    }
+    temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost;
+  }
+  *bitcost = temp_bitcost; // raw bit estimate for the caller
+  return temp_bitcost*(int32_t)(state->global->cur_lambda_cost_sqrt+0.5); // bits weighted by the rounded cur_lambda_cost_sqrt
+}
+
+/** \brief Evaluate one TZ search pattern of radius iDist around *mv and move *mv to the best point found; returns the best cost. */
+unsigned tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
+                           const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
+                           int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
+                           int block_width, int max_lcu_below)
+{
+  int n_points;
+  int best_index = -1; // stays -1 when no point beats best_cost
+  int i;
+  // best_cost is the cost to beat; *mv, *best_dist and *best_bitcost only change when a point improves on it.
+  vector2d_t mv_best = { 0, 0 };
+
+  assert(pattern_type < 4);
+
+  // Implemented search patterns, indexed by pattern_type. 'o' marks *mv and up is +y in the diagrams.
+  vector2d_t pattern[4][8] = {
+      //diamond (8 points)
+      //[ ][ ][ ][ ][1][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][8][ ][ ][ ][5][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[4][ ][ ][ ][o][ ][ ][ ][2]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][7][ ][ ][ ][6][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][3][ ][ ][ ][ ]
+      {
+        { 0, iDist }, { iDist, 0 }, { 0, -iDist }, { -iDist, 0 },
+        { iDist / 2, iDist / 2 }, { iDist / 2, -iDist / 2 }, { -iDist / 2, -iDist / 2 }, { -iDist / 2, iDist / 2 }
+      },
+
+      //square (8 points)
+      //[8][ ][ ][ ][1][ ][ ][ ][2]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[7][ ][ ][ ][o][ ][ ][ ][3]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[6][ ][ ][ ][5][ ][ ][ ][4]
+      {
+        { 0, iDist }, { iDist, iDist }, { iDist, 0 }, { iDist, -iDist }, { 0, -iDist },
+        { -iDist, -iDist }, { -iDist, 0 }, { -iDist, iDist }
+      },
+
+      //octagon (8 points)
+      //[ ][ ][5][ ][ ][ ][1][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][2]
+      //[4][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][o][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[8][ ][ ][ ][ ][ ][ ][ ][6]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][7][ ][ ][ ][3][ ][ ]
+      {
+        { iDist / 2, iDist }, { iDist, iDist / 2 }, { iDist / 2, -iDist }, { -iDist, iDist / 2 },
+        { -iDist / 2, iDist }, { iDist, -iDist / 2 }, { -iDist / 2, -iDist }, { -iDist, -iDist / 2 }
+      },
+
+      //hexagon (6 points)
+      //[ ][ ][5][ ][ ][ ][1][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[4][ ][ ][ ][o][ ][ ][ ][2]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][ ][ ][ ][ ][ ][ ][ ]
+      //[ ][ ][6][ ][ ][ ][3][ ][ ]
+      {
+        { iDist / 2, iDist }, { iDist, 0 }, { iDist / 2, -iDist }, { -iDist, 0 },
+        { -iDist / 2, iDist }, { -iDist / 2, -iDist }, { 0, 0 }, { 0, 0 } // point 5 is the upper-left corner; last two entries are padding
+      }
+
+  };
+
+  // Set the number of points to be checked. With iDist == 1 the halved offsets are 0, so only the first 4 points are distinct for types 0, 2 and 3.
+  if (iDist == 1)
+  {
+    switch (pattern_type)
+    {
+      case 0:
+        n_points = 4;
+        break;
+      case 2:
+        n_points = 4;
+        break;
+      case 3:
+        n_points = 4;
+        break;
+      default:
+        n_points = 8;
+        break;
+    };
+  }
+  else
+  {
+    switch (pattern_type)
+    {
+      case 3:
+        n_points = 6;
+        break;
+      default:
+        n_points = 8;
+        break;
+    };
+  }
+
+  // Compute SAD plus motion vector cost for all chosen points
+  for (i = 0; i < n_points; i++)
+  {
+    vector2d_t *current = &pattern[pattern_type][i];
+    unsigned cost;
+    uint32_t bitcost;
+
+    {
+      PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+      cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
+                            block_width, block_width, max_lcu_below);
+      cost += calc_mvd_cost(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+
+      PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
+        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
+        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x + block_width,
+        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
+        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y + block_width);
+    }
+
+    if (cost < best_cost)
+    {
+      best_cost = cost;
+      *best_bitcost = bitcost;
+      best_index = i;
+    }
+
+  }
+
+  if (best_index >= 0)
+  {
+    mv_best = pattern[pattern_type][best_index];
+    *best_dist = iDist; // report the radius at which the improvement was found
+  }
+
+  mv->x += mv_best.x;
+  mv->y += mv_best.y;
+
+  return best_cost;
+
+}
+
+/** \brief Test a grid of offsets spaced iRaster apart within +-iSearchRange of *mv and move *mv to the best one; returns the best cost. */
+unsigned tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
+                          const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
+                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
+                          int block_width, int iSearchRange, int iRaster, int max_lcu_below)
+{
+  int i; // vertical offset
+  int k; // horizontal offset
+
+  vector2d_t mv_best = { 0, 0 }; // stays zero, leaving *mv unchanged, when nothing beats best_cost
+  
+  //compute SAD values for every point in the iRaster downsampled version of the current search area
+  for (i = iSearchRange; i >= -iSearchRange; i -= iRaster) // offset 0 is only visited if iSearchRange is a multiple of iRaster
+  {
+    for (k = -iSearchRange; k <= iSearchRange; k += iRaster)
+    {
+      vector2d_t current = { k, i };
+      unsigned cost;
+      uint32_t bitcost;
+
+      {
+        PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+        cost = image_calc_sad(pic, ref, orig->x, orig->y,
+          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
+          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
+          block_width, block_width, max_lcu_below);
+        cost += calc_mvd_cost(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+
+        PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
+          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
+          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k + block_width,
+          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
+          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i + block_width);
+      }
+
+      if (cost < best_cost) // strict comparison: the first of several equally good points wins
+      {
+        best_cost = cost;
+        *best_bitcost = bitcost; // only written when a point improves on best_cost
+        mv_best = current;
+      }
+
+    }
+  }
+  
+  mv->x += mv_best.x; // *mv is not modified during the scan, only here
+  mv->y += mv_best.y;
+
+  return best_cost;
+
+}
+
+/** \brief TZ-style motion search: pick a start vector from mv_in_out and the merge candidates, then refine it with pattern and raster searches. */
+static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
+                          const kvz_picture *pic, const kvz_picture *ref,
+                          const vector2d_t *orig, vector2d_t *mv_in_out,
+                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                          int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+{
+  // Returns the best cost found and writes its bit cost to *bitcost_out; *mv_in_out is read with >> 2 and written back multiplied by 4.
+  //TZ parameters
+  const int iSearchRange = 96;  // search range for each stage
+  const int iRaster = 5;  // search distance limit and downsampling factor for step 3
+  const unsigned step2_type = 0;  // search patterns for steps 2 and 4
+  const unsigned step4_type = 0;
+  const bool bRasterRefinementEnable = true;  // enable step 4 mode 1
+  const bool bStarRefinementEnable = false;   // enable step 4 mode 2 (only one mode will be executed)
+
+  const int block_width = CU_WIDTH_FROM_DEPTH(depth);
+
+  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
+
+  unsigned best_cost = UINT32_MAX;
+  uint32_t best_bitcost = 0;
+  int iDist;
+  int best_dist = 0;
+  unsigned best_index = num_cand; // num_cand means "no merge candidate selected"
+  int max_lcu_below = -1;
+
+  if (state->encoder_control->owf) {
+    max_lcu_below = 1;
+  }
+
+  //step 1, compare (0,0) vector to predicted vectors
+
+  // Check whatever input vector we got, unless it's (0, 0) which will be checked later.
+  if (mv.x || mv.y)
+  {
+    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+
+    best_cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                                        block_width, block_width, max_lcu_below);
+    best_cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);
+
+    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
+  }
+
+  int i;
+  // Select starting point from among merge candidates. These should include
+  // both mv_cand vectors and (0, 0).
+  for (i = 0; i < num_cand; ++i)
+  {
+    if (merge_cand[i].dir == 3) continue;
+    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
+    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
+
+    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+
+    uint32_t bitcost;
+    unsigned cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                                   block_width, block_width, max_lcu_below);
+    cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+
+    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
+
+    if (cost < best_cost) {
+      best_cost = cost;
+      best_index = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  if (best_index < (unsigned)num_cand) {
+    mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2;
+    mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2;
+  } else {
+    mv.x = mv_in_out->x >> 2; // no merge candidate was better: go back to the input vector
+    mv.y = mv_in_out->y >> 2;
+  }
+
+  //step 2, grid search
+  for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
+  {
+    best_cost = tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
+                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
+  }
+
+  //step 3, raster scan
+  if (best_dist > iRaster)
+  {
+    best_dist = iRaster;
+
+    best_cost = tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand,
+                                 num_cand, ref_idx, &best_bitcost, block_width, iSearchRange, iRaster, max_lcu_below);
+  }
+
+  //step 4
+
+  //raster refinement
+  if (bRasterRefinementEnable && best_dist > 0)
+  {
+    iDist = best_dist >> 1;
+    while (iDist > 0)
+    {
+      best_cost = tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
+                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
+
+      iDist = iDist >> 1;
+    }
+  }
+
+  //star refinement (repeat step 2 for the current starting point)
+  if (bStarRefinementEnable && best_dist > 0)
+  {
+    for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
+    {
+      best_cost = tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
+                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, block_width, max_lcu_below);
+    }
+  }
+
+  mv.x *= 4; // multiply instead of << 2: left-shifting a negative component is undefined
+  mv.y *= 4;
+
+  *mv_in_out = mv;
+  *bitcost_out = best_bitcost;
+
+  return best_cost;
+}
+
+
+/**
+ * \brief Do integer-pel motion search using the HEXBS algorithm.
+ *
+ * \param state        Encoder state; supplies tile offsets and the owf setting.
+ * \param depth        CU depth; block width is CU_WIDTH_FROM_DEPTH(depth).
+ * \param pic          Picture motion vector is searched for.
+ * \param ref          Picture motion vector is searched from.
+ * \param orig         Top left corner of the searched for block.
+ * \param mv_in_out    Predicted mv in and best out. Quarter pixel precision.
+ * \param mv_cand      Motion vector predictor candidates, passed to calc_mvd_cost.
+ * \param merge_cand   Merge candidates; those with dir != 3 are tried as start points.
+ * \param num_cand     Number of entries in merge_cand.
+ * \param ref_idx      Reference picture index, passed to calc_mvd_cost.
+ * \param bitcost_out  Receives the bit cost reported for the best position.
+ * \returns  Cost (SAD + calc_mvd_cost) of the best motion vector found.
+ *
+ * The start point is the cheapest of mv_in_out and the merge candidates. The
+ * large hexagon is then walked until its center is the best match, and the
+ * small pattern finally checks four positions around that center.
+ */
+static unsigned hexagon_search(const encoder_state_t * const state, unsigned depth,
+                               const kvz_picture *pic, const kvz_picture *ref,
+                               const vector2d_t *orig, vector2d_t *mv_in_out,
+                               int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                               int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+{
+  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; // Quarter-pel input reduced to integer-pel.
+  int block_width = CU_WIDTH_FROM_DEPTH(depth);
+  unsigned best_cost = UINT32_MAX;
+  uint32_t best_bitcost = 0, bitcost;
+  unsigned i;
+  unsigned best_index = 0; // Index of merge_cand at first, then large_hexbs and finally small_hexbs.
+  int max_lcu_below = -1; // Passed to image_calc_sad; stays -1 unless owf is enabled.
+  
+  if (state->encoder_control->owf) {
+    max_lcu_below = 1;
+  }
+
+  // Check mv_in, if it's not in merge candidates.
+  bool mv_in_merge_cand = false;
+  for (int i = 0; i < num_cand; ++i) { // This i shadows the function-level unsigned i.
+    if (merge_cand[i].dir == 3) continue; // Candidates using both lists are not compared.
+    if (merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2 == mv.x &&
+        merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2 == mv.y) {
+      mv_in_merge_cand = true;
+      break;
+    }
+  }
+
+  if (!mv_in_merge_cand) {
+    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+
+    best_cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                                        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                                        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                                        block_width, block_width, max_lcu_below);
+    best_cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+    best_bitcost = bitcost;
+    best_index = num_cand; // Sentinel: the best start so far is mv_in, not a merge candidate.
+
+    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
+  }
+
+  // Select starting point from among merge candidates. These should include
+  // both mv_cand vectors and (0, 0).
+  for (i = 0; i < num_cand; ++i) {
+    if (merge_cand[i].dir == 3) continue;
+    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
+    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;
+
+    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+
+    unsigned cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                                   block_width, block_width, max_lcu_below);
+    cost += calc_mvd_cost(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+
+    PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
+                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + block_width,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
+                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + block_width);
+
+    if (cost < best_cost) {
+      best_cost = cost;
+      best_index = i;
+      best_bitcost = bitcost;
+    }
+  }
+  if (best_index < num_cand) { // A merge candidate won: restart from its vector.
+    mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2;
+    mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2;
+  } else {
+    mv.x = mv_in_out->x >> 2;
+    mv.y = mv_in_out->y >> 2;
+  }
+  
+  // Search the initial 7 points of the hexagon.
+  best_index = 0; // From here on best_index refers to large_hexbs.
+  for (i = 0; i < 7; ++i) {
+    const vector2d_t *pattern = &large_hexbs[i];
+    unsigned cost;
+    {
+      PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+      cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
+                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
+                             block_width, block_width, max_lcu_below);
+      cost += calc_mvd_cost(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+
+      PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, 
+                              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
+                              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x + block_width, 
+                              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, 
+                              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y + block_width);
+    }
+
+    if (cost < best_cost) {
+      best_cost    = cost;
+      best_index   = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  // Iteratively search the 3 new points around the best match, until the best
+  // match is in the center.
+  while (best_index != 0) {
+    unsigned start; // Starting point of the 3 offsets to be searched.
+    if (best_index == 1) {
+      start = 6; // Search offsets 6, 7 (same as 1) and 8 (same as 2).
+    } else if (best_index == 8) {
+      start = 1; // 8 is the repeated point 2: search offsets 1, 2 and 3.
+    } else {
+      start = best_index - 1;
+    }
+
+    // Move the center to the best match.
+    mv.x += large_hexbs[best_index].x;
+    mv.y += large_hexbs[best_index].y;
+    best_index = 0;
+
+    // Iterate through the next 3 points.
+    for (i = 0; i < 3; ++i) {
+      const vector2d_t *offset = &large_hexbs[start + i];
+      unsigned cost;
+      {
+        PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+        cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                               (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
+                               (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
+                               block_width, block_width, max_lcu_below);
+        cost += calc_mvd_cost(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+        PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, 
+              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
+              (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x + block_width, 
+              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, 
+              (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y + block_width);
+      }
+
+      if (cost < best_cost) {
+        best_cost    = cost;
+        best_index   = start + i;
+        best_bitcost = bitcost;
+      }
+      ++offset; // NOTE(review): has no effect, offset is re-initialized on every iteration.
+    }
+  }
+
+  // Move the center to the best match.
+  mv.x += large_hexbs[best_index].x; // best_index is 0 here, so this adds large_hexbs[0].
+  mv.y += large_hexbs[best_index].y;
+  best_index = 0; // From here on an index of small_hexbs; 0 is its center.
+
+  // Do the final step of the search with a small pattern.
+  for (i = 1; i < 5; ++i) {
+    const vector2d_t *offset = &small_hexbs[i];
+    unsigned cost;
+    {
+      PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);
+      cost = image_calc_sad(pic, ref, orig->x, orig->y,
+                             (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
+                             (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
+                             block_width, block_width, max_lcu_below);
+      cost += calc_mvd_cost(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+      PERFORMANCE_MEASURE_END(_DEBUG_PERF_SEARCH_PIXELS, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + block_width, orig->y, orig->y + block_width, 
+            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, 
+            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x + block_width, 
+            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, 
+            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y + block_width);
+    }
+
+    if (cost > 0 && cost < best_cost) { // NOTE(review): unlike the earlier steps a zero cost is rejected here - confirm intent.
+      best_cost    = cost;
+      best_index   = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  // Adjust the movement vector according to the final best match.
+  mv.x += small_hexbs[best_index].x;
+  mv.y += small_hexbs[best_index].y;
+
+  // Return final movement vector in quarter-pixel precision.
+  mv_in_out->x = mv.x << 2;
+  mv_in_out->y = mv.y << 2;
+
+  *bitcost_out = best_bitcost;
+
+  return best_cost;
+}
+
+
+#if SEARCH_MV_FULL_RADIUS // Exhaustive search; only compiled when the radius is non-zero.
+static unsigned search_mv_full(unsigned depth, // NOTE(review): uses picture, vector2d and calc_sad, names the other searches in this file do not use - looks stale, confirm it still builds.
+                               const picture *pic, const picture *ref,
+                               const vector2d *orig, vector2d *mv_in_out,
+                               int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+                               int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+{
+  vector2d mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; // Quarter-pel input reduced to integer-pel.
+  int block_width = CU_WIDTH_FROM_DEPTH(depth);
+  unsigned best_cost = UINT32_MAX;
+  int x, y;
+  uint32_t best_bitcost = 0, bitcost;
+  vector2d min_mv, max_mv;
+
+  /*if (abs(mv.x) > SEARCH_MV_FULL_RADIUS || abs(mv.y) > SEARCH_MV_FULL_RADIUS) {
+    best_cost = calc_sad(pic, ref, orig->x, orig->y,
+                         orig->x, orig->y,
+                         block_width, block_width);
+    mv.x = 0;
+    mv.y = 0;
+  }*/
+
+  min_mv.x = mv.x - SEARCH_MV_FULL_RADIUS; // Search window is centered on the incoming mv.
+  min_mv.y = mv.y - SEARCH_MV_FULL_RADIUS;
+  max_mv.x = mv.x + SEARCH_MV_FULL_RADIUS;
+  max_mv.y = mv.y + SEARCH_MV_FULL_RADIUS;
+
+  for (y = min_mv.y; y < max_mv.y; ++y) { // Upper bounds are exclusive.
+    for (x = min_mv.x; x < max_mv.x; ++x) {
+      unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
+                               orig->x + x,
+                               orig->y + y,
+                               block_width, block_width);
+      cost += calc_mvd_cost(x, y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost); // NOTE(review): the other callers pass state and a shift as well.
+      if (cost < best_cost) {
+        best_cost    = cost;
+        best_bitcost = bitcost;
+        mv.x = x;
+        mv.y = y;
+      }
+    }
+  }
+
+  mv_in_out->x = mv.x << 2; // Back to quarter-pel precision.
+  mv_in_out->y = mv.y << 2;
+
+  *bitcost_out = best_bitcost;
+
+  return best_cost;
+}
+#endif
+
+
+/**
+ * \brief Do fractional motion estimation
+ *
+ * \param state        Encoder state; supplies tile offsets and encoder control.
+ * \param depth        log2 depth of the search
+ * \param pic          Picture motion vector is searched for.
+ * \param ref          Picture motion vector is searched from.
+ * \param orig         Top left corner of the searched for block.
+ * \param mv_in_out    Integer mv in (low 2 bits dropped), best out. Quarter pixel precision.
+ * \param mv_cand      Motion vector predictor candidates for calc_mvd_cost.
+ * \param merge_cand   Merge candidates for calc_mvd_cost.
+ * \param num_cand     Number of merge candidates.
+ * \param ref_idx      Reference picture index for calc_mvd_cost.
+ * \param bitcost_out  Receives the bit cost of the returned motion vector.
+ *
+ * \returns  Cost of the motion vector.
+ *
+ * Algorithm first searches 1/2-pel positions around integer mv and after best match is found,
+ * refines the search by searching best 1/4-pel position around best 1/2-pel position.
+ */
+static unsigned search_frac(const encoder_state_t * const state,
+                            unsigned depth,
+                            const kvz_picture *pic, const kvz_picture *ref,
+                            const vector2d_t *orig, vector2d_t *mv_in_out,
+                            int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                            int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+{
+  //Set mv to integer precision
+  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
+  int block_width = CU_WIDTH_FROM_DEPTH(depth);
+  unsigned best_cost = UINT32_MAX;
+  uint32_t best_bitcost = 0, bitcost;
+  unsigned i;
+  unsigned best_index = 0; // Index of square.
+  unsigned cost = 0;
+  cost_pixel_nxn_func *satd = pixels_get_satd_func(block_width);
+  vector2d_t halfpel_offset;
+
+  #define FILTER_SIZE 8
+  #define HALF_FILTER (FILTER_SIZE>>1)
+
+  //create buffer for block + extra for filter
+  int src_stride = block_width+FILTER_SIZE+1;
+  kvz_pixel src[(LCU_WIDTH+FILTER_SIZE+1) * (LCU_WIDTH+FILTER_SIZE+1)];
+  kvz_pixel* src_off = &src[HALF_FILTER+HALF_FILTER*(block_width+FILTER_SIZE+1)];
+
+  //destination buffer for interpolation
+  int dst_stride = (block_width+1)*4;
+  kvz_pixel dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16];
+  kvz_pixel* dst_off = &dst[dst_stride*4+4];
+
+  extend_borders(orig->x, orig->y, mv.x-1, mv.y-1,
+                state->tile->lcu_offset_x * LCU_WIDTH,
+                state->tile->lcu_offset_y * LCU_WIDTH,
+                ref->y, ref->width, ref->height, FILTER_SIZE, block_width+1, block_width+1, src);
+
+  filter_inter_quarterpel_luma(state->encoder_control, src_off, src_stride, block_width+1,
+      block_width+1, dst, dst_stride, 1, 1);
+
+  //Set mv to half-pixel precision
+  mv.x <<= 1;
+  mv.y <<= 1;
+
+  // Search halfpel positions around best integer mv
+  for (i = 0; i < 9; ++i) {
+    const vector2d_t *pattern = &square[i];
+
+    kvz_pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
+    kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
+
+    int y,x;
+    for(y = 0; y < block_width; ++y) {
+      int dst_y = y*4+pattern->y*2;
+      for(x = 0; x < block_width; ++x) {
+        int dst_x = x*4+pattern->x*2;
+        tmp_filtered[y*block_width+x] = dst_off[dst_y*dst_stride+dst_x];
+        tmp_pic[y*block_width+x] = pic->y[orig->x+x + (orig->y+y)*pic->width];
+      }
+    }
+
+    cost = satd(tmp_pic,tmp_filtered);
+    cost += calc_mvd_cost(state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+
+    if (cost < best_cost) {
+      best_cost    = cost;
+      best_index   = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  //Set mv to best match
+  mv.x += square[best_index].x;
+  mv.y += square[best_index].y;
+
+  halfpel_offset.x = square[best_index].x*2;
+  halfpel_offset.y = square[best_index].y*2;
+
+  //Set mv to quarterpel precision
+  mv.x <<= 1;
+  mv.y <<= 1;
+
+  // Restart from the center of square (index 4 is {0, 0}). Otherwise the halfpel
+  // index would be added again below whenever no quarterpel position is cheaper.
+  best_index = 4;
+
+  //Search quarterpel points around best halfpel mv
+  for (i = 0; i < 9; ++i) {
+    const vector2d_t *pattern = &square[i];
+
+    kvz_pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
+    kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
+
+    int y,x;
+    for(y = 0; y < block_width; ++y) {
+      int dst_y = y*4+halfpel_offset.y+pattern->y;
+      for(x = 0; x < block_width; ++x) {
+        int dst_x = x*4+halfpel_offset.x+pattern->x;
+        tmp_filtered[y*block_width+x] = dst_off[dst_y*dst_stride+dst_x];
+        tmp_pic[y*block_width+x] = pic->y[orig->x+x + (orig->y+y)*pic->width];
+      }
+    }
+
+    cost = satd(tmp_pic,tmp_filtered);
+    cost += calc_mvd_cost(state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
+
+    if (cost < best_cost) {
+      best_cost    = cost;
+      best_index   = i;
+      best_bitcost = bitcost;
+    }
+  }
+
+  //Set mv to final best match
+  mv.x += square[best_index].x;
+  mv.y += square[best_index].y;
+  mv_in_out->x = mv.x;
+  mv_in_out->y = mv.y;
+  *bitcost_out = best_bitcost;
+
+  return best_cost;
+}
+
+
+/**
+ * \brief Search the best inter mode for a CU and store it in lcu.
+ *
+ * Motion search is run against every reference picture. For B-slices with
+ * bipred enabled, pairs of merge candidates are also tried as bi-prediction.
+ *
+ * \param state  Encoder state.
+ * \param x      x coordinate of the CU in pixels.
+ * \param y      y coordinate of the CU in pixels.
+ * \param depth  CU depth; the block is LCU_WIDTH >> depth pixels wide.
+ * \param lcu    LCU whose cu array entry for this CU is updated.
+ * \return Cost of best mode.
+ */
+int search_cu_inter(const encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu)
+{
+  const videoframe_t * const frame = state->tile->frame;
+  uint32_t ref_idx = 0;
+  int x_local = (x&0x3f), y_local = (y&0x3f);
+  int x_cu = x>>3;
+  int y_cu = y>>3;
+  int cu_pos = LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH;
+
+  cu_info_t *cur_cu = &lcu->cu[cu_pos];
+
+  int16_t mv_cand[2][2];
+  // Search for merge mode candidate
+  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS];
+  // Get list of candidates
+  int16_t num_cand = inter_get_merge_cand(state, x, y, depth, merge_cand, lcu);
+
+  int max_lcu_below = -1; // Stays -1 unless owf is enabled.
+  if (state->encoder_control->owf) {
+    max_lcu_below = 1;
+  }
+
+  // Default to candidate 0
+  cur_cu->inter.mv_cand[0] = 0;
+  cur_cu->inter.mv_cand[1] = 0;
+
+  cur_cu->inter.cost = UINT_MAX;
+
+  for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
+    kvz_picture *ref_image = state->global->ref->images[ref_idx];
+    uint32_t temp_bitcost = 0;
+    uint32_t temp_cost = 0;
+    vector2d_t orig, mvd;
+    int32_t merged = 0;
+    uint8_t cu_mv_cand = 0;
+    int8_t merge_idx = 0;
+    int8_t ref_list = state->global->refmap[ref_idx].list-1;
+    int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
+    orig.x = x_cu * CU_MIN_SIZE_PIXELS;
+    orig.y = y_cu * CU_MIN_SIZE_PIXELS;
+    // Get MV candidates
+    cur_cu->inter.mv_ref[ref_list] = ref_idx; // Set only for the call below, restored right after.
+    inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, ref_list);
+    cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
+
+    vector2d_t mv = { 0, 0 };
+    {
+      // Take starting point for MV search from previous frame.
+      // When temporal motion vector candidates are added, there is probably
+      // no point to this anymore, but for now it helps.
+      int mid_x_cu = (x + (LCU_WIDTH >> (depth+1))) / 8;
+      int mid_y_cu = (y + (LCU_WIDTH >> (depth+1))) / 8;
+      cu_info_t *ref_cu = &state->global->ref->cu_arrays[ref_idx]->data[mid_x_cu + mid_y_cu * (frame->width_in_lcu << MAX_DEPTH)];
+      if (ref_cu->type == CU_INTER) {
+        if (ref_cu->inter.mv_dir & 1) {
+          mv.x = ref_cu->inter.mv[0][0];
+          mv.y = ref_cu->inter.mv[0][1];
+        } else {
+          mv.x = ref_cu->inter.mv[1][0];
+          mv.y = ref_cu->inter.mv[1][1];
+        }
+      }
+    }
+
+#if SEARCH_MV_FULL_RADIUS // NOTE(review): this branch passes frame and ref_pic, unlike the calls below; it looks stale.
+    temp_cost += search_mv_full(depth, frame, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+#else
+    switch (state->encoder_control->cfg->ime_algorithm) {
+      case KVZ_IME_TZ:
+        temp_cost += tz_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+        break;
+
+      default:
+        temp_cost += hexagon_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+        break;
+      }
+#endif
+    if (state->encoder_control->cfg->fme_level > 0) {
+      temp_cost = search_frac(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+    }
+
+    merged = 0;
+    // Check every candidate to find a match
+    for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
+      if (merge_cand[merge_idx].dir != 3 &&
+          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x &&
+          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y &&
+          (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
+        merged = 1;
+        break;
+      }
+    }
+
+    // Only check when candidates are different
+    if (!merged && (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
+      vector2d_t mvd_temp1, mvd_temp2;
+      int cand1_cost,cand2_cost;
+      mvd_temp1.x = mv.x - mv_cand[0][0];
+      mvd_temp1.y = mv.y - mv_cand[0][1];
+      cand1_cost = get_mvd_coding_cost(&mvd_temp1);
+      mvd_temp2.x = mv.x - mv_cand[1][0];
+      mvd_temp2.y = mv.y - mv_cand[1][1];
+      cand2_cost = get_mvd_coding_cost(&mvd_temp2);
+
+      // Select candidate 1 if it has lower cost
+      if (cand2_cost < cand1_cost) {
+        cu_mv_cand = 1;
+      }
+    }
+    mvd.x = mv.x - mv_cand[cu_mv_cand][0];
+    mvd.y = mv.y - mv_cand[cu_mv_cand][1];
+
+    if(temp_cost < cur_cu->inter.cost) {
+      // Map reference index to L0/L1 pictures
+      cur_cu->inter.mv_dir = ref_list+1;
+      cur_cu->inter.mv_ref_coded[ref_list] = state->global->refmap[ref_idx].idx;
+
+      cur_cu->merged        = merged;
+      cur_cu->merge_idx     = merge_idx;
+      cur_cu->inter.mv_ref[ref_list] = ref_idx;
+      cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
+      cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
+      cur_cu->inter.mvd[ref_list][0] = (int16_t)mvd.x;
+      cur_cu->inter.mvd[ref_list][1] = (int16_t)mvd.y;
+      cur_cu->inter.cost    = temp_cost;
+      cur_cu->inter.bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[ref_list];
+      cur_cu->inter.mv_cand[ref_list] = cu_mv_cand;
+    }
+  }
+
+  // Search bi-pred positions
+  if (state->global->slicetype == SLICE_B && state->encoder_control->cfg->bipred) {
+    lcu_t *templcu = MALLOC(lcu_t, 1);
+    cost_pixel_nxn_func *satd = pixels_get_satd_func(LCU_WIDTH >> depth);
+    #define NUM_PRIORITY_LIST 12 // Number of entries in priorityList0 and priorityList1.
+    static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
+    static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
+    uint8_t cutoff = templcu ? num_cand : 0; // Nothing is combined if the allocation failed.
+    for (int32_t idx = 0; idx < cutoff*(cutoff - 1) && idx < NUM_PRIORITY_LIST; idx++) { // Never read past the tables.
+      uint8_t i = priorityList0[idx];
+      uint8_t j = priorityList1[idx];
+      if (i >= num_cand || j >= num_cand) break;
+
+      // Find one L0 and L1 candidate according to the priority list
+      if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) {
+        if (merge_cand[i].ref[0] != merge_cand[j].ref[1] ||
+          merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] ||
+          merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) {
+          uint32_t bitcost[2];
+          uint32_t cost = 0;
+          int8_t cu_mv_cand = 0;
+          int16_t mv[2][2];
+          kvz_pixel tmp_block[64 * 64];
+          kvz_pixel tmp_pic[64 * 64];
+          // Force L0 and L1 references
+          if (state->global->refmap[merge_cand[i].ref[0]].list == 2 || state->global->refmap[merge_cand[j].ref[1]].list == 1) continue;
+
+          // TODO: enable fractional pixel bipred search
+          mv[0][0] = merge_cand[i].mv[0][0] & 0xfff8;
+          mv[0][1] = merge_cand[i].mv[0][1] & 0xfff8;
+          mv[1][0] = merge_cand[j].mv[1][0] & 0xfff8;
+          mv[1][1] = merge_cand[j].mv[1][1] & 0xfff8;
+
+          // Check boundaries when using owf to process multiple frames at the same time
+          if (max_lcu_below >= 0) {
+            // When SAO is off, row is considered reconstructed when the last LCU
+            // is done, although the bottom 2 pixels might still need deblocking.
+            // To work around this, add 2 luma pixels to the reach of the mv
+            // in order to avoid referencing those possibly non-deblocked pixels.
+            int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
+            int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
+            int cur_lcu_row = y / LCU_WIDTH;
+            if (mv_lcu_row_reach_1 > cur_lcu_row + max_lcu_below || mv_lcu_row_reach_2 > cur_lcu_row + max_lcu_below) {
+              continue;
+            }
+          }
+
+          inter_recon_lcu_bipred(state, state->global->ref->images[merge_cand[i].ref[0]], state->global->ref->images[merge_cand[j].ref[1]], x, y, LCU_WIDTH >> depth, mv, templcu);
+
+          for (int ypos = 0; ypos < LCU_WIDTH >> depth; ++ypos) {
+            int dst_y = ypos*(LCU_WIDTH >> depth);
+            for (int xpos = 0; xpos < (LCU_WIDTH >> depth); ++xpos) {
+              tmp_block[dst_y + xpos] = templcu->rec.y[((y + ypos)&(LCU_WIDTH - 1))*LCU_WIDTH + ((x + xpos)&(LCU_WIDTH - 1))];
+              tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width];
+            }
+          }
+
+          cost = satd(tmp_pic, tmp_block);
+          // NOTE(review): mv_cand and ref_idx below hold what the reference loop above left behind; confirm.
+          // TODO: enable fractional pixel bipred search
+          cost += calc_mvd_cost(state, merge_cand[i].mv[0][0] & 0xfff8, merge_cand[i].mv[0][1] & 0xfff8, 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[0]);
+          cost += calc_mvd_cost(state, merge_cand[j].mv[1][0] & 0xfff8, merge_cand[j].mv[1][1] & 0xfff8, 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[1]); // L1 vector comes from candidate j.
+
+          if (cost < cur_cu->inter.cost) {
+            cur_cu->inter.mv_dir = 3;
+            cur_cu->inter.mv_ref_coded[0] = state->global->refmap[merge_cand[i].ref[0]].idx;
+            cur_cu->inter.mv_ref_coded[1] = state->global->refmap[merge_cand[j].ref[1]].idx;
+            cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
+            cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
+
+            // TODO: enable fractional pixel bipred search
+            cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0] & 0xfff8;
+            cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1] & 0xfff8;
+            cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0] & 0xfff8;
+            cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1] & 0xfff8;
+            cur_cu->merged = 0;
+            // Check every candidate to find a match
+            for(int merge_idx = 0; merge_idx < num_cand; merge_idx++) {
+              if (
+                  merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
+                  merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
+                  merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
+                  merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
+                  merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
+                  merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) {
+                cur_cu->merged = 1;
+                cur_cu->merge_idx = merge_idx;
+                break;
+              }
+            }
+
+            // Each motion vector has its own candidate
+            for (int reflist = 0; reflist < 2; reflist++) {
+              cu_mv_cand = 0;
+              inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, reflist);
+              if ((mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
+                vector2d_t mvd_temp1, mvd_temp2;
+                int cand1_cost, cand2_cost;
+
+                mvd_temp1.x = cur_cu->inter.mv[reflist][0] - mv_cand[0][0];
+                mvd_temp1.y = cur_cu->inter.mv[reflist][1] - mv_cand[0][1];
+                cand1_cost = get_mvd_coding_cost(&mvd_temp1);
+
+                mvd_temp2.x = cur_cu->inter.mv[reflist][0] - mv_cand[1][0];
+                mvd_temp2.y = cur_cu->inter.mv[reflist][1] - mv_cand[1][1];
+                cand2_cost = get_mvd_coding_cost(&mvd_temp2);
+
+                // Select candidate 1 if it has lower cost
+                if (cand2_cost < cand1_cost) {
+                  cu_mv_cand = 1;
+                }
+              }
+              cur_cu->inter.mvd[reflist][0] = cur_cu->inter.mv[reflist][0] - mv_cand[cu_mv_cand][0];
+              cur_cu->inter.mvd[reflist][1] = cur_cu->inter.mv[reflist][1] - mv_cand[cu_mv_cand][1];
+              cur_cu->inter.mv_cand[reflist] = cu_mv_cand;
+            }
+            cur_cu->inter.cost = cost;
+            cur_cu->inter.bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[0] + cur_cu->inter.mv_ref_coded[1];
+          }
+        }
+      }
+    }
+    FREE_POINTER(templcu);
+  }
+
+  return cur_cu->inter.cost;
+}
diff --git a/src/search_inter.h b/src/search_inter.h
index 8483139e..2115ccd0 100644
--- a/src/search_inter.h
+++ b/src/search_inter.h
@@ -27,4 +27,8 @@
 
 #include "global.h"
 
+#include "encoderstate.h"
+
+int search_cu_inter(const encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu);
+
 #endif // SEARCH_INTER_H_