diff --git a/README.md b/README.md
index cd96e124..a481d70d 100644
--- a/README.md
+++ b/README.md
@@ -150,11 +150,20 @@ Video structure:
                                    - frametile: Constrain within the tile.
                                    - frametilemargin: Constrain even more.
       --roi <filename>       : Use a delta QP map for region of interest.
-                               Reads an array of delta QP values from a text
-                               file. The file format is: width and height of
-                               the QP delta map followed by width*height delta
-                               QP values in raster order. The map can be of any
-                               size and will be scaled to the video size.
+                               Reads an array of delta QP values from a file.
+                               Text and binary files are supported and detected
+                               from the file extension (.txt/.bin). If a known
+                               extension is not found, the file is treated as
+                               a text file. The file can include one or many
+                               ROI frames each in the following format:
+                               width and height of the QP delta map followed
+                               by width * height delta QP values in raster
+                               order. In binary format, width and height are
+                               32-bit integers whereas the delta QP values are
+                               signed 8-bit values. The map can be of any size
+                               and will be scaled to the video size. The file
+                               reading will loop if end of the file is reached.
+                               See roi.txt in the examples folder.
       --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.
                                in PPS and slice_qp_delta in slize header zero.
       --(no-)erp-aqp         : Use adaptive QP for 360 degree video with
diff --git a/configure.ac b/configure.ac
index 3a0d1582..08a35042 100644
--- a/configure.ac
+++ b/configure.ac
@@ -22,8 +22,8 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #   - Increment when making new releases and major or minor was not changed since last release.
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
-ver_major=6
-ver_minor=7
+ver_major=7
+ver_minor=2
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS
diff --git a/doc/kvazaar.1 b/doc/kvazaar.1
index f6f7821a..c3f80f6b 100644
--- a/doc/kvazaar.1
+++ b/doc/kvazaar.1
@@ -164,11 +164,20 @@ Constrain movement vectors. [none]
 .TP
 \fB\-\-roi <filename>      
 Use a delta QP map for region of interest.
-Reads an array of delta QP values from a text
-file. The file format is: width and height of
-the QP delta map followed by width*height delta
-QP values in raster order. The map can be of any
-size and will be scaled to the video size.
+Reads an array of delta QP values from a file.
+Text and binary files are supported and detected
+from the file extension (.txt/.bin). If a known
+extension is not found, the file is treated as
+a text file. The file can include one or many
+ROI frames each in the following format:
+width and height of the QP delta map followed
+by width * height delta QP values in raster
+order. In binary format, width and height are
+32\-bit integers whereas the delta QP values are
+signed 8\-bit values. The map can be of any size
+and will be scaled to the video size. The file
+reading will loop if end of the file is reached.
+See roi.txt in the examples folder.
 .TP
 \fB\-\-set\-qp\-in\-cu        
 Set QP at CU level keeping pic_init_qp_minus26.
diff --git a/src/bitstream.c b/src/bitstream.c
index c524e6e2..3ba866eb 100644
--- a/src/bitstream.c
+++ b/src/bitstream.c
@@ -33,6 +33,7 @@
 #include "bitstream.h"
 
 #include <math.h>
+#include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 
diff --git a/src/cabac.c b/src/cabac.c
index 26ff0e34..a35358ae 100644
--- a/src/cabac.c
+++ b/src/cabac.c
@@ -70,6 +70,7 @@ void kvz_cabac_start(cabac_data_t * const data)
   data->num_buffered_bytes = 0;
   data->buffered_byte = 0xff;
   data->only_count = 0; // By default, write bits out
+  data->update = 0; 
 }
 
 /**
@@ -349,26 +350,28 @@ void kvz_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t rem
 /**
  * \brief
  */
-void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol)
+void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, 
+  cabac_ctx_t * const ctx, 
+  uint32_t symbol,
+  const int32_t offset,
+  const uint32_t max_symbol, 
+  double* bits_out)
 {
   int8_t code_last = max_symbol > symbol;
 
   assert(symbol <= max_symbol);
 
   if (!max_symbol) return;
-
-  data->cur_ctx = ctx;
-  CABAC_BIN(data, symbol, "ums");
+  
+  CABAC_FBITS_UPDATE(data, ctx, symbol, *bits_out, "ums");
 
   if (!symbol) return;
 
   while (--symbol) {
-    //data->cur_ctx = &ctx[offset];
-    CABAC_BIN(data, 1, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums");
   }
   if (code_last) {
-    //data->cur_ctx = &ctx[offset];
-    CABAC_BIN(data, 0, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 0,*bits_out, "ums");
   }
 }
 
@@ -405,7 +408,7 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int
 /**
  * \brief
  */
-void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
+uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
                                   cabac_data_t * const data,
                                   uint32_t symbol,
                                   uint32_t count)
@@ -426,4 +429,5 @@ void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
   num_bins += count;
 
   CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb");
+  return num_bins;
 }
diff --git a/src/cabac.h b/src/cabac.h
index 8489333c..92c2d6b8 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -59,7 +59,8 @@ typedef struct
   uint32_t   buffered_byte;
   int32_t    num_buffered_bytes;
   int32_t    bits_left;
-  int8_t     only_count;
+  int8_t     only_count : 4;
+  int8_t     update : 4;
   bitstream_t *stream;
 
   // CONTEXTS
@@ -140,11 +141,11 @@ void kvz_cabac_write(cabac_data_t *data);
 void kvz_cabac_finish(cabac_data_t *data);
 void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
                               uint32_t r_param, const unsigned int cutoff);
-void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
+uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
                 uint32_t symbol, uint32_t count);
 void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
-                                  uint32_t symbol, int32_t offset,
-                                  uint32_t max_symbol);
+                                      uint32_t symbol, int32_t offset,
+                                      uint32_t max_symbol, double* bits_out);
 void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol);
 
 #define CTX_PROB_BITS 15
@@ -153,6 +154,18 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol
 #define CTX_MASK_0 (~(~0u << CTX_PROB_BITS_0) << (CTX_PROB_BITS - CTX_PROB_BITS_0))
 #define CTX_MASK_1 (~(~0u << CTX_PROB_BITS_1) << (CTX_PROB_BITS - CTX_PROB_BITS_1))
 
+// Floating point fractional bits, derived from kvz_entropy_bits
+extern const float kvz_f_entropy_bits[512];
+#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
+
+#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \
+  if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \
+  if((cabac)->update) {\
+    (cabac)->cur_ctx = ctx;\
+    CABAC_BIN((cabac), (val), (name));\
+  } \
+} while(0)
+
 // Macros
 #define CTX_GET_STATE(ctx) ( (ctx)->state[0]+(ctx)->state[1] )
 #define CTX_STATE(ctx) ( CTX_GET_STATE(ctx)>>8 )
diff --git a/src/cfg.c b/src/cfg.c
index 2be8c8c6..8b74e8d5 100644
--- a/src/cfg.c
+++ b/src/cfg.c
@@ -149,9 +149,9 @@ int kvz_config_init(kvz_config *cfg)
   cfg->gop_lp_definition.t = 1;
   cfg->open_gop = true;
 
-  cfg->roi.width = 0;
-  cfg->roi.height = 0;
-  cfg->roi.dqps = NULL;
+  cfg->roi.file_path = NULL;
+  cfg->roi.format = KVZ_ROI_TXT;
+
   cfg->set_qp_in_cu = false;
 
   cfg->erp_aqp = false;
@@ -214,6 +214,9 @@ int kvz_config_init(kvz_config *cfg)
 
   cfg->cclm = 0;
 
+
+  cfg->combine_intra_cus = 1;
+  cfg->force_inter = 0;
   return 1;
 }
 
@@ -221,11 +224,11 @@ int kvz_config_destroy(kvz_config *cfg)
 {
   if (cfg) {
     FREE_POINTER(cfg->cqmfile);
+    FREE_POINTER(cfg->roi.file_path);
     FREE_POINTER(cfg->fast_coeff_table_fn);
     FREE_POINTER(cfg->tiles_width_split);
     FREE_POINTER(cfg->tiles_height_split);
     FREE_POINTER(cfg->slice_addresses_in_ts);
-    FREE_POINTER(cfg->roi.dqps);
     FREE_POINTER(cfg->fastrd_learning_outdir_fn);
   }
   free(cfg);
@@ -1295,60 +1298,29 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
   }
   else if OPT("implicit-rdpcm")
     cfg->implicit_rdpcm = (bool)atobool(value);
+
   else if OPT("roi") {
-    // The ROI description is as follows:
-    // First number is width, second number is height,
-    // then follows width * height number of dqp values.
-    FILE* f = fopen(value, "rb");
-    if (!f) {
-      fprintf(stderr, "Could not open ROI file.\n");
+    static enum kvz_roi_format const formats[] = { KVZ_ROI_TXT, KVZ_ROI_BIN };
+    static const char * const format_names[] = { "txt", "bin", NULL };
+
+    char *roi_file = strdup(value);
+    if (!roi_file) {
+      fprintf(stderr, "Failed to allocate memory for ROI file name.\n");
       return 0;
     }
+    FREE_POINTER(cfg->roi.file_path);
+    cfg->roi.file_path = roi_file;
 
-    int width = 0;
-    int height = 0;
-    if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) {
-      fprintf(stderr, "Failed to read ROI size.\n");
-      fclose(f);
-      return 0;
+    // Get file extension or the substring after the last dot
+    char *maybe_extension = strrchr(cfg->roi.file_path, '.');
+    if (!maybe_extension) {
+      cfg->roi.format = KVZ_ROI_TXT;
+    } else {
+      maybe_extension++;
+      int8_t format;
+      bool unknown_format = !parse_enum(maybe_extension, format_names, &format);
+      cfg->roi.format = unknown_format ? KVZ_ROI_TXT : formats[format];
     }
-
-    if (width <= 0 || height <= 0) {
-      fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height);
-      fclose(f);
-      return 0;
-    }
-
-    if (width > 10000 || height > 10000) {
-      fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
-      fclose(f);
-      return 0;
-    }
-
-    const unsigned size = width * height;
-    int8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps[0]));
-    if (!dqp_array) {
-      fprintf(stderr, "Failed to allocate memory for ROI table.\n");
-      fclose(f);
-      return 0;
-    }
-
-    FREE_POINTER(cfg->roi.dqps);
-    cfg->roi.dqps   = dqp_array;
-    cfg->roi.width  = width;
-    cfg->roi.height = height;
-
-    for (int i = 0; i < size; ++i) {
-      int number; // Need a pointer to int for fscanf
-      if (fscanf(f, "%d", &number) != 1) {
-        fprintf(stderr, "Reading ROI file failed.\n");
-        fclose(f);
-        return 0;
-      }
-      dqp_array[i] = CLIP(-51, 51, number);
-    }
-
-    fclose(f);
   }
   else if OPT("set-qp-in-cu") {
     cfg->set_qp_in_cu = (bool)atobool(value);
@@ -1502,6 +1474,12 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
   else if OPT("cclm") {
     cfg->cclm = (bool)atobool(value);
   }
+  else if OPT("combine-intra-cus") {
+    cfg->combine_intra_cus = atobool(value);
+  }
+  else if OPT("force-inter") {
+    cfg->force_inter = atobool(value);
+  }
   else {
     return 0;
   }
diff --git a/src/cli.c b/src/cli.c
index baa5a07a..a68a5ce0 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -145,6 +145,7 @@ static const struct option long_options[] = {
   { "force-level",        required_argument, NULL, 0 },
   { "high-tier",                no_argument, NULL, 0 },
   { "me-steps",           required_argument, NULL, 0 },
+  { "roi-file",           required_argument, NULL, 0 },
   { "fast-residual-cost", required_argument, NULL, 0 },
   { "set-qp-in-cu",             no_argument, NULL, 0 },
   { "open-gop",                 no_argument, NULL, 0 },
@@ -183,6 +184,10 @@ static const struct option long_options[] = {
   { "no-amvr",                  no_argument, NULL, 0 },
   { "cclm",                     no_argument, NULL, 0 },
   { "no-cclm",                  no_argument, NULL, 0 },
+  { "combine-intra-cus",        no_argument, NULL, 0 },
+  { "no-combine-intra-cus",     no_argument, NULL, 0 },
+  { "force-inter",              no_argument, NULL, 0 },
+  { "no-force-inter",           no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -504,11 +509,20 @@ void print_help(void)
     "                                   - frametile: Constrain within the tile.\n"
     "                                   - frametilemargin: Constrain even more.\n"
     "      --roi <filename>       : Use a delta QP map for region of interest.\n"
-    "                               Reads an array of delta QP values from a text\n"
-    "                               file. The file format is: width and height of\n"
-    "                               the QP delta map followed by width*height delta\n"
-    "                               QP values in raster order. The map can be of any\n"
-    "                               size and will be scaled to the video size.\n"
+    "                               Reads an array of delta QP values from a file.\n"
+    "                               Text and binary files are supported and detected\n"
+    "                               from the file extension (.txt/.bin). If a known\n"
+    "                               extension is not found, the file is treated as\n"
+    "                               a text file. The file can include one or many\n"
+    "                               ROI frames each in the following format:\n"
+    "                               width and height of the QP delta map followed\n"
+    "                               by width * height delta QP values in raster\n"
+    "                               order. In binary format, width and height are\n"
+    "                               32-bit integers whereas the delta QP values are\n"
+    "                               signed 8-bit values. The map can be of any size\n"
+    "                               and will be scaled to the video size. The file\n"
+    "                               reading will loop if end of the file is reached.\n"
+    "                               See roi.txt in the examples folder.\n"
     "      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.\n"
     "                               in PPS and slice_qp_delta in slize header zero.\n"
     "      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with\n"
@@ -594,6 +608,16 @@ void print_help(void)
     "      --ml-pu-depth-intra    : Predict the pu-depth-intra using machine\n"
     "                                learning trees, overrides the\n"
     "                                --pu-depth-intra parameter. [disabled]\n"
+    "      --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n"
+    "                                   on lower depth even when search is not\n"
+    "                                   performed on said depth. Should only\n"
+    "                                   be disabled if cus absolutely must not\n"
+    "                                   be larger than limited by the search.\n"
+    "                                   [enabled]\n"
+    "      --force-inter          : Force the encoder to use inter always.\n"
+    "                               This is mostly for debugging and is not\n"
+    "                               guaranteed to produce sensible bitstream or\n"
+    "                               work at all. [disabled]\n"
     "      --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n"
     "      --(no-)bipred          : Bi-prediction [disabled]\n"
     "      --cu-split-termination <string> : CU split search termination [zero]\n"
diff --git a/src/encmain.c b/src/encmain.c
index 0cdea6f7..1d9175fc 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -441,6 +441,7 @@ int main(int argc, char *argv[])
   FILE *input  = NULL; //!< input file (YUV)
   FILE *output = NULL; //!< output file (HEVC NAL stream)
   FILE *recout = NULL; //!< reconstructed YUV output, --debug
+  FILE *roifile = NULL;
   clock_t start_time = clock();
   clock_t encoding_start_cpu_time;
   KVZ_CLOCK_T encoding_start_real_time;
@@ -584,7 +585,7 @@ int main(int argc, char *argv[])
     // Give arguments via struct to the input thread
     input_handler_args in_args = {
       .available_input_slots = available_input_slots,
-      .filled_input_slots    = filled_input_slots,
+      .filled_input_slots = filled_input_slots,
 
       .input = input,
       .api = api,
@@ -825,6 +826,7 @@ done:
   if (input)  fclose(input);
   if (output) fclose(output);
   if (recout) fclose(recout);
+  if (roifile) fclose(roifile);
 
   DBG_YUVIEW_CLEANUP();
   CHECKPOINTS_FINALIZE();
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 21c368e0..a6adb249 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -581,7 +581,7 @@ static void encode_transform_coeff(encoder_state_t * const state,
 
       // cu_qp_delta_abs prefix
       cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0];
-      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5);
+      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL);
 
       if (qp_delta_abs >= 5) {
         // cu_qp_delta_abs suffix
@@ -610,17 +610,19 @@ static void encode_transform_coeff(encoder_state_t * const state,
  * \param depth           Depth from LCU.
  * \return if non-zero mvd is coded
  */
-static bool encode_inter_prediction_unit(encoder_state_t * const state,
-                                         cabac_data_t * const cabac,
-                                         const cu_info_t * const cur_cu,
-                                         int x, int y, int width, int height,
-                                         int depth)
+int kvz_encode_inter_prediction_unit(encoder_state_t * const state,
+                                      cabac_data_t * const cabac,
+                                      const cu_info_t * const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, lcu_t* lcu, double* bits_out)
 {
   // Mergeflag
   int16_t num_cand = 0;
   bool non_zero_mvd = false;
-  cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
-  CABAC_BIN(cabac, cur_cu->merged, "MergeFlag");
+  double bits = 0;
+
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag");
+
   num_cand = state->encoder_control->cfg.max_merge;
   if (cur_cu->merged) { //merge
     if (num_cand > 1) {
@@ -628,10 +630,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       for (ui = 0; ui < num_cand - 1; ui++) {
         int32_t symbol = (ui != cur_cu->merge_idx);
         if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
         } else {
           CABAC_BIN_EP(cabac,symbol,"MergeIndex");
+          if(cabac->only_count) bits += 1;
         }
         if (symbol == 0) break;
       }
@@ -650,12 +652,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4
         uint32_t inter_dir_ctx = (7 - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height) + 1) >> 1));
 
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[inter_dir_ctx]);
-        CABAC_BIN(cabac, (inter_dir == 3), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc");
       }
       if (inter_dir < 3) {
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[5]);
-        CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[5]), (inter_dir == 2), bits, "inter_pred_idc");
       }
    }
 
@@ -674,20 +674,21 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       if (ref_LX_size > 1) {
         // parseRefFrmIdx
         int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx];
-
-        cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-        CABAC_BIN(cabac, (ref_frame > 0), "ref_idx_lX");
+        
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");
 
         if (ref_frame > 0 && ref_LX_size > 2) {
           cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
-          CABAC_BIN(cabac, (ref_frame > 1), "ref_idx_lX");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), (ref_frame > 1), bits, "ref_idx_lX");
 
           if (ref_frame > 1 && ref_LX_size > 3) {
             for (int idx = 3; idx < ref_LX_size; idx++)
             {
               uint8_t val = (ref_frame > idx - 1) ? 1 : 0;
               CABAC_BIN_EP(cabac, val, "ref_idx_lX");
+              if (cabac->only_count) bits += 1;
               if (!val) break;
+
             }
           }
         }
@@ -697,28 +698,37 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) {
 
         mv_t mv_cand[2][2];
-        kvz_inter_get_mv_cand_cua(
+        if (lcu) {
+          kvz_inter_get_mv_cand(
+            state, 
+            x, y, width, height,
+            mv_cand, cur_cu, 
+            lcu, ref_list_idx);
+        }
+        else {
+          kvz_inter_get_mv_cand_cua(
             state,
             x, y, width, height,
-            mv_cand, cur_cu, ref_list_idx);
+            mv_cand, cur_cu, ref_list_idx
+          );
+        }
 
         uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx);
         mv_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0];
         mv_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1];
 
         kvz_change_precision(INTERNAL_MV_PREC, kvz_g_imv_to_prec[KVZ_IMV_OFF], &mvd_hor, &mvd_ver);
-
-        kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver);
+        kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out);
 
         non_zero_mvd |= (mvd_hor != 0) || (mvd_ver != 0);
       }
 
       // Signal which candidate MV to use
-      cabac->cur_ctx = &(cabac->ctx.mvp_idx_model);
-      CABAC_BIN(cabac, CU_GET_MV_CAND(cur_cu, ref_list_idx), "mvp_flag");
+      CABAC_FBITS_UPDATE(cabac,&(cabac->ctx.mvp_idx_model), CU_GET_MV_CAND(cur_cu, ref_list_idx), bits, "mvp_flag");
 
     } // for ref_list
   } // if !merge
+  if(bits_out) *bits_out += bits;
   return non_zero_mvd;
 }
 
@@ -807,7 +817,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
 static void encode_intra_coding_unit(encoder_state_t * const state,
                                      cabac_data_t * const cabac,
                                      const cu_info_t * const cur_cu,
-                                     int x, int y, int depth, lcu_coeff_t* coeff)
+                                     int x, int y, int depth, lcu_t* lcu, lcu_coeff_t* coeff, double* bits_out)
 {
   const videoframe_t * const frame = state->tile->frame;
   uint8_t intra_pred_mode_actual[4];
@@ -1050,6 +1060,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
         kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT);
       }
+      if (cabac->only_count && bits_out) *bits_out += 5;
     }
   }
 
@@ -1057,14 +1068,17 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) {
     encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
   }
+  // When only counting bits, the cost of the transform coefficients is computed
+  // separately so that the distortion can be obtained at the same time.
+  if (!cabac->only_count) {
+    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
 
-  encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
+    encode_mts_idx(state, cabac, cur_cu);
 
-  encode_mts_idx(state, cabac, cur_cu);
-
-  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
-    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
+    if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
+      encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
+      encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
+    }
   }
 
 }
@@ -1105,32 +1119,32 @@ static void encode_part_mode(encoder_state_t * const state,
   //  log2CbSize == MinCbLog2SizeY |  0  1  2  bypass
   //  log2CbSize >  MinCbLog2SizeY |  0  1  3  bypass
   // ------------------------------+------------------
-
+  double bits = 0;
   if (cur_cu->type == CU_INTRA) {
     if (depth == MAX_DEPTH) {
       cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
       if (cur_cu->part_size == SIZE_2Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
       } else {
-        CABAC_BIN(cabac, 0, "part_mode NxN");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN");
       }
     }
   } else {
 
     cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
     if (cur_cu->part_size == SIZE_2Nx2N) {
-      CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
-      return;
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
+      return bits;
     }
-    CABAC_BIN(cabac, 0, "part_mode split");
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split");
 
     cabac->cur_ctx = &(cabac->ctx.part_size_model[1]);
     if (cur_cu->part_size == SIZE_2NxN ||
         cur_cu->part_size == SIZE_2NxnU ||
         cur_cu->part_size == SIZE_2NxnD) {
-      CABAC_BIN(cabac, 1, "part_mode vertical");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical");
     } else {
-      CABAC_BIN(cabac, 0, "part_mode horizontal");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal");
     }
 
     if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) {
@@ -1138,19 +1152,22 @@ static void encode_part_mode(encoder_state_t * const state,
 
       if (cur_cu->part_size == SIZE_2NxN ||
           cur_cu->part_size == SIZE_Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode SMP");
-        return;
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP");
+        return bits;
       }
-      CABAC_BIN(cabac, 0, "part_mode AMP");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP");
 
       if (cur_cu->part_size == SIZE_2NxnU ||
           cur_cu->part_size == SIZE_nLx2N) {
         CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP");
+        if(cabac->only_count) bits += 1;
       } else {
         CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP");
+        if(cabac->only_count) bits += 1;
       }
     }
   }
+  return bits;
 }
 **/
 
@@ -1191,7 +1208,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state,
   bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
   bool border = border_x || border_y; /*!< are we in any border CU */
 
-  if (depth <= ctrl->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
     state->must_code_qp_delta = true;
   }
 
@@ -1456,7 +1473,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state,
       const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
       const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
 
-      non_zero_mvd |= encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth);
+      non_zero_mvd |= kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL);
       DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu);
       kvz_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu);
     }
@@ -1494,7 +1511,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state,
 
     }
   } else if (cur_cu->type == CU_INTRA) {
-    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, coeff);
+    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, coeff, NULL);
   }
 
   else {
@@ -1511,11 +1528,128 @@ end:
 
 }
 
+double kvz_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu) {
+  double bits = 0;
+  const encoder_control_t* const ctrl = state->encoder_control;
+
+  int x_local = SUB_SCU(x);
+  int y_local = SUB_SCU(y);
+
+  const int cu_width = LCU_WIDTH >> depth;
+  
+  const cu_info_t* left_cu = NULL, *above_cu = NULL;
+  if (x) {
+    left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local);
+  }
+  if (y) {
+    above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1);
+  }
+  uint8_t split_model = 0;
+
+  // Absolute coordinates
+  uint16_t abs_x = x + state->tile->offset_x;
+  uint16_t abs_y = y + state->tile->offset_y;
+
+  // Check for slice border
+  bool border_x = ctrl->in.width < abs_x + cu_width;
+  bool border_y = ctrl->in.height < abs_y + cu_width;
+  bool border = border_x || border_y; /*!< are we in any border CU */
+
+  if (depth <= state->frame->max_qp_delta_depth) {
+    state->must_code_qp_delta = true;
+  }
+
+  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
+  if (depth != MAX_DEPTH) {
+    // Implicit split flag when on border
+    if (!border) {
+      // Get left and top block split_flags and if they are present and true, increase model number
+      if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) {
+        split_model++;
+      }
+
+      if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) {
+        split_model++;
+      }
+
+      // This mocks encoding the current CU, so it should never be split
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), 0, bits, "SplitFlag");
+    }
+  }
+
+  // Encode skip flag
+  if (state->frame->slicetype != KVZ_SLICE_I) {
+    int8_t ctx_skip = 0;
+
+    if (left_cu && left_cu->skipped) {
+      ctx_skip++;
+    }
+    if (above_cu && above_cu->skipped) {
+      ctx_skip++;
+    }
+    
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag");
+
+    if (cur_cu->skipped) {
+      int16_t num_cand = state->encoder_control->cfg.max_merge;
+      if (num_cand > 1) {
+        for (int ui = 0; ui < num_cand - 1; ui++) {
+          int32_t symbol = (ui != cur_cu->merge_idx);
+          if (ui == 0) {
+            CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
+          }
+          else {
+            CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+            if(cabac->only_count) bits += 1;
+          }
+          if (symbol == 0) {
+            break;
+          }
+        }
+      }
+      return bits;
+    }
+  }
+  // Prediction mode
+  if (state->frame->slicetype != KVZ_SLICE_I && cu_width != 4) {
+
+    int8_t ctx_predmode = 0;
+
+    if ((left_cu && left_cu->type == CU_INTRA) || (above_cu && above_cu->type == CU_INTRA)) {
+      ctx_predmode = 1;
+    }
+
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model[ctx_predmode]), (cur_cu->type == CU_INTRA), bits, "PredMode");
+  }
+  
+  if (cur_cu->type == CU_INTER) {
+    const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size];
+
+    for (int i = 0; i < num_pu; ++i) {
+      const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i);
+      const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i);
+      const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i);
+      const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
+      const cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
+
+      kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits);
+    }
+  }
+  else if (cur_cu->type == CU_INTRA) {
+    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, NULL, &bits);
+  }
+  return bits;
+}
+
 
 void kvz_encode_mvd(encoder_state_t * const state,
                     cabac_data_t *cabac,
                     int32_t mvd_hor,
-                    int32_t mvd_ver)
+                    int32_t mvd_ver, double* bits_out)
 {
   const int8_t hor_abs_gr0 = mvd_hor != 0;
   const int8_t ver_abs_gr0 = mvd_ver != 0;
@@ -1523,29 +1657,33 @@ void kvz_encode_mvd(encoder_state_t * const state,
   const uint32_t mvd_ver_abs = abs(mvd_ver);
 
   cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0];
-  CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
-  CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");
+  CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor");
+  CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver");
 
   cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1];
   if (hor_abs_gr0) {
-    CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
+    CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor");
   }
   if (ver_abs_gr0) {
-    CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
+    CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver");
   }
 
   if (hor_abs_gr0) {
     if (mvd_hor_abs > 1) {
-      kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      if(cabac->only_count) *bits_out += bits;
     }
     uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1;
     CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor");
+    if (cabac->only_count) *bits_out += 1;
   }
   if (ver_abs_gr0) {
     if (mvd_ver_abs > 1) {
-      kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      if (cabac->only_count) *bits_out += bits;
     }
     uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1;
     CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver");
+    if (cabac->only_count) *bits_out += 1;
   }
 }
diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h
index ea792845..24f2759d 100644
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@@ -56,7 +56,22 @@ void kvz_encode_ts_residual(encoder_state_t* const state,
 void kvz_encode_mvd(encoder_state_t * const state,
                     cabac_data_t *cabac,
                     int32_t mvd_hor,
-                    int32_t mvd_ver);
+                    int32_t mvd_ver,
+                    double* bits_out);
+
+double kvz_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu);
+
+int kvz_encode_inter_prediction_unit(encoder_state_t* const state,
+                                      cabac_data_t* const cabac,
+                                      const cu_info_t* const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, 
+                                      lcu_t* lcu,
+                                      double* bits_out);
 
 void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
   uint8_t lastpos_x, uint8_t lastpos_y,
diff --git a/src/encoder.c b/src/encoder.c
index 98d87690..6ecddb86 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -32,7 +32,6 @@
 
 #include "encoder.h"
 
-// This define is required for M_PI on Windows.
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <stdio.h>
@@ -45,14 +44,6 @@
 #include "kvz_math.h"
 #include "fast_coeff_cost.h"
 
-/**
- * \brief Strength of QP adjustments when using adaptive QP for 360 video.
- *
- * Determined empirically.
- */
-static const double ERP_AQP_STRENGTH = 3.0;
-
-
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
 
 static unsigned cfg_num_threads(void)
@@ -136,22 +127,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder)
 }
 
 
-/**
- * \brief Return weight for 360 degree ERP video
- *
- * Returns the scaling factor of area from equirectangular projection to
- * spherical surface.
- *
- * \param y   y-coordinate of the pixel
- * \param h   height of the picture
- */
-static double ws_weight(int y, int h)
-{
-  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
-}
-
-
-
 /**
  * \brief Update ROI QPs for 360 video with equirectangular projection.
  *
@@ -162,55 +137,6 @@ static double ws_weight(int y, int h)
  * \param orig_width    width of orig_roi
  * \param orig_height   height of orig_roi
  */
-static void init_erp_aqp_roi(encoder_control_t* encoder,
-                             int8_t *orig_roi,
-                             int32_t orig_width,
-                             int32_t orig_height)
-{
-  // Update ROI with WS-PSNR delta QPs.
-  int height = encoder->in.height_in_lcu;
-  int width  = orig_roi ? orig_width : 1;
-
-  int frame_height = encoder->in.real_height;
-
-  encoder->cfg.roi.width  = width;
-  encoder->cfg.roi.height = height;
-  encoder->cfg.roi.dqps   = calloc(width * height, sizeof(orig_roi[0]));
-
-  double total_weight = 0.0;
-  for (int y = 0; y < frame_height; y++) {
-    total_weight += ws_weight(y, frame_height);
-  }
-
-  for (int y_lcu = 0; y_lcu < height; y_lcu++) {
-    int y_orig = LCU_WIDTH * y_lcu;
-    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
-
-    double lcu_weight = 0.0;
-    for (int y = y_orig; y < y_orig + lcu_height; y++) {
-      lcu_weight += ws_weight(y, frame_height);
-    }
-    // Normalize.
-    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
-
-    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
-
-    if (orig_roi) {
-      // If a ROI array already exists, we copy the existing values to the
-      // new array while adding qp_delta to each.
-      int y_roi = y_lcu * orig_height / height;
-      for (int x = 0; x < width; x++) {
-        encoder->cfg.roi.dqps[x + y_lcu * width] =
-          CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta);
-      }
-
-    } else {
-      // Otherwise, simply write qp_delta to the ROI array.
-      encoder->cfg.roi.dqps[y_lcu] = qp_delta;
-    }
-  }
-}
-
 
 static int8_t* derive_chroma_QP_mapping_table(const kvz_config* const cfg, int i)
 {
@@ -394,6 +320,16 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
     encoder->scaling_list.use_default_list = 1;
   }
 
+  // ROI / delta QP
+  if (cfg->roi.file_path) {
+    const char *mode[2] = { "r", "rb" };
+    encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]);
+    if (!encoder->roi_file) {
+      fprintf(stderr, "Could not open ROI file.\n");
+      goto init_failed;
+    }
+  }
+
   if (cfg->fast_coeff_table_fn) {
     FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
     if (fast_coeff_table_f == NULL) {
@@ -435,32 +371,10 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
     goto init_failed;
   }
 
-  if (cfg->erp_aqp) {
-    init_erp_aqp_roi(encoder,
-                     cfg->roi.dqps,
-                     cfg->roi.width,
-                     cfg->roi.height);
-
-  } else if (cfg->roi.dqps) {
-    // Copy delta QP array for ROI coding.
-    const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height;
-    encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0]));
-    memcpy(encoder->cfg.roi.dqps,
-           cfg->roi.dqps,
-           roi_size * sizeof(*cfg->roi.dqps));
-
-  }
-
   // NOTE: When tr_depth_inter is equal to 0, the transform is still split
   // for SMP and AMP partition units.
   encoder->tr_depth_inter = 0;
 
-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
-    encoder->max_qp_delta_depth = 0;
-  } else {
-    encoder->max_qp_delta_depth = -1;
-  }
-
   //Tiles
   encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                           encoder->cfg.tiles_height_count > 1;
@@ -761,7 +675,7 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
 
   FREE_POINTER(encoder->tiles_tile_id);
 
-  FREE_POINTER(encoder->cfg.roi.dqps);
+  FREE_POINTER(encoder->cfg.roi.file_path);
 
   kvz_scalinglist_destroy(&encoder->scaling_list);
 
@@ -773,6 +687,10 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
 
   kvz_close_rdcost_outfiles();
 
+  if (encoder->roi_file) {
+    fclose(encoder->roi_file);
+  }
+
   free(encoder);
 }
 
diff --git a/src/encoder.h b/src/encoder.h
index 6d301611..c0d0fda3 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -130,7 +130,7 @@ typedef struct encoder_control_t
   //! Picture weights when GOP is used.
   double gop_layer_weights[MAX_GOP_LAYERS];
 
-  int8_t max_qp_delta_depth;
+  FILE *roi_file;
 
   int tr_depth_inter;
 
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 0f84b512..2f24894e 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -805,10 +805,10 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream,
   WRITE_U(stream, 0, 1, "pps_ref_wraparound_enabled_flag");
 
   WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pps_init_qp_minus26");
-  WRITE_U(stream, encoder->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
-  if (encoder->max_qp_delta_depth >= 0) {
+  WRITE_U(stream, state->frame->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
+  if (state->frame->max_qp_delta_depth >= 0) {
     // Use separate QP for each LCU when rate control is enabled.    
-    WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth");
+    WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth");
   }
 
   WRITE_U(stream, 0,1, "pps_chroma_tool_offsets_present_flag");
diff --git a/src/encoderstate.c b/src/encoderstate.c
index b0691ac7..db5b93f3 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -32,6 +32,9 @@
 
 #include "encoderstate.h"
 
+ // This define is required for M_PI on Windows.
+#define _USE_MATH_DEFINES
+#include <ctype.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -53,6 +56,13 @@
 
 #include "strategies/strategies-picture.h"
 
+/**
+ * \brief Strength of QP adjustments when using adaptive QP for 360 video.
+ *
+ * Determined empirically.
+ */
+static const double ERP_AQP_STRENGTH = 3.0;
+
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
   int i;
@@ -572,7 +582,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
   cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y);
   const int cu_width = LCU_WIDTH >> depth;
 
-  if (depth <= state->encoder_control->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
     *prev_qp = -1;
   }
 
@@ -665,7 +675,7 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
 
   encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);
 
-  if (encoder->max_qp_delta_depth >= 0) {
+  if (state->frame->max_qp_delta_depth >= 0) {
     int last_qp = state->last_qp;
     int prev_qp = -1;
     set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
@@ -716,6 +726,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
   const uint64_t existing_bits = kvz_bitstream_tell(&state->stream);
 
   //Encode SAO
+  state->cabac.update = 1;
   if (encoder->cfg.sao_type) {
     encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
   }
@@ -771,6 +782,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
       kvz_cabac_start(&state->cabac);
     }
   }
+  state->cabac.update = 0;
 
 
   pthread_mutex_lock(&state->frame->rc_lock);
@@ -1421,6 +1433,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
   }
 }
 
+
+/**
+ * \brief Weight of a pixel row in 360 degree ERP video
+ *
+ * Area scaling factor from equirectangular projection to spherical surface,
+ * computed as cos((y - h / 2 + 1 / 2) * pi / h).
+ * \param y   y-coordinate of the pixel
+ * \param h   height of the picture
+ */
+static double ws_weight(int y, int h)
+{
+  const double angle = (y - 0.5 * h + 0.5) * (M_PI / h);
+  return cos(angle);
+}
+
+
+/**
+ * \brief Update ROI QPs for 360 video with equirectangular projection.
+ *
+ * Replaces frame->roi with a map of one row per LCU row, adding a WS-PSNR delta
+ * QP to the old values. The old map is freed; on allocation failure it is kept.
+ *
+ * \param encoder       encoder control
+ * \param frame         frame that will have the ROI map
+ */
+static void init_erp_aqp_roi(const encoder_control_t *encoder, kvz_picture *frame)
+{
+  int8_t *orig_roi    = frame->roi.roi_array;
+  int32_t orig_width  = frame->roi.width;
+  int32_t orig_height = frame->roi.height;
+
+  int new_height = encoder->in.height_in_lcu;
+  int new_width = orig_roi ? orig_width : 1;
+  int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0]));
+  if (!new_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    assert(0);
+    return;
+  }
+
+  int frame_height = encoder->in.real_height;
+  double total_weight = 0.0;
+  for (int y = 0; y < frame_height; y++) {
+    total_weight += ws_weight(y, frame_height);
+  }
+
+  for (int y_lcu = 0; y_lcu < new_height; y_lcu++) {
+    int y_orig = LCU_WIDTH * y_lcu;
+    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
+    double lcu_weight = 0.0;
+    for (int y = y_orig; y < y_orig + lcu_height; y++) {
+      lcu_weight += ws_weight(y, frame_height);
+    }
+    // Normalize.
+    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
+    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
+
+    if (orig_roi) {
+      // If a ROI array already exists, we copy the existing values to the
+      // new array while adding qp_delta to each.
+      int y_roi = y_lcu * orig_height / new_height;
+      for (int x = 0; x < new_width; x++) {
+        new_array[x + y_lcu * new_width] = CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta);
+      }
+    } else {
+      // Otherwise, simply write qp_delta to the ROI array.
+      new_array[y_lcu] = qp_delta;
+    }
+  }
+
+  // Update new values
+  frame->roi.width = new_width;
+  frame->roi.height = new_height;
+  frame->roi.roi_array = new_array;
+  FREE_POINTER(orig_roi);
+}
+
+
+static void next_roi_frame_from_file(kvz_picture *frame, FILE *file, enum kvz_roi_format format) {
+  // Reads the next ROI frame into frame->roi: width, height and then
+  // width * height dqp values. Failures are reported on stderr and stop at assert(0).
+  // Rewind the (seekable) ROI file when end of file is reached.
+  // Allows a single ROI frame to be used for a whole sequence
+  // and looping with --loop-input. Skips possible whitespace.
+  if (ftell(file) != -1L) {
+    int c = fgetc(file);
+    while (format == KVZ_ROI_TXT && isspace(c)) c = fgetc(file);
+    ungetc(c, file);
+    if (c == EOF) rewind(file);
+  }
+
+  int *width  = &frame->roi.width;
+  int *height = &frame->roi.height;
+  int32_t bin_size[2] = { 0, 0 }; // Width and height as stored in a binary file.
+  bool failed = false;
+
+  // fscanf returns EOF (non-zero) at end of input, hence the comparison with 1.
+  if (format == KVZ_ROI_TXT) failed = fscanf(file, "%d", width) != 1 || fscanf(file, "%d", height) != 1;
+  if (format == KVZ_ROI_BIN) failed = fread(bin_size, sizeof(bin_size[0]), 2, file) != 2;
+  if (format == KVZ_ROI_BIN) { *width = bin_size[0]; *height = bin_size[1]; }
+
+  if (failed) {
+    fprintf(stderr, "Failed to read ROI size.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  if (*width <= 0 || *height <= 0) {
+    fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height);
+    fclose(file);
+    assert(0);
+  }
+
+  if (*width > 10000 || *height > 10000) {
+    fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  const unsigned size = (*width) * (*height);
+  int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0]));
+  if (!dqp_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  FREE_POINTER(frame->roi.roi_array);
+  frame->roi.roi_array = dqp_array;
+
+  if (format == KVZ_ROI_TXT) {
+    for (unsigned i = 0; i < size; ++i) {
+      int number; // Need a pointer to int for fscanf
+      if (fscanf(file, "%d", &number) != 1) {
+        fprintf(stderr, "Reading ROI file failed.\n");
+        fclose(file);
+        assert(0);
+      }
+      dqp_array[i] = CLIP(-51, 51, number);
+    }
+  } else if (format == KVZ_ROI_BIN) {
+    if (fread(dqp_array, 1, size, file) != size) {
+      fprintf(stderr, "Reading ROI file failed.\n");
+      assert(0);
+    }
+  }
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
   assert(state->type == ENCODER_STATE_TYPE_MAIN);
 
@@ -1437,6 +1597,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
     memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu);
   }
 
+  // ROI / delta QP maps
+  if (frame->roi.roi_array && cfg->roi.file_path) {
+    assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified.");
+  }
+
+  // Read frame from the file. If no file is specified,
+  // ROI data should be already set by the application.
+  if (cfg->roi.file_path) {
+    next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format);
+  }
+  
+  if (cfg->erp_aqp) {
+    init_erp_aqp_roi(state->encoder_control, state->tile->frame->source);
+  }
+
   // Variance adaptive quantization
   if (cfg->vaq) {
     const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
@@ -1523,6 +1698,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
   }
   // Variance adaptive quantization - END
 
+  if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) {
+    state->frame->max_qp_delta_depth = 0;
+  } else {
+    state->frame->max_qp_delta_depth = -1;
+  }
+
   // Use this flag to handle closed gop irap picture selection.
   // If set to true, irap is already set and we avoid
   // setting it based on the intra period
@@ -1834,10 +2015,9 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y)
 
 int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp)
 {
-  const encoder_control_t *ctrl = state->encoder_control;
   const cu_array_t *cua = state->tile->frame->cu_array;
   // Quantization group width
-  const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth);
+  const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth);
 
   // Coordinates of the top-left corner of the quantization group
   const int x_qg = x & ~(qg_width - 1);
diff --git a/src/encoderstate.h b/src/encoderstate.h
index 8100cf31..19c0d196 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -179,6 +179,8 @@ typedef struct encoder_state_config_frame_t {
   */
   double *aq_offsets;
 
+  int8_t max_qp_delta_depth;
+
   /**
    * \brief Whether next NAL is the first NAL in the access unit.
    */
@@ -320,6 +322,7 @@ typedef struct encoder_state_t {
   
   bitstream_t stream;
   cabac_data_t cabac;
+  cabac_data_t search_cabac;
 
   uint32_t stats_bitstream_length; //Bitstream length written in bytes
 
@@ -402,10 +405,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
  */
 static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) return false;
+  if (state->frame->max_qp_delta_depth < 0) return false;
 
   const int cu_width = LCU_WIDTH >> depth;
-  const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth;
+  const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
   const int right  = x + cu_width;
   const int bottom = y + cu_width;
   return (right % qg_width == 0 || right >= state->tile->frame->width) &&
diff --git a/src/fast_coeff_cost.c b/src/fast_coeff_cost.c
index 4fc392bf..cf6173db 100644
--- a/src/fast_coeff_cost.c
+++ b/src/fast_coeff_cost.c
@@ -40,7 +40,7 @@ static uint16_t to_q88(float f)
   return (uint16_t)(f * 256.0f + 0.5f);
 }
 
-static uint64_t to_4xq88(const float f[4])
+static uint64_t to_4xq88(const double f[4])
 {
   int i;
   uint64_t result = 0;
@@ -58,9 +58,9 @@ int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_
   uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp;
 
   for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) {
-    float curr_wts[4];
+    double curr_wts[4];
 
-    if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0,
+    if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0,
                                                     curr_wts + 1,
                                                     curr_wts + 2,
                                                     curr_wts + 3) != 4) {
diff --git a/src/fast_coeff_cost.h b/src/fast_coeff_cost.h
index dcd67c8d..8dcfbd08 100644
--- a/src/fast_coeff_cost.h
+++ b/src/fast_coeff_cost.h
@@ -45,7 +45,7 @@ typedef struct {
 
 // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from
 // 0 to MAX_FAST_COEFF_COST_QP
-static const float default_fast_coeff_cost_wts[][4] = {
+static const double default_fast_coeff_cost_wts[][4] = {
   // Just extend it by stretching the first actual values..
   {0.164240f, 4.161530f, 3.509033f, 6.928047f},
   {0.164240f, 4.161530f, 3.509033f, 6.928047f},
diff --git a/src/filter.c b/src/filter.c
index aad84dbc..5b2d5641 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -339,7 +339,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir)
 
 static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) {
+  if (state->frame->max_qp_delta_depth < 0) {
     return state->qp;
   }
 
diff --git a/src/image.c b/src/image.c
index 39d17ea3..f3aee439 100644
--- a/src/image.c
+++ b/src/image.c
@@ -106,6 +106,10 @@ kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_
 
   im->interlacing = KVZ_INTERLACING_NONE;
 
+  im->roi.roi_array = NULL;
+  im->roi.width = 0;
+  im->roi.height = 0;
+
   return im;
 }
 
@@ -132,6 +136,7 @@ void kvz_image_free(kvz_picture *const im)
     kvz_image_free(im->base_image);
   } else {
     free(im->fulldata_buf);
+    if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array);
   }
 
   // Make sure freed data won't be used.
@@ -192,6 +197,8 @@ kvz_picture *kvz_image_make_subimage(kvz_picture *const orig_image,
   im->pts = 0;
   im->dts = 0;
 
+  im->roi = orig_image->roi;
+
   return im;
 }
 
diff --git a/src/inter.c b/src/inter.c
index 9fad9619..44ac599f 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -1290,7 +1290,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
                                         int32_t width,
                                         int32_t height,
                                         const merge_candidates_t *merge_cand,
-                                        const cu_info_t *cur_cu,
+                                        const cu_info_t * const cur_cu,
                                         int8_t reflist,
                                         mv_t mv_cand[2][2])
 {
@@ -1396,7 +1396,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t width,
                            int32_t height,
                            mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t  * const cur_cu,
                            lcu_t *lcu,
                            int8_t reflist)
 {
diff --git a/src/inter.h b/src/inter.h
index 981017dc..017ee3a5 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -96,7 +96,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t width,
                            int32_t height,
                            mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t* cur_cu,
                            lcu_t *lcu,
                            int8_t reflist);
 
diff --git a/src/kvazaar.h b/src/kvazaar.h
index 00052f83..32e77ec2 100644
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@@ -267,6 +267,12 @@ enum kvz_amvr_resolution
   KVZ_IMV_HPEL    = 3
 };
 
+enum kvz_roi_format // Storage format of the ROI (delta QP) file.
+{
+  KVZ_ROI_TXT = 0, /*!< \brief ROI file is read as text. */
+  KVZ_ROI_BIN = 1  /*!< \brief ROI file is read as binary. */
+};
+
 // Map from input format to chroma format.
 #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)format)
 
@@ -410,10 +416,9 @@ typedef struct kvz_config
   int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */
 
   struct {
-    int32_t width;
-    int32_t height;
-    int8_t *dqps;
-  } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */
+    char *file_path;
+    enum kvz_roi_format format;
+  } roi; /*!< \brief Specify delta QPs for region of interest coding. */
 
   unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */
 
@@ -526,6 +531,12 @@ typedef struct kvz_config
   int8_t cclm;
 
   int8_t amvr; /* \brief Adaptive motion vector resolution parameter */
+
+  /** \brief whether to try combining intra cus at the lower depth when search
+   *         is not performed at said depth*/
+  uint8_t combine_intra_cus;
+
+  uint8_t force_inter;
 } kvz_config;
 
 /**
@@ -557,6 +568,14 @@ typedef struct kvz_picture {
   enum kvz_chroma_format chroma_format;
 
   int32_t ref_pocs[16];
+
+  struct
+  {
+    int width;
+    int height;
+    int8_t *roi_array;
+  } roi;
+
 } kvz_picture;
 
 /**
@@ -782,6 +801,9 @@ typedef struct kvz_api {
    * the bitstream, length of the bitstream, the reconstructed frame, the
    * original frame and frame info in data_out, len_out, pic_out, src_out and
    * info_out, respectively. Otherwise, set the output parameters to NULL.
+   * 
+   * Region of interest (ROI) / delta QP map can be specified in the input
+   * picture's ROI field but only when a ROI file is not used.
    *
    * After passing all of the input frames, the caller should keep calling this
    * function with pic_in set to NULL, until no more data is returned in the
diff --git a/src/rate_control.c b/src/rate_control.c
index de4046b0..8196d7de 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -1088,17 +1088,20 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
   const encoder_control_t * const ctrl = state->encoder_control;
   lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y);
 
-  if (ctrl->cfg.roi.dqps != NULL) {
-    vector2d_t lcu = {
+  if (state->tile->frame->source->roi.roi_array) {
+    vector2d_t lcu_vec = {
       pos.x + state->tile->lcu_offset_x,
       pos.y + state->tile->lcu_offset_y
     };
     vector2d_t roi = {
-      lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu,
-      lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu
+      lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu,
+      lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu
     };
-    int roi_index = roi.x + roi.y * ctrl->cfg.roi.width;
-    int dqp = ctrl->cfg.roi.dqps[roi_index];
+    int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width;
+    int dqp = state->tile->frame->source->roi.roi_array[roi_index];
+    if(dqp != 0) {
+      pos.x = 0;
+    }
     state->qp = CLIP_TO_QP(state->frame->QP + dqp);
     state->lambda = qp_to_lambda(state, state->qp);
     state->lambda_sqrt = sqrt(state->lambda);
diff --git a/src/rdo.c b/src/rdo.c
index be85b817..2ead71df 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -315,12 +315,12 @@ static INLINE uint32_t get_coeff_cabac_cost(
   // Take a copy of the CABAC so that we don't overwrite the contexts when
   // counting the bits.
   cabac_data_t cabac_copy;
-  memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
 
   // Clear bytes and bits and set mode to "count"
   cabac_copy.only_count = 1;
-  cabac_copy.num_buffered_bytes = 0;
-  cabac_copy.bits_left = 23;
+  int num_buffered_bytes = cabac_copy.num_buffered_bytes;
+  int bits_left = cabac_copy.bits_left;
 
   // Execute the coding function.
   // It is safe to drop the const modifier since state won't be modified
@@ -343,8 +343,10 @@ static INLINE uint32_t get_coeff_cabac_cost(
       type,
       scan_mode);
   }
-
-  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
+  if(cabac_copy.update) {
+    memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy));
+  }
+  return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3);
 }
 
 static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
@@ -1741,37 +1743,33 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
 /**
  * Calculate cost of actual motion vectors using CABAC coding
  */
-uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       const int32_t mvd_hor,
-                                       const int32_t mvd_ver)
+double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     const int32_t mvd_hor,
+                                     const int32_t mvd_ver)
 {
   cabac_data_t cabac_copy = *cabac;
   cabac_copy.only_count = 1;
-
+  double bits = 0;
   // It is safe to drop const here because cabac->only_count is set.
-  kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
+  kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits);
 
-  uint32_t bitcost =
-    ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
-    ((23 - cabac->bits_left)     + (cabac->num_buffered_bytes << 3));
-
-  return bitcost;
+  return bits;
 }
 
 /** MVD cost calculation with CABAC
 * \returns int
 * Calculates Motion Vector cost and related costs using CABAC coding
 */
-uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
-                                 int x,
-                                 int y,
-                                 int mv_shift,
-                                 mv_t mv_cand[2][2],
-                                 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                                 int16_t num_cand,
-                                 int32_t ref_idx,
-                                 uint32_t *bitcost)
+double kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
+                               int x,
+                               int y,
+                               int mv_shift,
+                               mv_t mv_cand[2][2],
+                               inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                               int16_t num_cand,
+                               int32_t ref_idx,
+                               double* bitcost)
 {
   cabac_data_t state_cabac_copy;
   cabac_data_t* cabac;
@@ -1798,14 +1796,13 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
   }
 
   // Store cabac state and contexts
-  memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
+  memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t));
 
   // Clear bytes and bits and set mode to "count"
   state_cabac_copy.only_count = 1;
-  state_cabac_copy.num_buffered_bytes = 0;
-  state_cabac_copy.bits_left = 23;
 
   cabac = &state_cabac_copy;
+  double bits = 0;
 
   if (!merged) {
     vector2d_t mvd1 = {
@@ -1820,8 +1817,8 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
     kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1);
     kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2);
 
-    uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
-    uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
+    double cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
+    double cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
 
     // Select candidate 1 if it has lower cost
     if (cand2_cost < cand1_cost) {
@@ -1834,7 +1831,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
 
   cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
 
-  CABAC_BIN(cabac, merged, "MergeFlag");
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag");
   num_cand = state->encoder_control->cfg.max_merge;
   if (merged) {
     if (num_cand > 1) {
@@ -1842,10 +1839,10 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
       for (ui = 0; ui < num_cand - 1; ui++) {
         int32_t symbol = (ui != merge_idx);
         if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
         } else {
           CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+          bits += 1;
         }
         if (symbol == 0) break;
       }
@@ -1868,24 +1865,23 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
         if (ref_list[ref_list_idx] > 1) {
           // parseRefFrmIdx
           int32_t ref_frame = ref_idx;
-
-          cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-          CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
+          
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");
 
           if (ref_frame > 0) {
             int32_t i;
             int32_t ref_num = ref_list[ref_list_idx] - 2;
-
-            cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
+            
             ref_frame--;
 
             for (i = 0; i < ref_num; ++i) {
               const uint32_t symbol = (i == ref_frame) ? 0 : 1;
 
               if (i == 0) {
-                CABAC_BIN(cabac, symbol, "ref_idx_lX");
+                CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX");
               } else {
                 CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
+                bits += 1;
               }
               if (symbol == 0) break;
             }
@@ -1895,7 +1891,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
         // ToDo: Bidir vector support
         if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) {
           // It is safe to drop const here because cabac->only_count is set.
-          kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y);
+          kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits);
         }
 
         // Signal which candidate MV to use
@@ -1905,10 +1901,10 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
     }
   }
 
-  *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
+  *bitcost = bits;
 
   // Store bitcost before restoring cabac
-  return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
+  return *bitcost * state->lambda_sqrt;
 }
 
 void kvz_close_rdcost_outfiles(void)
diff --git a/src/rdo.h b/src/rdo.h
index da6cb7d4..02b218f2 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -77,10 +77,10 @@ uint32_t kvz_get_coded_level(encoder_state_t * state, double* coded_cost, double
 
 kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;
 
-uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       int32_t mvd_hor,
-                                       int32_t mvd_ver);
+double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     int32_t mvd_hor,
+                                     int32_t mvd_ver);
 
 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
@@ -90,8 +90,5 @@ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
 extern const uint32_t kvz_entropy_bits[512];
 #define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
 
-// Floating point fractional bits, derived from kvz_entropy_bits
-extern const float kvz_f_entropy_bits[512];
-#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
 
 #endif
diff --git a/src/sao.c b/src/sao.c
index 461bdf90..1bf1ec29 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) {
 }
 
 
-static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
+static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {    
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded type_idx_, none = 0
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+  CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type");
 
   return mode_bits;
 }
 
-static float sao_mode_bits_merge(const encoder_state_t * const state,
+static double sao_mode_bits_merge(const encoder_state_t * const state,
                                  int8_t merge_cand) {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   ctx = &(cabac->ctx.sao_merge_flag_model);
 
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag");
   if (merge_cand == 1) return mode_bits;
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag");
   return mode_bits;
 }
 
 
-static float sao_mode_bits_edge(const encoder_state_t * const state,
+static double sao_mode_bits_edge(const encoder_state_t * const state,
                               int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES],
                               sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
-    ctx = &(cabac->ctx.sao_merge_flag_model);   
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    ctx = &(cabac->ctx.sao_merge_flag_model);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded type_idx_, edge = 2 = cMax
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;
 
   // TR coded offsets.
   for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) {
@@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state,
 }
 
 
-static float sao_mode_bits_band(const encoder_state_t * const state,
+static double sao_mode_bits_band(const encoder_state_t * const state,
                               int band_position[2], int offsets[10],
                               sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded sao_type_idx_, band = 1
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;
 
   // TR coded offsets and possible FL coded offset signs.
   for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++)
@@ -552,7 +554,8 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_
   // Choose between SAO and doing nothing, taking into account the
   // rate-distortion cost of coding do nothing.
   {
-    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5);
+    float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left);
+    int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5);
     if (sao_out->ddistortion >= cost_of_nothing) {
       sao_out->type = SAO_TYPE_NONE;
       merge_cost[0] = cost_of_nothing;
diff --git a/src/search.c b/src/search.c
index 1bdc67d5..3bd39e6b 100644
--- a/src/search.c
+++ b/src/search.c
@@ -37,6 +37,7 @@
 
 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "imagelist.h"
 #include "inter.h"
 #include "intra.h"
@@ -59,14 +60,6 @@
 // Cost threshold for doing intra search in inter frames with --rd=0.
 static const int INTRA_THRESHOLD = 8;
 
-// Modify weight of luma SSD.
-#ifndef LUMA_MULT
-# define LUMA_MULT 0.8
-#endif
-// Modify weight of chroma SSD.
-#ifndef CHROMA_MULT
-# define CHROMA_MULT 1.5
-#endif
 
 static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
@@ -225,16 +218,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
   const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
 
   double ssd = 0.0;
-  ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+  ssd += KVZ_LUMA_MULT * kvz_pixels_calc_ssd(
     &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
     LCU_WIDTH, LCU_WIDTH, cu_width
     );
   if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) {
-    ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+    ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd(
       &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
       LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
       );
-    ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+    ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd(
       &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
       LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
       );
@@ -294,11 +287,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
 * prediction unit data needs to be coded.
 */
 double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu)
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu)
 {
   const int width = LCU_WIDTH >> depth;
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
 
   // cur_cu is used for TU parameters.
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
@@ -324,14 +319,36 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
     return sum + tr_tree_bits * state->lambda;
   }
 
+
+  if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) {
+    // Because the chroma cbf flags need to be coded before the luma cbf, they also need
+    // to be counted before the cabac state changes. Since this branch is only executed
+    // when calculating the last RD cost, it is not a problem to include the chroma cbf
+    // costs in luma, because the chroma cost is calculated right after the luma cost.
+    // However, if we have a different tr_depth, the bits cannot be written in the
+    // correct order anyway, so do not touch the chroma cbf here.
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+      cabac->cur_ctx = cr_ctx;
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
+      cr_ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
+    }
+  }
+
   // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
   if (pred_cu->type == CU_INTRA ||
-      tr_depth > 0 ||
+      is_tr_split ||
       cbf_is_set(tr_cu->cbf, depth, COLOR_U) ||
       cbf_is_set(tr_cu->cbf, depth, COLOR_V))
   {
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[0]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y));
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
+    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
   }
 
   // SSD between reconstruction and original
@@ -343,7 +360,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
                                         width);
   }
 
-  {
+
+  if (!skip_residual_coding) {
     int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
     const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
 
@@ -351,18 +369,19 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LUMA_MULT + bits * state->lambda;
+  return (double)ssd * KVZ_LUMA_MULT + bits * state->lambda;
 }
 
 
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu)
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu)
 {
   const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
   const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
 
   double tr_tree_bits = 0;
   double joint_cbcr_tr_tree_bits = 0;
@@ -378,22 +397,27 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
     return 0;
   }
 
-  if (depth < MAX_PU_DEPTH) {
+  // See kvz_cu_rd_cost_luma for why the second condition is needed.
+  if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) {
     const int tr_depth = depth - pred_cu->depth;
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+    cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+    cabac->cur_ctx = ctx;
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
     }
     if(state->encoder_control->cfg.jccr) {
       joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1);
     }
     int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
-    ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]);
+    ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]);
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
     }
     if(state->encoder_control->cfg.jccr) {
-      ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
+      ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
       joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1);
     }
   }
@@ -401,7 +425,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
 
   if (tr_cu->tr_depth > depth) {
     int offset = LCU_WIDTH >> (depth + 1);
-    int sum = 0;
+    double sum = 0;
 
     sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
     sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
@@ -448,6 +472,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
     }
   }
 
+  if (!skip_residual_coding)
   {
     int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
     const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
@@ -464,8 +489,8 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
   double bits = tr_tree_bits + coeff_bits;
   double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits;
 
-  double cost = (double)ssd + bits * state->c_lambda;
-  double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda;
+  double cost = (double)ssd * KVZ_CHROMA_MULT + bits * state->c_lambda;
+  double joint_cost = (double)joint_ssd * KVZ_CHROMA_MULT + joint_bits * state->c_lambda;
   if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) {
     pred_cu->joint_cb_cr = 0;
     return cost;    
@@ -485,6 +510,117 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
   return joint_cost;
 }
 
+static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
+                                           const int x_px, const int y_px, const int depth,
+                                           const cu_info_t* const pred_cu,
+                                           lcu_t* const lcu) {
+  const int width = LCU_WIDTH >> depth;
+
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  // tr_cu is used for TU parameters.
+  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+
+  double coeff_bits = 0;
+  double tr_tree_bits = 0;
+
+  // Check that the pixel coordinates are inside the LCU.
+  assert(x_px >= 0 && x_px < LCU_WIDTH);
+  assert(y_px >= 0 && y_px < LCU_WIDTH);
+
+  const uint8_t tr_depth = tr_cu->tr_depth - depth;
+
+  const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U);
+  const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V);
+
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+
+  {
+    int cbf = cbf_is_set_any(pred_cu->cbf, depth);
+    // Only need to signal coded block flag if not skipped or merged
+    // skip = no coded residual, merge = coded residual
+    if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf");
+    }
+
+  }
+
+  if(state->encoder_control->chroma_format != KVZ_CSP_400 && !skip_residual_coding) {
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
+    } 
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr");
+    } 
+  }
+
+  if (tr_depth > 0) {
+    int offset = LCU_WIDTH >> (depth + 1);
+    double sum = 0;
+
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    return sum + tr_tree_bits * state->lambda;
+  }
+  const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ;
+
+  // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = depth - tr_cu->depth;
+  if ((pred_cu->type == CU_INTRA ||
+    is_tr_split ||
+    cb_flag_u ||
+    cb_flag_v) 
+      && !skip_residual_coding)
+  {
+    cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
+  }
+  // SSD between reconstruction and original
+  unsigned luma_ssd = 0;
+  if (!state->encoder_control->cfg.lossless) {
+    int index = y_px * LCU_WIDTH + x_px;
+    luma_ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
+      LCU_WIDTH, LCU_WIDTH,
+      width);
+  }
+
+  {
+    int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+    const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
+
+    coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode, tr_cu->tr_skip);
+  }
+
+  unsigned chroma_ssd = 0;
+  if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) {
+    const vector2d_t lcu_px = { x_px / 2, y_px / 2 };
+    const int chroma_width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+    if (!state->encoder_control->cfg.lossless) {
+      int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
+      unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        chroma_width);
+      unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        chroma_width);
+      chroma_ssd = ssd_u + ssd_v;
+    }
+
+     {
+      int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+      const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
+
+      coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0);
+      coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0);
+    }
+  }
+
+  double bits = tr_tree_bits + coeff_bits;
+  return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda;
+}
+
 
 // Return estimate of bits used to code prediction mode of cur_cu.
 static double calc_mode_bits(const encoder_state_t *state,
@@ -518,6 +654,7 @@ static double calc_mode_bits(const encoder_state_t *state,
 }
 
 
+// TODO: replace usages of this by the kvz_sort_keys_by_cost function.
 /**
  * \brief Sort modes and costs to ascending order according to costs.
  */
@@ -567,6 +704,23 @@ void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict traf
   }
 }
 
+/**
+ * \brief Sort keys (indices) to ascending order according to costs.
+ */
+void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map)
+{
+  // Size of sorted arrays is expected to be "small". No need for a faster algorithm.
+  for (uint8_t i = 1; i < map->size; ++i) {
+    const int8_t cur_indx = map->keys[i];
+    const double cur_cost = map->cost[cur_indx];
+    uint8_t j = i;
+    while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) {
+      map->keys[j] = map->keys[j - 1];
+      --j;
+    }
+    map->keys[j] = cur_indx;
+  }
+}
 
 
 static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth)
@@ -592,10 +746,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   const encoder_control_t* ctrl = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
   int cu_width = LCU_WIDTH >> depth;
-  double cost = MAX_INT;
-  double inter_zero_coeff_cost = MAX_INT;
-  uint32_t inter_bitcost = MAX_INT;
+  double cost = MAX_DOUBLE;
+  double inter_zero_coeff_cost = MAX_DOUBLE;
+  double inter_bitcost = MAX_INT;
   cu_info_t *cur_cu;
+  cabac_data_t pre_search_cabac;
+  memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac));
 
   const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
   const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
@@ -626,7 +782,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
   // Assign correct depth limit
   constraint_t* constr = state->constraint;
- if(constr->ml_intra_depth_ctu) {
+  if(constr->ml_intra_depth_ctu) {
     pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8];
     pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8];
   }
@@ -670,7 +826,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
     if (can_use_inter) {
       double mode_cost;
-      uint32_t mode_bitcost;
+      double mode_bitcost;
       kvz_search_cu_inter(state,
                           x, y,
                           depth,
@@ -721,12 +877,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
     int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max;
     bool can_use_intra =
-        WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
+      (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
         // When the split was forced because the CTU is partially outside
         // the frame, we permit intra coding even if pu_depth_intra would
         // otherwise forbid it.
         (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width ||
-        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height;
+        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) &&
+      !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I);
 
     if (can_use_intra && !skip_intra) {
       int8_t intra_mode;
@@ -737,6 +894,16 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       bool mip_transposed = false;
       kvz_search_cu_intra(state, x, y, depth, lcu,
                           &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed);
+#ifdef COMPLETE_PRED_MODE_BITS
+      // Technically counting these bits would be correct, however counting
+      // them universally degrades quality so this block is disabled by default
+      if(state->frame->slicetype != KVZ_SLICE_I) {
+        double pred_mode_type_bits = 0;
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag");
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag");
+        intra_cost += pred_mode_type_bits * state->lambda;
+      }
+#endif
       if (intra_cost < cost) {
         cost = intra_cost;
         cur_cu->type = CU_INTRA;
@@ -828,9 +995,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
           cur_cu->merged = 0;
           cur_cu->skipped = 1;
           // Selecting skip reduces bits needed to code the CU
-          if (inter_bitcost > 1) {
-            inter_bitcost -= 1;
-          }
+          int skip_ctx = kvz_get_skip_context(x, y, lcu, NULL, NULL);
+          inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1);
+          inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0);
+          inter_bitcost += cur_cu->merge_idx;        
         }
       }
       lcu_fill_inter(lcu, x_local, y_local, cu_width);
@@ -839,20 +1007,26 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   }
 
   if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
-    cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-      cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
+    double bits = 0;
+    cabac_data_t* cabac  = &state->search_cabac;
+    cabac->update = 1;
+
+    if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) {
+      bits += kvz_mock_encode_coding_unit(
+        state,
+        cabac,
+        x, y, depth,
+        lcu,
+        cur_cu);
     }
-
-    double mode_bits;
-    if (cur_cu->type == CU_INTRA) {
-      mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
-    } else {
-      mode_bits = inter_bitcost;
+    else {
+      assert(0);
     }
+    
+    cost = bits * state->lambda;
 
-    cost += mode_bits * state->lambda;
-
+    cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+    
     if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) {
       cost = inter_zero_coeff_cost;
 
@@ -874,13 +1048,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       cur_cu->cbf = 0;
       lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu);
     }
-  }
+    cabac->update = 0;
+  } 
 
   bool can_split_cu =
     // If the CU is partially outside the frame, we need to split it even
     // if pu_depth_intra and pu_depth_inter would not permit it.
     cur_cu->type == CU_NOTSET ||
-    depth < pu_depth_intra.max ||
+    (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != KVZ_SLICE_I)) ||
     (state->frame->slicetype != KVZ_SLICE_I &&
       depth < pu_depth_inter.max);
 
@@ -889,21 +1064,27 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     int half_cu = cu_width / 2;
     double split_cost = 0.0;
     int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+    cabac_data_t post_seach_cabac;
+    memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+    memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac));
+    state->search_cabac.update = 1;
+
+    double split_bits = 0;
 
     if (depth < MAX_DEPTH) {
       // Add cost of cu_split_flag.
       uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-      cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-      split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;
+      cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]);
+      CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, split_bits, "split_search");
     }
 
     if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) {
       // Add cost of intra part_size.
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]);
-      cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;  // 2Nx2N
-      split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;  // NxN
+      cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]);
+      CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search");
     }
+    state->search_cabac.update = 0;
+    split_cost += split_bits * state->lambda;
 
     // If skip mode was selected for the block, skip further search.
     // Skip mode means there's no coefficients in the block, so splitting
@@ -925,13 +1106,29 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // searching.
     
     if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height 
+        && state->encoder_control->cfg.combine_intra_cus)
     {
+
       cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);
 
       // If the best CU in depth+1 is intra and the biggest it can be, try it.
       if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) {
+        cabac_data_t temp_cabac;
+        memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac));
+        memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac));
         cost = 0;
+        double bits = 0;
+        if (depth < MAX_DEPTH) {
+          uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
+          cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]);
+          CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search");
+        }
+        else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) {
+          // Add cost of intra part_size.
+          cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]);
+          CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, bits, "no_split_search");
+        }
 
         cur_cu->intra = cu_d1->intra;
         cur_cu->type = CU_INTRA;
@@ -952,19 +1149,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
                            lcu);
 
-        cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-        if (has_chroma) {
-          cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
-        }
-
-        // Add the cost of coding no-split.
-        uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-        const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-        cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-
-        // Add the cost of coding intra mode only once.
-        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
+        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits;
         cost += mode_bits * state->lambda;
+
+        cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+
+        memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+        memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac));
       }
     }
 
@@ -978,6 +1169,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     } else if (depth > 0) {
       // Copy this CU's mode all the way down for use in adjacent CUs mode
       // search.
+      memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac));
       work_tree_copy_down(x_local, y_local, depth, work_tree);
       downsample_cclm_rec(
         state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
@@ -1167,6 +1359,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i
  */
 void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf, lcu_coeff_t *coeff)
 {
+  memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t));
+  state->search_cabac.only_count = 1;
   assert(x % LCU_WIDTH == 0);
   assert(y % LCU_WIDTH == 0);
 
diff --git a/src/search.h b/src/search.h
index 4eb5943f..db87c298 100644
--- a/src/search.h
+++ b/src/search.h
@@ -44,22 +44,53 @@
 #include "image.h"
 #include "constraint.h"
 
+#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS)
+
+ // Modify weight of luma SSD.
+#ifndef KVZ_LUMA_MULT
+# define KVZ_LUMA_MULT 0.8
+#endif
+// Modify weight of chroma SSD.
+#ifndef KVZ_CHROMA_MULT
+# define KVZ_CHROMA_MULT 1.5
+#endif
+
+/**
+ * \brief Data collected during search processes.
+ *
+ *        The intended use is to collect statistics of the
+ *        searched coding/prediction units. All data of one
+ *        unit is stored at the same index in every array.
+ *        Index the arrays through the elements of "keys",
+ *        which are to be sorted by the RD costs of the units.
+ */
+typedef struct unit_stats_map_t {
+
+  cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units
+  double    cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs
+  double    bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs  
+  int8_t    keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays
+  int       size;                    //!< number of active elements in the lists
+} unit_stats_map_t;
+
 #define NUM_MIP_MODES_FULL(width, height) ((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12)
 #define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL((width), (height)) >> 1
 
 void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
 void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length);
 
+void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map);
+
 void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);
 
 double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu);
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu);
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu);
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu);
 void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
 
 void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
diff --git a/src/search_inter.c b/src/search_inter.c
index 7c8bc0bb..73e15f95 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -37,6 +37,7 @@
 
 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "image.h"
 #include "imagelist.h"
 #include "inter.h"
@@ -68,7 +69,7 @@ typedef struct {
   /**
    * \brief Top-left corner of the PU
    */
-  const vector2d_t origin;
+  vector2d_t origin;
   int32_t width;
   int32_t height;
 
@@ -78,19 +79,6 @@ typedef struct {
 
   kvz_mvd_cost_func *mvd_cost_func;
 
-  /**
-   * \brief Best motion vector among the ones tested so far
-   */
-  vector2d_t best_mv;
-  /**
-   * \brief Cost of best_mv
-   */
-  uint32_t best_cost;
-  /**
-   * \brief Bit cost of best_mv
-   */
-  uint32_t best_bitcost;
-
   /**
    * \brief Possible optimized SAD implementation for the width, leave as
    *        NULL for arbitrary-width blocks
@@ -205,20 +193,25 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int
 /**
  * \brief Calculate cost for an integer motion vector.
  *
- * Updates info->best_mv, info->best_cost and info->best_bitcost to the new
+ * Updates best_mv, best_cost and best_bitcost to the new
  * motion vector if it yields a lower cost than the current one.
  *
  * If the motion vector violates the MV constraints for tiles or WPP, the
  * cost is not set.
  *
- * \return true if info->best_mv was changed, false otherwise
+ * \return true if best_mv was changed, false otherwise
  */
-static bool check_mv_cost(inter_search_info_t *info, int x, int y)
+static bool check_mv_cost(inter_search_info_t *info,
+                          int x,
+                          int y,
+                          double *best_cost,
+                          double* best_bits,
+                          vector2d_t *best_mv)
 {
   if (!intmv_within_tile(info, x, y)) return false;
 
-  uint32_t bitcost = 0;
-  uint32_t cost = kvz_image_calc_sad(
+  double bitcost = 0;
+  double cost = kvz_image_calc_sad(
       info->pic,
       info->ref,
       info->origin.x,
@@ -230,25 +223,25 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y)
       info->optimized_sad
   );
 
-  if (cost >= info->best_cost) return false;
+  if (cost >= *best_cost) return false;
 
   cost += info->mvd_cost_func(
       info->state,
       x, y, INTERNAL_MV_PREC,
       info->mv_cand,
-      info->merge_cand,
-      info->num_merge_cand,
+      NULL,
+      0,
       info->ref_idx,
       &bitcost
   );
 
-  if (cost >= info->best_cost) return false;
+  if (cost >= *best_cost) return false;
 
   // Set to motion vector in internal pixel precision.
-  info->best_mv.x = x * (1 << INTERNAL_MV_PREC);
-  info->best_mv.y = y * (1 << INTERNAL_MV_PREC);
-  info->best_cost = cost;
-  info->best_bitcost = bitcost;
+  best_mv->x = x * (1 << INTERNAL_MV_PREC);
+  best_mv->y = y * (1 << INTERNAL_MV_PREC);
+  *best_cost = cost;
+  *best_bits = bitcost;
 
   return true;
 }
@@ -256,10 +249,10 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y)
 
 static unsigned get_ep_ex_golomb_bitcost(unsigned symbol)
 {
-  // Calculate 2 * log2(symbol + 2)
+  // Calculate 2 * log2(symbol)
 
   unsigned bins = 0;
-  symbol += 2;
+  symbol += 0;
   if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; }
   if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; }
   if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; }
@@ -299,12 +292,16 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv)
  * \brief Select starting point for integer motion estimation search.
  *
  * Checks the zero vector, extra_mv and merge candidates and updates
- * info->best_mv to the best one.
+ * best_mv to the best one.
  */
-static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv)
+static void select_starting_point(inter_search_info_t *info,
+                                  vector2d_t extra_mv,
+                                  double *best_cost,
+                                  double* best_bits,
+                                  vector2d_t *best_mv)
 {
   // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
-  check_mv_cost(info, 0, 0);
+  check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv);
 
   // Change to integer precision.
   extra_mv.x >>= INTERNAL_MV_PREC;
@@ -312,7 +309,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv
 
   // Check mv_in if it's not one of the merge candidates.
   if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) {
-    check_mv_cost(info, extra_mv.x, extra_mv.y);
+    check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv);
   }
 
   // Go through candidates
@@ -324,17 +321,17 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv
 
     if (x == 0 && y == 0) continue;
 
-    check_mv_cost(info, x, y);
+    check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
   }
 }
 
 
-static uint32_t get_mvd_coding_cost(const encoder_state_t *state,
-                                    const cabac_data_t* cabac,
-                                    const int32_t mvd_hor,
-                                    const int32_t mvd_ver)
+static double get_mvd_coding_cost(const encoder_state_t* state,
+  const cabac_data_t* cabac,
+  const int32_t mvd_hor,
+  const int32_t mvd_ver)
 {
-  unsigned bitcost = 0;
+  double bitcost = 0;
 
   const int8_t hor_abs_gr0 = mvd_hor != 0;
   const int8_t ver_abs_gr0 = mvd_ver != 0;
@@ -366,7 +363,7 @@ static uint32_t get_mvd_coding_cost(const encoder_state_t *state,
 
 
   // Round and shift back to integer bits.
-  return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS;
+  return bitcost / (1 << CTX_FRAC_BITS);
 }
 
 
@@ -374,7 +371,7 @@ static int select_mv_cand(const encoder_state_t *state,
                           mv_t mv_cand[2][2],
                           int32_t mv_x,
                           int32_t mv_y,
-                          uint32_t *cost_out)
+                          double*cost_out)
 {
   const bool same_cand =
     (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]);
@@ -384,7 +381,7 @@ static int select_mv_cand(const encoder_state_t *state,
     return 0;
   }
 
-  uint32_t (*mvd_coding_cost)(const encoder_state_t * const state,
+  double (*mvd_coding_cost)(const encoder_state_t * const state,
                               const cabac_data_t*,
                               int32_t, int32_t);
   if (state->encoder_control->cfg.mv_rdo) {
@@ -397,12 +394,12 @@ static int select_mv_cand(const encoder_state_t *state,
 
   kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd);
 
-  uint32_t cand1_cost = mvd_coding_cost(
+  double cand1_cost = mvd_coding_cost(
       state, &state->cabac,
       mvd.x,
       mvd.y);
 
-  uint32_t cand2_cost;
+  double cand2_cost;
   if (same_cand) {
     cand2_cost = cand1_cost;
   } else {
@@ -423,17 +420,17 @@ static int select_mv_cand(const encoder_state_t *state,
 }
 
 
-static uint32_t calc_mvd_cost(const encoder_state_t *state,
-                              int x,
-                              int y,
-                              int mv_shift,
-                              mv_t mv_cand[2][2],
-                              inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                              int16_t num_cand,
-                              int32_t ref_idx,
-                              uint32_t *bitcost)
+static double calc_mvd_cost(const encoder_state_t *state,
+                            int x,
+                            int y,
+                            int mv_shift,
+                            mv_t mv_cand[2][2],
+                            inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                            int16_t num_cand,
+                            int32_t ref_idx,
+                            double* bitcost)
 {
-  uint32_t temp_bitcost = 0;
+  double temp_bitcost = 0;
   uint32_t merge_idx;
   int8_t merged      = 0;
 
@@ -456,23 +453,26 @@ static uint32_t calc_mvd_cost(const encoder_state_t *state,
 
   // Check mvd cost only if mv is not merged
   if (!merged) {
-    uint32_t mvd_cost = 0;
+    double mvd_cost = 0;
     select_mv_cand(state, mv_cand, x, y, &mvd_cost);
     temp_bitcost += mvd_cost;
   }
   *bitcost = temp_bitcost;
-  return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5);
+  return temp_bitcost * state->lambda_sqrt;
 }
 
 
-static bool early_terminate(inter_search_info_t *info)
+static bool early_terminate(inter_search_info_t *info,
+                            double *best_cost,
+                            double* best_bits,
+                            vector2d_t *best_mv)
 {
   static const vector2d_t small_hexbs[7] = {
       { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 },
       { 0, -1 }, { -1, 0 }, { 0, 0 },
   };
 
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // best_mv is in internal MV precision
 
   int first_index = 0;
   int last_index = 3;
@@ -482,9 +482,9 @@ static bool early_terminate(inter_search_info_t *info)
     if (info->state->encoder_control->cfg.me_early_termination ==
         KVZ_ME_EARLY_TERMINATION_SENSITIVE)
     {
-      threshold = info->best_cost * 0.95;
+      threshold = *best_cost * 0.95;
     } else {
-      threshold = info->best_cost;
+      threshold = *best_cost;
     }
 
     int best_index = 6;
@@ -492,7 +492,7 @@ static bool early_terminate(inter_search_info_t *info)
       int x = mv.x + small_hexbs[i].x;
       int y = mv.y + small_hexbs[i].y;
 
-      if (check_mv_cost(info, x, y)) {
+      if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) {
         best_index = i;
       }
     }
@@ -502,7 +502,7 @@ static bool early_terminate(inter_search_info_t *info)
     mv.y += small_hexbs[best_index].y;
 
     // If best match is not better than threshold, we stop the search.
-    if (info->best_cost >= threshold) {
+    if (*best_cost >= threshold) {
       return true;
     }
 
@@ -517,7 +517,10 @@ void kvz_tz_pattern_search(inter_search_info_t *info,
                            unsigned pattern_type,
                            const int iDist,
                            vector2d_t mv,
-                           int *best_dist)
+                           int *best_dist,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   assert(pattern_type < 4);
 
@@ -619,7 +622,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info,
     int x = mv.x + offset.x;
     int y = mv.y + offset.y;
 
-    if (check_mv_cost(info, x, y)) {
+    if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) {
       best_index = i;
     }
   }
@@ -632,20 +635,27 @@ void kvz_tz_pattern_search(inter_search_info_t *info,
 
 void kvz_tz_raster_search(inter_search_info_t *info,
                           int iSearchRange,
-                          int iRaster)
+                          int iRaster,
+                          double *best_cost,
+                          double* best_bits,
+                          vector2d_t *best_mv)
 {
-  const vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  const vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   //compute SAD values for every point in the iRaster downsampled version of the current search area
   for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) {
     for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) {
-      check_mv_cost(info, mv.x + x, mv.y + y);
+      check_mv_cost(info, mv.x + x, mv.y + y, best_cost, best_bits, best_mv);
     }
   }
 }
 
 
-static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
+static void tz_search(inter_search_info_t *info,
+                      vector2d_t extra_mv,
+                      double *best_cost,
+                      double* best_bits,
+                      vector2d_t *best_mv)
 {
   //TZ parameters
   const int iSearchRange = 96;  // search range for each stage
@@ -657,25 +667,13 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
   const bool use_star_refinement = true;   // enable step 4 mode 2 (only one mode will be executed)
 
   int best_dist = 0;
-  info->best_cost = UINT32_MAX;
-
-  // Select starting point from among merge candidates. These should
-  // include both mv_cand vectors and (0, 0).
-  select_starting_point(info, extra_mv);
-
-  // Check if we should stop search
-  if (info->state->encoder_control->cfg.me_early_termination &&
-      early_terminate(info))
-  {
-    return;
-  }
-
-  vector2d_t start = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  // best_mv is stored in internal MV precision; start from its integer position.
+  vector2d_t start = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   // step 2, grid search
   int rounds_without_improvement = 0;
   for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) {
-    kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist);
+    kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
 
     // Break the loop if the last three rounds didn't produce a better MV.
     if (best_dist != iDist) rounds_without_improvement++;
@@ -688,7 +686,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
     start.y = 0;
     rounds_without_improvement = 0;
     for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) {
-      kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist);
+      kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
 
       if (best_dist != iDist) rounds_without_improvement++;
       if (rounds_without_improvement >= 3) break;
@@ -698,7 +696,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
   //step 3, raster scan
   if (use_raster_scan && best_dist > iRaster) {
     best_dist = iRaster;
-    kvz_tz_raster_search(info, iSearchRange, iRaster);
+    kvz_tz_raster_search(info, iSearchRange, iRaster, best_cost, best_bits, best_mv);
   }
 
   //step 4
@@ -706,19 +704,19 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
   //raster refinement
   if (use_raster_refinement && best_dist > 0) {
     for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) {
-      start.x = info->best_mv.x >> INTERNAL_MV_PREC;
-      start.y = info->best_mv.y >> INTERNAL_MV_PREC;
-      kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist);
+      start.x = best_mv->x >> INTERNAL_MV_PREC;
+      start.y = best_mv->y >> INTERNAL_MV_PREC;
+      kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
     }
   }
 
   //star refinement (repeat step 2 for the current starting point)
   while (use_star_refinement && best_dist > 0) {
     best_dist = 0;
-    start.x = info->best_mv.x >> INTERNAL_MV_PREC;
-    start.y = info->best_mv.y >> INTERNAL_MV_PREC;
+    start.x = best_mv->x >> INTERNAL_MV_PREC;
+    start.y = best_mv->y >> INTERNAL_MV_PREC;
     for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) {
-      kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist);
+      kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
     }
   }
 }
@@ -740,7 +738,12 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
  * the predicted motion vector is way off. In the future even more additional
  * points like 0,0 might be used, such as vectors from top or left.
  */
-static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps)
+static void hexagon_search(inter_search_info_t *info,
+                           vector2d_t extra_mv,
+                           uint32_t steps,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   // The start of the hexagonal pattern has been repeated at the end so that
   // the indices between 1-6 can be used as the start of a 3-point list of new
@@ -765,27 +768,14 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
       { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 }
   };
 
-  info->best_cost = UINT32_MAX;
-
-  // Select starting point from among merge candidates. These should
-  // include both mv_cand vectors and (0, 0).
-  select_starting_point(info, extra_mv);
-
-  // Check if we should stop search
-  if (info->state->encoder_control->cfg.me_early_termination &&
-      early_terminate(info))
-  {
-    return;
-  }
-
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC }; // best_mv is in internal MV precision
 
   // Current best index, either to merge_cands, large_hexbs or small_hexbs.
   int best_index = 0;
 
   // Search the initial 7 points of the hexagon.
   for (int i = 1; i < 7; ++i) {
-    if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) {
+    if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) {
       best_index = i;
     }
   }
@@ -814,7 +804,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
     // Iterate through the next 3 points.
     for (int i = 0; i < 3; ++i) {
       vector2d_t offset = large_hexbs[start + i];
-      if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) {
+      if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) {
         best_index = start + i;
       }
     }
@@ -826,7 +816,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
 
   // Do the final step of the search with a small pattern.
   for (int i = 1; i < 9; ++i) {
-    check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y);
+    check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv);
   }
 }
 
@@ -846,7 +836,12 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
 * the predicted motion vector is way off. In the future even more additional
 * points like 0,0 might be used, such as vectors from top or left.
 **/
-static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) 
+static void diamond_search(inter_search_info_t *info,
+                           vector2d_t extra_mv,
+                           uint32_t steps,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   enum diapos {
     DIA_UP = 0,
@@ -864,29 +859,16 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
     {0, -1}, {1, 0}, {0, 1}, {-1, 0},
     {0, 0}
   };
-
-  info->best_cost = UINT32_MAX;
-
-  // Select starting point from among merge candidates. These should
-  // include both mv_cand vectors and (0, 0).
-  select_starting_point(info, extra_mv);
-
-  // Check if we should stop search
-  if (info->state->encoder_control->cfg.me_early_termination &&
-    early_terminate(info))
-  {
-    return;
-  }
   
   // current motion vector
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   // current best index
   enum diapos best_index = DIA_CENTER;
 
   // initial search of the points of the diamond
   for (int i = 0; i < 5; ++i) {
-    if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) {
+    if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) {
       best_index = i;
     }
   }
@@ -916,7 +898,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
       // this is where we came from so it's checked already
       if (i == from_dir) continue;
 
-      if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) {
+      if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) {
         best_index = i;
         better_found = 1;
       }
@@ -938,12 +920,15 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
 
 static void search_mv_full(inter_search_info_t *info,
                            int32_t search_range,
-                           vector2d_t extra_mv)
+                           vector2d_t extra_mv,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   // Search around the 0-vector.
   for (int y = -search_range; y <= search_range; y++) {
     for (int x = -search_range; x <= search_range; x++) {
-      check_mv_cost(info, x, y);
+      check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
     }
   }
 
@@ -955,7 +940,7 @@ static void search_mv_full(inter_search_info_t *info,
   if (!mv_in_merge(info, extra_mv)) {
     for (int y = -search_range; y <= search_range; y++) {
       for (int x = -search_range; x <= search_range; x++) {
-        check_mv_cost(info, extra_mv.x + x, extra_mv.y + y);
+        check_mv_cost(info, extra_mv.x + x, extra_mv.y + y, best_cost, best_bits, best_mv);
       }
     }
   }
@@ -1002,7 +987,7 @@ static void search_mv_full(inter_search_info_t *info,
         }
         if (already_tested) continue;
 
-        check_mv_cost(info, x, y);
+        check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
       }
     }
   }
@@ -1015,7 +1000,10 @@ static void search_mv_full(inter_search_info_t *info,
  * Algoritm first searches 1/2-pel positions around integer mv and after best match is found,
  * refines the search by searching best 1/4-pel postion around best 1/2-pel position.
  */
-static void search_frac(inter_search_info_t *info)
+static void search_frac(inter_search_info_t *info,
+                        double *best_cost,
+                        double *best_bits,
+                        vector2d_t *best_mv)
 {
   // Map indexes to relative coordinates in the following way:
   // 5 3 6
@@ -1028,13 +1016,14 @@ static void search_frac(inter_search_info_t *info)
   };
 
   // Set mv to pixel precision
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0;
-  uint32_t bitcosts[4] = { 0 };
+  double cost = MAX_DOUBLE;
+  double bitcost = 0;
+  double bitcosts[4] = { 0 };
   unsigned best_index = 0;
 
+  // Keep this as unsigned until the SAD / SATD functions are updated.
   unsigned costs[4] = { 0 };
 
   ALIGNED(64) kvz_pixel filtered[4][LCU_LUMA_SIZE];
@@ -1100,12 +1089,12 @@ static void search_frac(inter_search_info_t *info)
   costs[0] += info->mvd_cost_func(state,
                                   mv.x, mv.y, INTERNAL_MV_PREC,
                                   info->mv_cand,
-                                  info->merge_cand,
-                                  info->num_merge_cand,
+                                  NULL,
+                                  0,
                                   info->ref_idx,
                                   &bitcosts[0]);
-  best_cost = costs[0];
-  best_bitcost = bitcosts[0];
+  cost = costs[0];
+  bitcost = bitcosts[0];
   
   //Set mv to half-pixel precision
   mv.x *= 2;
@@ -1160,8 +1149,8 @@ static void search_frac(inter_search_info_t *info)
             mv.y + pattern[j]->y,
             mv_shift,
             info->mv_cand,
-            info->merge_cand,
-            info->num_merge_cand,
+            NULL,
+            0,
             info->ref_idx,
             &bitcosts[j]
         );
@@ -1169,9 +1158,9 @@ static void search_frac(inter_search_info_t *info)
     }
 
     for (int j = 0; j < 4; ++j) {
-      if (within_tile[j] && costs[j] < best_cost) {
-        best_cost = costs[j];
-        best_bitcost = bitcosts[j];
+      if (within_tile[j] && costs[j] < cost) {
+        cost = costs[j];
+        bitcost = bitcosts[j];
         best_index = i + j;
       }
     }
@@ -1201,9 +1190,38 @@ static void search_frac(inter_search_info_t *info)
   mv.x *= 1 << (INTERNAL_MV_PREC - 2);
   mv.y *= 1 << (INTERNAL_MV_PREC - 2);
 
-  info->best_mv = mv;
-  info->best_cost = best_cost;
-  info->best_bitcost = best_bitcost;
+  *best_mv = mv;
+  *best_cost = cost;
+  *best_bits = bitcost;
+}
+
+/**
+ * \brief Get skip flag context (number of skipped left/top neighbours, 0..2).
+ *        Sets *predmode_ctx (if non-NULL) to 1 when either neighbour is intra.
+ */
+int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx) {
+  assert(!(lcu && cu_a));  // lcu and cu_a must not both be given
+  const cu_info_t *left = NULL;
+  const cu_info_t *above = NULL;
+
+  if (lcu != NULL) {
+    // Look the neighbours up in the LCU; none is fetched at coordinate 0.
+    const int lx = SUB_SCU(x);
+    const int ly = SUB_SCU(y);
+    left  = (x != 0) ? LCU_GET_CU_AT_PX(lcu, lx - 1, ly) : NULL;
+    above = (y != 0) ? LCU_GET_CU_AT_PX(lcu, lx, ly - 1) : NULL;
+  } else {
+    // Look the neighbours up in cu_a; only for positive coordinates.
+    left  = (x > 0) ? kvz_cu_array_at_const(cu_a, x - 1, y) : NULL;
+    above = (y > 0) ? kvz_cu_array_at_const(cu_a, x, y - 1) : NULL;
+  }
+
+  const int left_skipped  = (left  != NULL && left->skipped)  ? 1 : 0;
+  const int above_skipped = (above != NULL && above->skipped) ? 1 : 0;
+  if (predmode_ctx != NULL) {
+    *predmode_ctx = (left && left->type == CU_INTRA) || (above && above->type == CU_INTRA);
+  }
+  return left_skipped + above_skipped;
 }
 
 /**
@@ -1251,46 +1269,37 @@ static void apply_mv_scaling(int32_t current_poc,
  */
 static void search_pu_inter_ref(inter_search_info_t *info,
   int depth,
-  lcu_t *lcu, cu_info_t *cur_cu,
-  double *inter_cost,
-  uint32_t *inter_bitcost,
-  double *best_LX_cost,
-  cu_info_t *unipred_LX)
+  lcu_t *lcu,
+  cu_info_t *cur_cu,
+  unit_stats_map_t *amvp)
 {
   const kvz_config *cfg = &info->state->encoder_control->cfg;
 
-  // which list, L0 or L1, ref_idx is in and in what index
-  int8_t ref_list = -1;
-  // the index of the ref_idx in L0 or L1 list
-  int8_t LX_idx;
-  // max value of LX_idx plus one
-  const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0],
-    info->state->frame->ref_LX_size[1]);
+  // Reference picture might be in both lists
+  bool ref_list_active[2] = { false, false };
+  // Reference picture indices in L0 and L1 lists
+  int8_t ref_list_idx[2] = { -1, -1 };
 
-  for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++)
-  {
-    // check if ref_idx is in L0
-    if (LX_idx < info->state->frame->ref_LX_size[0] &&
-      info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) {
-      ref_list = 0;
-      break;
-    }
-
-    // check if ref_idx is in L1
-    if (LX_idx < info->state->frame->ref_LX_size[1] &&
-      info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) {
-      ref_list = 1;
-      break;
+  // Check if ref picture is present in the lists
+  for (int ref_list = 0; ref_list < 2; ++ref_list) {
+    for (int i = 0; i < info->state->frame->ref_LX_size[ref_list]; ++i) {
+      if (info->state->frame->ref_LX[ref_list][i] == info->ref_idx) {
+        ref_list_active[ref_list] = true;
+        ref_list_idx[ref_list] = i;
+        break;
+      }
     }
   }
-  // ref_idx has to be found in either L0 or L1
-  assert(LX_idx < LX_IDX_MAX_PLUS_1);
 
-  // store temp values to be stored back later
-  int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
+  // Must find at least one reference picture
+  assert(ref_list_active[0] || ref_list_active[1]);
+
+  // Does not matter which list is used, if in both.
+  int ref_list = ref_list_active[0] ? 0 : 1;
+  int LX_idx = ref_list_idx[ref_list];
 
   // Get MV candidates
-  cur_cu->inter.mv_ref[ref_list] = LX_idx;
+  cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list];
 
   kvz_inter_get_mv_cand(info->state,
     info->origin.x,
@@ -1302,10 +1311,7 @@ static void search_pu_inter_ref(inter_search_info_t *info,
     lcu,
     ref_list);
 
-  // store old values back
-  cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
-
-  vector2d_t mv = { 0, 0 };
+  vector2d_t best_mv = { 0, 0 };
 
   // Take starting point for MV search from previous frame.
   // When temporal motion vector candidates are added, there is probably
@@ -1319,8 +1325,7 @@ static void search_pu_inter_ref(inter_search_info_t *info,
     if (ref_cu->inter.mv_dir & 1) {
       mv_previous.x = ref_cu->inter.mv[0][0];
       mv_previous.y = ref_cu->inter.mv[0][1];
-    }
-    else {
+    } else {
       mv_previous.x = ref_cu->inter.mv[1][0];
       mv_previous.y = ref_cu->inter.mv[1][1];
     }
@@ -1353,16 +1358,16 @@ static void search_pu_inter_ref(inter_search_info_t *info,
         info->state->frame->ref->pocs[neighbor_poc_index],
         info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[
           info->state->frame->ref->ref_LXs[neighbor_poc_index]
-          [col_list]
+            [col_list]
           [ref_cu->inter.mv_ref[col_list]]
         ],
         &mv_previous
-      );
+          );
     }
 
     // Check if the mv is valid after scaling
     if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) {
-      mv = mv_previous;
+      best_mv = mv_previous;
     }
   }
 
@@ -1375,102 +1380,90 @@ static void search_pu_inter_ref(inter_search_info_t *info,
     default: break;
   }
 
-  info->best_cost = UINT32_MAX;
+  double best_cost = MAX_DOUBLE;
+  double best_bits = MAX_INT;
 
-  switch (cfg->ime_algorithm) {
-    case KVZ_IME_TZ:
-      tz_search(info, mv);
-      break;
+  // Select starting point from among merge candidates. These should
+  // include both mv_cand vectors and (0, 0).
+  select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv);
+  bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv);
+      
+  if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) {
 
-    case KVZ_IME_FULL64:
-    case KVZ_IME_FULL32:
-    case KVZ_IME_FULL16:
-    case KVZ_IME_FULL8:
-    case KVZ_IME_FULL:
-      search_mv_full(info, search_range, mv);
-      break;
+    switch (cfg->ime_algorithm) {
+      case KVZ_IME_TZ:
+        tz_search(info, best_mv, &best_cost, &best_bits, &best_mv);
+        break;
 
-    case KVZ_IME_DIA:
-      diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps);
-      break;
+      case KVZ_IME_FULL64:
+      case KVZ_IME_FULL32:
+      case KVZ_IME_FULL16:
+      case KVZ_IME_FULL8:
+      case KVZ_IME_FULL:
+        search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv);
+        break;
 
-    default:
-      hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps);
-      break;
-  }
+      case KVZ_IME_DIA:
+        diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps,
+                       &best_cost, &best_bits, &best_mv);
+        break;
 
-  if (cfg->fme_level > 0 && info->best_cost < *inter_cost) {
-    search_frac(info);
-
-  } else if (info->best_cost < UINT32_MAX) {
-    // Recalculate inter cost with SATD.
-    info->best_cost = kvz_image_calc_satd(
-        info->state->tile->frame->source,
-        info->ref,
-        info->origin.x,
-        info->origin.y,
-        info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> INTERNAL_MV_PREC),
-        info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> INTERNAL_MV_PREC),
-        info->width,
-        info->height);
-    info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5);
-  }
-
-  mv = info->best_mv;
-
-  int merged = 0;
-  int merge_idx = 0;
-  // Check every candidate to find a match
-  for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) {
-    if (info->merge_cand[merge_idx].dir != 3 &&
-        info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][0] == mv.x &&
-        info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y &&
-        (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][
-        info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx)
-    {
-      merged = 1;
-      break;
+      default:
+        hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps,
+                       &best_cost, &best_bits, &best_mv);
+        break;
     }
   }
 
-  // Only check when candidates are different
-  int cu_mv_cand = 0;
-  if (!merged) {
-    cu_mv_cand =
-      select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL);
+  if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) {
+    // Recalculate inter cost with SATD.
+    best_cost = kvz_image_calc_satd(
+      info->state->tile->frame->source,
+      info->ref,
+      info->origin.x,
+      info->origin.y,
+      info->state->tile->offset_x + info->origin.x + (best_mv.x >> INTERNAL_MV_PREC),
+      info->state->tile->offset_y + info->origin.y + (best_mv.y >> INTERNAL_MV_PREC),
+      info->width,
+      info->height);
+    best_cost += best_bits * info->state->lambda_sqrt;
   }
 
-  if (info->best_cost < *inter_cost) {
-    // Map reference index to L0/L1 pictures
-    cur_cu->inter.mv_dir = ref_list+1;
+  double LX_cost[2] = { best_cost, best_cost };
+  double LX_bits[2] = { best_bits, best_bits };
+
+  // Compute costs and add entries for both lists, if necessary
+  for (; ref_list < 2 && ref_list_active[ref_list]; ++ref_list) {
+
+    LX_idx = ref_list_idx[ref_list];
     uint8_t mv_ref_coded = LX_idx;
+    int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL);
+    const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing
+    LX_cost[ref_list] += extra_bits * info->state->lambda_sqrt;
+    LX_bits[ref_list] += extra_bits;
 
-    cur_cu->merged                  = merged;
-    cur_cu->merge_idx               = merge_idx;
-    cur_cu->inter.mv_ref[ref_list]  = LX_idx;
-    cur_cu->inter.mv[ref_list][0]   = (mv_t)mv.x;
-    cur_cu->inter.mv[ref_list][1]   = (mv_t)mv.y;
+    // Update best unipreds for biprediction
+    bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y);
+    if (valid_mv && best_cost < MAX_DOUBLE) {
 
-    CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand);
-
-    *inter_cost = info->best_cost;
-    *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded;
-  }
-
-
-  // Update best unipreds for biprediction
-  if (info->best_cost < best_LX_cost[ref_list]) {
-    bool valid_mv = fracmv_within_tile(info, mv.x, mv.y);
-    if (valid_mv) {
       // Map reference index to L0/L1 pictures
-      unipred_LX[ref_list].inter.mv_dir = ref_list + 1;
-      unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx;
-      unipred_LX[ref_list].inter.mv[ref_list][0] = (mv_t)mv.x;
-      unipred_LX[ref_list].inter.mv[ref_list][1] = (mv_t)mv.y;
+      unit_stats_map_t *cur_map = &amvp[ref_list];
+      int entry = cur_map->size;
+      cu_info_t *unipred_pu = &cur_map->unit[entry];
+      *unipred_pu = *cur_cu;
+      unipred_pu->type = CU_INTER;
+      unipred_pu->merged  = false;
+      unipred_pu->skipped = false;
+      unipred_pu->inter.mv_dir = ref_list + 1;
+      unipred_pu->inter.mv_ref[ref_list] = LX_idx;
+      unipred_pu->inter.mv[ref_list][0] = (mv_t)best_mv.x;
+      unipred_pu->inter.mv[ref_list][1] = (mv_t)best_mv.y;
+      CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand);
 
-      CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand);
-
-      best_LX_cost[ref_list] = info->best_cost;
+      cur_map->cost[entry] = best_cost;
+      cur_map->bits[entry] = best_bits;
+      cur_map->keys[entry] = entry;
+      cur_map->size++;
     }
   }
 }
@@ -1481,9 +1474,8 @@ static void search_pu_inter_ref(inter_search_info_t *info,
  */
 static void search_pu_inter_bipred(inter_search_info_t *info,
                                    int depth,
-                                   lcu_t *lcu, cu_info_t *cur_cu,
-                                   double *inter_cost,
-                                   uint32_t *inter_bitcost)
+                                   lcu_t *lcu,
+                                   unit_stats_map_t *amvp_bipred)
 {
   const image_list_t *const ref = info->state->frame->ref;
   uint8_t (*ref_LX)[16] = info->state->frame->ref_LX;
@@ -1515,11 +1507,26 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
       continue;
     }
 
-    mv_t mv[2][2];
+    cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size];
+    *bipred_pu = *LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+
+    bipred_pu->inter.mv_dir = 3;
+
+    bipred_pu->inter.mv_ref[0] = merge_cand[i].ref[0];
+    bipred_pu->inter.mv_ref[1] = merge_cand[j].ref[1];
+
+    int16_t(*mv)[2] = bipred_pu->inter.mv;
     mv[0][0] = merge_cand[i].mv[0][0];
     mv[0][1] = merge_cand[i].mv[0][1];
     mv[1][0] = merge_cand[j].mv[1][0];
     mv[1][1] = merge_cand[j].mv[1][1];
+    
+    bipred_pu->merged  = false;
+    bipred_pu->skipped = false;
+
+    for (int reflist = 0; reflist < 2; reflist++) {
+      kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
+    }
 
     // Don't try merge candidates that don't satisfy mv constraints.
     if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) ||
@@ -1541,10 +1548,10 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
 
     const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
     const kvz_pixel *src = &frame->source->y[x + y * frame->source->stride];
-    uint32_t cost =
+    double cost =
       kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->stride);
 
-    uint32_t bitcost[2] = { 0, 0 };
+    double bitcost[2] = { 0, 0 };
 
     cost += info->mvd_cost_func(info->state,
                                merge_cand[i].mv[0][0],
@@ -1566,51 +1573,25 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
       merge_cand[j].ref[1]
     };
     const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */;
-    cost += info->state->lambda_sqrt * extra_bits + 0.5;
+    cost += info->state->lambda_sqrt * extra_bits;
 
-    if (cost < *inter_cost) {
-      cur_cu->inter.mv_dir = 3;
-
-      cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
-      cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
-
-      cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0];
-      cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1];
-      cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0];
-      cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1];
-      cur_cu->merged = 0;
-
-      // Check every candidate to find a match
-      for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) {
-        if (merge_cand[merge_idx].dir != 3) continue;
-        if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
-            merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
-            merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
-            merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
-            merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
-            merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1])
-        {
-          cur_cu->merged = 1;
-          cur_cu->merge_idx = merge_idx;
-          break;
-        }
-      }
-
-      // Each motion vector has its own candidate
-      for (int reflist = 0; reflist < 2; reflist++) {
-        kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist);
-        int cu_mv_cand = select_mv_cand(
-            info->state,
-            info->mv_cand,
-            cur_cu->inter.mv[reflist][0],
-            cur_cu->inter.mv[reflist][1],
-            NULL);
-        CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand);
-      }
-
-      *inter_cost = cost;
-      *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits;
+    // Each motion vector has its own candidate
+    for (int reflist = 0; reflist < 2; reflist++) {
+      int cu_mv_cand = select_mv_cand(
+        info->state,
+        info->mv_cand,
+        bipred_pu->inter.mv[reflist][0],
+        bipred_pu->inter.mv[reflist][1],
+        NULL);
+      CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand);
     }
+
+    bipred_pu->type = CU_INTER;
+
+    amvp_bipred->cost[amvp_bipred->size] = cost;
+    amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits;
+    amvp_bipred->keys[amvp_bipred->size] = amvp_bipred->size;
+    amvp_bipred->size++;
   }
 }
 
@@ -1624,14 +1605,14 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
  *
  * \return                Does an identical candidate exist in list
  */
-static bool merge_candidate_in_list(inter_merge_cand_t * all_cands,
-                                    inter_merge_cand_t * cand_to_add,
-                                    int8_t * added_idx_list,
-                                    int list_size)
+static bool merge_candidate_in_list(inter_merge_cand_t *all_cands,
+                                    inter_merge_cand_t *cand_to_add,
+                                    unit_stats_map_t *merge)
 {
   bool found = false;
-  for (int i = 0; i < list_size && !found; ++i) {
-    inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]];
+  for (int i = 0; i < merge->size && !found; ++i) {
+    int key = merge->keys[i];
+    inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx];
 
     found = cand_to_add->dir == list_cand->dir &&
         cand_to_add->ref[0] == list_cand->ref[0] &&
@@ -1646,7 +1627,7 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands,
 }
 
 /**
- * \brief Update PU to have best modes at this depth.
+ * \brief Collect PU parameters and costs at this depth.
  *
  * \param state       encoder state
  * \param x_cu        x-coordinate of the containing CU
@@ -1656,28 +1637,26 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands,
  * \param i_pu        index of the PU in the CU
  * \param lcu         containing LCU
  *
- * \param inter_cost    Return inter cost of the best mode
- * \param inter_bitcost Return inter bitcost of the best mode
+ * \param amvp        Return searched AMVP PUs sorted by costs
+ * \param merge       Return searched Merge PUs sorted by costs
  */
 static void search_pu_inter(encoder_state_t * const state,
-                            int x_cu, int y_cu,
-                            int depth,
-                            part_mode_t part_mode,
-                            int i_pu,
-                            lcu_t *lcu,
-                            double *inter_cost,
-                            uint32_t *inter_bitcost)
+  int x_cu, int y_cu,
+  int depth,
+  part_mode_t part_mode,
+  int i_pu,
+  lcu_t *lcu,
+  unit_stats_map_t *amvp,
+  unit_stats_map_t *merge,
+  inter_search_info_t *info)
 {
-  *inter_cost = MAX_INT;
-  *inter_bitcost = MAX_INT;
-
   const kvz_config *cfg = &state->encoder_control->cfg;
   const videoframe_t * const frame = state->tile->frame;
-  const int width_cu  = LCU_WIDTH >> depth;
-  const int x         = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
-  const int y         = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
-  const int width     = PU_GET_W(part_mode, width_cu, i_pu);
-  const int height    = PU_GET_H(part_mode, width_cu, i_pu);
+  const int width_cu = LCU_WIDTH >> depth;
+  const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
+  const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
+  const int width = PU_GET_W(part_mode, width_cu, i_pu);
+  const int height = PU_GET_H(part_mode, width_cu, i_pu);
 
   // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and
   // nRx2N partitions.
@@ -1686,129 +1665,160 @@ static void search_pu_inter(encoder_state_t * const state,
   // 2NxnD partitions.
   const bool merge_b1 = i_pu == 0 || width <= height;
 
-  const int x_local   = SUB_SCU(x);
-  const int y_local   = SUB_SCU(y);
-  cu_info_t *cur_cu   = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
+  const int x_local = SUB_SCU(x);
+  const int y_local = SUB_SCU(y);
+  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
+  cur_pu->type = CU_NOTSET;
+  cur_pu->part_size = part_mode;
+  cur_pu->depth = depth;
+  cur_pu->qp = state->qp;
 
-  inter_search_info_t info = {
-    .state          = state,
-    .pic            = frame->source,
-    .origin         = { x, y },
-    .width          = width,
-    .height         = height,
-    .mvd_cost_func  = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost,
-    .optimized_sad  = kvz_get_optimized_sad(width),
-  };
+  // Default to candidate 0
+  CU_SET_MV_CAND(cur_pu, 0, 0);
+  CU_SET_MV_CAND(cur_pu, 1, 0);
+
+  FILL(*info, 0);
+
+  info->state          = state;
+  info->pic            = frame->source;
+  info->origin.x       = x;
+  info->origin.y       = y;
+  info->width          = width;
+  info->height         = height;
+  info->mvd_cost_func  = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost;
+  info->optimized_sad  = kvz_get_optimized_sad(width);
 
   // Search for merge mode candidates
-  info.num_merge_cand = kvz_inter_get_merge_cand(
+  info->num_merge_cand = kvz_inter_get_merge_cand(
       state,
       x, y,
       width, height,
       merge_a1, merge_b1,
-      info.merge_cand,
+      info->merge_cand,
       lcu
   );
 
-  // Default to candidate 0
-  CU_SET_MV_CAND(cur_cu, 0, 0);
-  CU_SET_MV_CAND(cur_cu, 1, 0);
-
   // Merge Analysis starts here
-  int8_t mrg_cands[MRG_MAX_NUM_CANDS];
-  double mrg_costs[MRG_MAX_NUM_CANDS];
+  merge->size = 0;
   for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) {
-    mrg_cands[i] = -1;
-    mrg_costs[i] = MAX_DOUBLE;
+    merge->keys[i] = -1;
+    merge->cost[i] = MAX_DOUBLE;
   }
 
-  int num_rdo_cands = 0;
-
+  const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1);
+#ifdef COMPLETE_PRED_MODE_BITS
+  // Counting the skip_flag=0 bits would technically be correct, but doing so
+  // universally degrades quality, so this block is disabled by default.
+  const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL, NULL)], 0);
+#else
+  const double no_skip_flag = 0;
+#endif
   // Check motion vector constraints and perform rough search
-  for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) {    
-    inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx];
+  for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) {
 
-    cur_cu->inter.mv_dir = cur_cand->dir;
-    cur_cu->inter.mv_ref[0] = cur_cand->ref[0];
-    cur_cu->inter.mv_ref[1] = cur_cand->ref[1];
-    cur_cu->inter.mv[0][0] = cur_cand->mv[0][0];
-    cur_cu->inter.mv[0][1] = cur_cand->mv[0][1];
-    cur_cu->inter.mv[1][0] = cur_cand->mv[1][0];
-    cur_cu->inter.mv[1][1] = cur_cand->mv[1][1];
+    inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx];
+    cur_pu->inter.mv_dir = cur_cand->dir;
+    cur_pu->inter.mv_ref[0] = cur_cand->ref[0];
+    cur_pu->inter.mv_ref[1] = cur_cand->ref[1];
+    cur_pu->inter.mv[0][0] = cur_cand->mv[0][0];
+    cur_pu->inter.mv[0][1] = cur_cand->mv[0][1];
+    cur_pu->inter.mv[1][0] = cur_cand->mv[1][0];
+    cur_pu->inter.mv[1][1] = cur_cand->mv[1][1];
 
     // If bipred is not enabled, do not try candidates with mv_dir == 3.
     // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. 
-    if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue;
-    if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue;
+    if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue;
+    if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue;
 
-    bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand,
-      mrg_cands, 
-      num_rdo_cands);
+    bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge);
 
     // Don't try merge candidates that don't satisfy mv constraints.
     // Don't add duplicates to list
-    if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) ||
-        !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) ||
+    bool active_L0 = cur_pu->inter.mv_dir & 1;
+    bool active_L1 = cur_pu->inter.mv_dir & 2;
+    if ((active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) ||
+        (active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])) ||
         is_duplicate)
     {
       continue;
     }
     kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu);
-    mrg_costs[num_rdo_cands] = kvz_satd_any_size(width, height,
-      lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
-      lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
-    
-    // Add cost of coding the merge index
-    mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt;
+    merge->unit[merge->size] = *cur_pu;
+    merge->unit[merge->size].type = CU_INTER;
+    merge->unit[merge->size].merge_idx = merge_idx;
+    merge->unit[merge->size].merged = true;
+    merge->unit[merge->size].skipped = false;
 
-    mrg_cands[num_rdo_cands] = merge_idx;
-    num_rdo_cands++;
+    double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0);
+    if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
+      kvz_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits);
+    }
+    else {
+      merge->cost[merge->size] = kvz_satd_any_size(width, height,
+        lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
+        lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
+      bits += no_skip_flag;
+      merge->cost[merge->size] += bits * info->state->lambda_sqrt;
+    }
+    // Add cost of coding the merge index
+    merge->bits[merge->size] = bits;
+    merge->keys[merge->size] = merge->size;
+
+
+    merge->size++;
   }
 
-  // Sort candidates by cost
-  kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands);
+  assert(merge->size <= MAX_UNIT_STATS_MAP_SIZE);
+  kvz_sort_keys_by_cost(merge);
 
-  // Limit by availability
-  // TODO: Do not limit to just 1
-  num_rdo_cands = MIN(1, num_rdo_cands);
+  // Try early skip decision on just one merge candidate if available
+  int num_rdo_cands = MIN(1, merge->size);
     
   // Early Skip Mode Decision
   bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
-  if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) {
-    for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) {
-
-      // Reconstruct blocks with merge candidate.
-      // Check luma CBF. Then, check chroma CBFs if luma CBF is not set
-      // and chroma exists.
-      // Early terminate if merge candidate with zero CBF is found.
-      int merge_idx = mrg_cands[merge_rdo_idx];
-      inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx];
-
-      cur_cu->inter.mv_dir    = cur_cand->dir;
-      cur_cu->inter.mv_ref[0] = cur_cand->ref[0];
-      cur_cu->inter.mv_ref[1] = cur_cand->ref[1];
-      cur_cu->inter.mv[0][0]  = cur_cand->mv[0][0];
-      cur_cu->inter.mv[0][1]  = cur_cand->mv[0][1];
-      cur_cu->inter.mv[1][0]  = cur_cand->mv[1][0];
-      cur_cu->inter.mv[1][1]  = cur_cand->mv[1][1];
-
-      kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth));
-      kvz_inter_recon_cu(state, lcu, x, y, width, true, false);
-      kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true);
-
-      if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) {
-        continue;
+  if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) {
+    for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) {
+      if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) {
+        merge->size = 1;
+        merge->bits[0] = merge->bits[merge->keys[merge_key]];
+        merge->cost[0] = merge->cost[merge->keys[merge_key]];
+        merge->unit[0] = merge->unit[merge->keys[merge_key]];
+        merge->keys[0] = 0;
       }
-      else if (has_chroma) {
-        kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
-        kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true);
-        if (!cbf_is_set_any(cur_cu->cbf, depth)) {
-          cur_cu->type = CU_INTER;
-          cur_cu->merge_idx = merge_idx;
-          cur_cu->skipped = true;
-          *inter_cost = 0.0;  // TODO: Check this
-          *inter_bitcost = merge_idx; // TODO: Check this
-          return;
+      else if(cfg->rdo < 2) {
+        // Reconstruct blocks with merge candidate.
+        // Check luma CBF. Then, check chroma CBFs if luma CBF is not set
+        // and chroma exists.
+        // Early terminate if merge candidate with zero CBF is found.
+        int merge_idx           = merge->unit[merge->keys[merge_key]].merge_idx;
+        cur_pu->inter.mv_dir    = info->merge_cand[merge_idx].dir;
+        cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0];
+        cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1];
+        cur_pu->inter.mv[0][0]  = info->merge_cand[merge_idx].mv[0][0];
+        cur_pu->inter.mv[0][1]  = info->merge_cand[merge_idx].mv[0][1];
+        cur_pu->inter.mv[1][0]  = info->merge_cand[merge_idx].mv[1][0];
+        cur_pu->inter.mv[1][1]  = info->merge_cand[merge_idx].mv[1][1];
+        kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth));
+        kvz_inter_recon_cu(state, lcu, x, y, width, true, false);
+        kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_pu, lcu, true);
+
+        if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) {
+          continue;
+        }
+        else if (has_chroma) {
+          kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
+          kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_pu, lcu, true);
+          if (!cbf_is_set_any(cur_pu->cbf, depth)) {
+            cur_pu->type = CU_INTER;
+            cur_pu->merge_idx = merge_idx;
+            cur_pu->skipped = true;
+
+            merge->size = 1;
+            merge->cost[0] = 0.0; // TODO: Check this
+            merge->bits[0] = merge_idx; // TODO: Check this
+            merge->unit[0] = *cur_pu;
+            return;
+          }
         }
       }
     }
@@ -1816,16 +1826,139 @@ static void search_pu_inter(encoder_state_t * const state,
 
   // AMVP search starts here
 
-  // Store unipred information of L0 and L1 for biprediction
-  // Best cost will be left at MAX_DOUBLE if no valid CU is found
-  double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE };
-  cu_info_t unipreds[2];
+  amvp[0].size = 0;
+  amvp[1].size = 0;
+  amvp[2].size = 0;
+
+  for (int mv_dir = 1; mv_dir < 4; ++mv_dir) {
+    for (int i = 0; i < state->frame->ref->used_size; ++i) {
+      amvp[mv_dir - 1].cost[i] = MAX_DOUBLE;
+    }
+  }
 
   for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) {
-    info.ref_idx = ref_idx;
-    info.ref = state->frame->ref->images[ref_idx];
+    info->ref_idx = ref_idx;
+    info->ref = state->frame->ref->images[ref_idx];
 
-    search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds);
+    search_pu_inter_ref(info, depth, lcu, cur_pu, amvp);
+  }
+
+  assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE);
+  assert(amvp[1].size <= MAX_UNIT_STATS_MAP_SIZE);
+  kvz_sort_keys_by_cost(&amvp[0]);
+  kvz_sort_keys_by_cost(&amvp[1]);
+
+  int best_keys[2] = { 
+    amvp[0].size > 0 ? amvp[0].keys[0] : 0, 
+    amvp[1].size > 0 ? amvp[1].keys[0] : 0
+  };
+
+  cu_info_t *best_unipred[2] = {
+    &amvp[0].unit[best_keys[0]],
+    &amvp[1].unit[best_keys[1]]
+  };
+
+  // Prevent using the same ref picture with both lists.
+  // TODO: allow searching two MVs from the same reference picture.
+  if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) {
+
+    uint8_t(*ref_LX)[16] = info->state->frame->ref_LX;
+
+    int L0_idx = best_unipred[0]->inter.mv_ref[0];
+    int L1_idx = best_unipred[1]->inter.mv_ref[1];
+    
+    int L0_ref_idx = ref_LX[0][L0_idx];
+    int L1_ref_idx = ref_LX[1][L1_idx];
+
+    if (L0_ref_idx == L1_ref_idx) {
+      // Invalidate the best PU of the list opposite to the one with the cheaper 2nd best PU.
+      double L0_2nd_cost = amvp[0].size > 1 ? amvp[0].cost[amvp[0].keys[1]] : MAX_DOUBLE;
+      double L1_2nd_cost = amvp[1].size > 1 ? amvp[1].cost[amvp[1].keys[1]] : MAX_DOUBLE;
+      int list = (L0_2nd_cost <= L1_2nd_cost) ? 1 : 0;
+      amvp[list].cost[best_keys[list]] = MAX_DOUBLE;
+      kvz_sort_keys_by_cost(&amvp[list]);
+      amvp[list].size--;
+      best_keys[list]    =  amvp[list].keys[0];
+      best_unipred[list] = &amvp[list].unit[best_keys[list]];
+    }
+  }
+
+  // Fractional-pixel motion estimation.
+  // Refine the best PUs so far from both lists, if available.
+  for (int list = 0; list < 2; ++list) {
+
+    // TODO: make configurable
+    int n_best = MIN(1, amvp[list].size);
+    if (cfg->fme_level > 0) {
+
+      for (int i = 0; i < n_best; ++i) {
+
+        int key = amvp[list].keys[i];
+        cu_info_t *unipred_pu = &amvp[list].unit[key];
+
+        // Find the reference picture
+        const image_list_t *const ref = info->state->frame->ref;
+        uint8_t(*ref_LX)[16] = info->state->frame->ref_LX;
+
+        int LX_idx = unipred_pu->inter.mv_ref[list];
+        info->ref_idx = ref_LX[list][LX_idx];
+        info->ref = ref->images[info->ref_idx];
+
+        kvz_inter_get_mv_cand(info->state,
+          info->origin.x,
+          info->origin.y,
+          info->width,
+          info->height,
+          info->mv_cand,
+          unipred_pu,
+          lcu,
+          list);
+
+        double     frac_cost = MAX_DOUBLE;
+        double   frac_bits = MAX_INT;
+        vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] };
+
+        search_frac(info, &frac_cost, &frac_bits, &frac_mv);
+
+        uint8_t mv_ref_coded = LX_idx;
+        int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL);
+        const int extra_bits = list + mv_ref_coded; // TODO: check if mv_dir bits are missing
+        frac_cost += extra_bits * info->state->lambda_sqrt;
+        frac_bits += extra_bits;
+
+        bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y);
+        if (valid_mv) {
+
+          unipred_pu->inter.mv[list][0] = frac_mv.x;
+          unipred_pu->inter.mv[list][1] = frac_mv.y;
+          CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand);
+
+          if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
+            kvz_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits);
+          }
+
+          amvp[list].cost[key] = frac_cost;
+          amvp[list].bits[key] = frac_bits;
+        }
+      }
+
+      // Invalidate PUs with SAD-based costs. (FME not performed).
+      // TODO: Recalculate SAD costs with SATD for further processing.
+      for (int i = n_best; i < amvp[list].size; ++i) {
+        int key = amvp[list].keys[i];
+        amvp[list].cost[key] = MAX_DOUBLE;
+      }
+    }
+
+    // Costs are now SATD-based. Omit PUs with SAD-based costs.
+    // TODO: Recalculate SAD costs with SATD for further processing.
+    kvz_sort_keys_by_cost(&amvp[list]);
+    amvp[list].size = n_best;
+  }
+
+  if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) {
+    if (amvp[0].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]);
+    if (amvp[1].size) kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]);
   }
 
   // Search bi-pred positions
@@ -1835,25 +1968,39 @@ static void search_pu_inter(encoder_state_t * const state,
 
   if (can_use_bipred) {
 
+    cu_info_t *bipred_pu = &amvp[2].unit[0];
+    *bipred_pu = *cur_pu;
+    double   best_bipred_cost = MAX_DOUBLE;
+
     // Try biprediction from valid acquired unipreds.
-    if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) {
+    if (amvp[0].size > 0 && amvp[1].size > 0) {
 
       // TODO: logic is copy paste from search_pu_inter_bipred.
       // Get rid of duplicate code asap.
-      const image_list_t *const ref = info.state->frame->ref;
-      uint8_t(*ref_LX)[16] = info.state->frame->ref_LX;
+      const image_list_t *const ref = info->state->frame->ref;
+      uint8_t(*ref_LX)[16] = info->state->frame->ref_LX;
 
-      inter_merge_cand_t *merge_cand = info.merge_cand;
+      bipred_pu->inter.mv_dir = 3;
 
-      mv_t mv[2][2];
-      mv[0][0] = unipreds[0].inter.mv[0][0];
-      mv[0][1] = unipreds[0].inter.mv[0][1];
-      mv[1][0] = unipreds[1].inter.mv[1][0];
-      mv[1][1] = unipreds[1].inter.mv[1][1];
+      bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0];
+      bipred_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1];
 
-      kvz_inter_recon_bipred(info.state,
-        ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]],
-        ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]],
+      int16_t (*mv)[2] = bipred_pu->inter.mv;
+      mv[0][0] = best_unipred[0]->inter.mv[0][0];
+      mv[0][1] = best_unipred[0]->inter.mv[0][1];
+      mv[1][0] = best_unipred[1]->inter.mv[1][0];
+      mv[1][1] = best_unipred[1]->inter.mv[1][1];
+      
+      bipred_pu->merged  = false;
+      bipred_pu->skipped = false;
+
+      for (int reflist = 0; reflist < 2; reflist++) {
+        kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
+      }
+
+      kvz_inter_recon_bipred(info->state,
+        ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]],
+        ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]],
         x, y,
         width,
         height,
@@ -1864,104 +2011,79 @@ static void search_pu_inter(encoder_state_t * const state,
 
       const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
       const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
-      uint32_t cost =
+
+      best_bipred_cost =
         kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH);
 
-      uint32_t bitcost[2] = { 0, 0 };
+      double bitcost[2] = { 0, 0 };
 
-      cost += info.mvd_cost_func(info.state,
-        unipreds[0].inter.mv[0][0],
-        unipreds[0].inter.mv[0][1],
+      best_bipred_cost += info->mvd_cost_func(info->state,
+        bipred_pu->inter.mv[0][0],
+        bipred_pu->inter.mv[0][1],
         0,
-        info.mv_cand,
+        info->mv_cand,
         NULL, 0, 0,
         &bitcost[0]);
-      cost += info.mvd_cost_func(info.state,
-        unipreds[1].inter.mv[1][0],
-        unipreds[1].inter.mv[1][1],
+      best_bipred_cost += info->mvd_cost_func(info->state,
+        bipred_pu->inter.mv[1][0],
+        bipred_pu->inter.mv[1][1],
         0,
-        info.mv_cand,
+        info->mv_cand,
         NULL, 0, 0,
         &bitcost[1]);
 
       const uint8_t mv_ref_coded[2] = {
-        unipreds[0].inter.mv_ref[0],
-        unipreds[1].inter.mv_ref[1]
+        bipred_pu->inter.mv_ref[0],
+        bipred_pu->inter.mv_ref[1]
       };
       const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */;
-      cost += info.state->lambda_sqrt * extra_bits + 0.5;
+      best_bipred_cost += info->state->lambda_sqrt * extra_bits;
 
-      if (cost < *inter_cost) {
-        cur_cu->inter.mv_dir = 3;
-
-        cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0];
-        cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1];
-
-        cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0];
-        cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1];
-        cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0];
-        cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1];
-        cur_cu->merged = 0;
-
-        // Check every candidate to find a match
-        for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) {
-          if (merge_cand[merge_idx].dir != 3) continue;
-          if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
-            merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
-            merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
-            merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
-            merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
-            merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1])
-          {
-            cur_cu->merged = 1;
-            cur_cu->merge_idx = merge_idx;
-            break;
-          }
-        }
+      if (best_bipred_cost < MAX_DOUBLE) {
 
         // Each motion vector has its own candidate
         for (int reflist = 0; reflist < 2; reflist++) {
-          kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist);
           int cu_mv_cand = select_mv_cand(
-            info.state,
-            info.mv_cand,
-            cur_cu->inter.mv[reflist][0],
-            cur_cu->inter.mv[reflist][1],
+            info->state,
+            info->mv_cand,
+            bipred_pu->inter.mv[reflist][0],
+            bipred_pu->inter.mv[reflist][1],
             NULL);
-          CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand);
+          CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand);
         }
 
-        *inter_cost = cost;
-        *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits;
+        amvp[2].cost[amvp[2].size] = best_bipred_cost;
+        amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits;
+        amvp[2].keys[amvp[2].size] = amvp[2].size;
+        amvp[2].size++;
       }
     }
 
     // TODO: this probably should have a separate command line option
-    if (cfg->rdo >= 3) {
-      search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost);
+    if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]);
+    assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE);
+    kvz_sort_keys_by_cost(&amvp[2]);
+    // Refine only when a bipred candidate exists; with an empty map keys[0] does not refer to a stored unit.
+    if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
+      kvz_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]);
     }
   }
-
-  // Compare best merge cost to amvp cost
-  if (mrg_costs[0] < *inter_cost) {
-    *inter_cost = mrg_costs[0];
-    *inter_bitcost = 0; // TODO: Check this
-    int merge_idx = mrg_cands[0];
-    cur_cu->type = CU_INTER;
-    cur_cu->merge_idx = merge_idx;
-    cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir;
-    cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0];
-    cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1];
-    cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0];
-    cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1];
-    cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0];
-    cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1];
-    cur_cu->merged = true;
-    cur_cu->skipped = false;
-  }
-
-  if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) {
-    assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]));
+  if (cfg->rdo < 2) {
+    // Charge the best candidate of each AMVP map (L0, L1, bipred) with the bits of
+    // the skip flag (0), the optional part mode bin and the prediction mode bin.
+    int predmode_ctx;
+    const int skip_context = kvz_get_skip_context(x, y, lcu, NULL, &predmode_ctx);
+    const bool has_part_modes = state->encoder_control->cfg.smp_enable || state->encoder_control->cfg.amp_enable;
+    const double no_skip_flag   = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_context], 0);
+    const double part_mode_bits = has_part_modes ? CTX_ENTROPY_FBITS(&state->search_cabac.ctx.part_size_model[0], 1) : 0;
+    const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0);
+    const double total_bits = no_skip_flag + part_mode_bits + pred_mode_bits;
+    for (int map_idx = 0; map_idx < 3; map_idx++) {
+      if (amvp[map_idx].size <= 0) continue;
+      const uint8_t best_key = amvp[map_idx].keys[0];
+      amvp[map_idx].bits[best_key] += total_bits;
+      amvp[map_idx].cost[best_key] += total_bits * state->lambda_sqrt;
+    }
   }
 }
 
@@ -1985,32 +2107,92 @@ static void search_pu_inter(encoder_state_t * const state,
 * \param inter_bitcost Return inter bitcost
 */
 void kvz_cu_cost_inter_rd2(encoder_state_t * const state,
-  int x, int y, int depth,
-  lcu_t *lcu,
-  double   *inter_cost,
-  uint32_t *inter_bitcost){
-
-  cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+                           int x, int y, int depth,
+                           cu_info_t* cur_cu,
+                           lcu_t *lcu,
+                           double   *inter_cost,
+                           double* inter_bitcost){
+  
   int tr_depth = MAX(1, depth);
   if (cur_cu->part_size != SIZE_2Nx2N) {
     tr_depth = depth + 1;
   }
   kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth);
 
+  const int x_px = SUB_SCU(x);
+  const int y_px = SUB_SCU(y);
+  const int width = LCU_WIDTH >> depth;
+  cabac_data_t cabac_copy;
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
+  cabac_copy.update = 1;
+
+  cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  *cur_pu = *cur_cu;
+
   const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
   kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma);
-  kvz_quantize_lcu_residual(state, true, reconstruct_chroma,
-    x, y, depth,
-    NULL,
-    lcu,
-    false);
 
-  *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu);
+  // SSD of the reconstruction made above, taken before any residual is quantized.
+  const int luma_offset = y_px * LCU_WIDTH + x_px;
+  double ssd = kvz_pixels_calc_ssd(&lcu->ref.y[luma_offset], &lcu->rec.y[luma_offset],
+                                   LCU_WIDTH, LCU_WIDTH, width) * KVZ_LUMA_MULT;
   if (reconstruct_chroma) {
-    *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu);
+    // Chroma planes are addressed at half the luma coordinates and half the width.
+    const int chroma_offset = y_px / 2 * LCU_WIDTH_C + x_px / 2;
+    const int chroma_width  = width / 2;
+    const double ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[chroma_offset], &lcu->rec.u[chroma_offset],
+                                             LCU_WIDTH_C, LCU_WIDTH_C, chroma_width);
+    const double ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[chroma_offset], &lcu->rec.v[chroma_offset],
+                                             LCU_WIDTH_C, LCU_WIDTH_C, chroma_width);
+    ssd += (ssd_u + ssd_v) * KVZ_CHROMA_MULT;
   }
+  double no_cbf_bits;
+  double bits = 0;
+  const int skip_context = kvz_get_skip_context(x, y, lcu, NULL, NULL);
+  if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+    no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost;
+    bits += kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu);
+  }
+  else {
+    no_cbf_bits = kvz_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu);
+    bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 1);
+  }
+  double no_cbf_cost = ssd + no_cbf_bits * state->lambda;
 
-  *inter_cost += *inter_bitcost * state->lambda;
+  kvz_quantize_lcu_residual(state, true, reconstruct_chroma,
+                            x, y, depth,
+                            cur_cu,
+                            lcu,
+                            false);
+
+  int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+  
+  if(cbf) {
+    *inter_cost = kvz_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu);
+    if (reconstruct_chroma) {
+      *inter_cost += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu);
+    }
+  }
+  else {
+    // If we have no coeffs after quant we already have the cost calculated
+    *inter_cost = no_cbf_cost;
+    cur_cu->cbf = 0;
+    *inter_bitcost = no_cbf_bits;
+    return;
+  }
+  
+  *inter_cost += (bits)* state->lambda;
+  *inter_bitcost = bits;
+
+  if(no_cbf_cost < *inter_cost) {
+    cur_cu->cbf = 0;
+    if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+      cur_cu->skipped = 1;
+    }
+    *inter_cost = no_cbf_cost;
+    *inter_bitcost = no_cbf_bits;
+    
+  }
 }
 
 
@@ -2032,22 +2214,80 @@ void kvz_search_cu_inter(encoder_state_t * const state,
                          int x, int y, int depth,
                          lcu_t *lcu,
                          double   *inter_cost,
-                         uint32_t *inter_bitcost)
+                         double* inter_bitcost)
 {
+  *inter_cost = MAX_DOUBLE;
+  *inter_bitcost = MAX_INT;
+
+  // Store information of L0, L1, and bipredictions.
+  // Best cost will be left at MAX_DOUBLE if no valid CU is found.
+  // These will be initialized by the following function.
+  unit_stats_map_t amvp[3];
+  unit_stats_map_t merge;
+  inter_search_info_t info;
+
   search_pu_inter(state,
                   x, y, depth,
                   SIZE_2Nx2N, 0,
                   lcu,
-                  inter_cost,
-                  inter_bitcost);
+                  amvp,
+                  &merge,
+                  &info);
 
-  // Calculate more accurate cost when needed
-  if (state->encoder_control->cfg.rdo >= 2) {
-    kvz_cu_cost_inter_rd2(state,
-      x, y, depth,
-      lcu,
-      inter_cost,
-      inter_bitcost);
+  // Early Skip CU decision
+  if (merge.size == 1 && merge.unit[0].skipped) {
+    *inter_cost    = merge.cost[0];
+    *inter_bitcost = merge.bits[0];
+    return;
+  }
+
+  cu_info_t *best_inter_pu = NULL;
+
+  // Find best AMVP PU
+  for (int mv_dir = 1; mv_dir < 4; ++mv_dir) {
+
+    int best_key = amvp[mv_dir - 1].keys[0];
+
+    if (amvp[mv_dir - 1].size > 0 &&
+        amvp[mv_dir - 1].cost[best_key] < *inter_cost) {
+
+      best_inter_pu  = &amvp[mv_dir - 1].unit[best_key];
+      *inter_cost    =  amvp[mv_dir - 1].cost[best_key];
+      *inter_bitcost =  amvp[mv_dir - 1].bits[best_key];
+    }
+  }
+
+  // Compare best AMVP against best Merge mode
+  int best_merge_key = merge.keys[0];
+
+  if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) {
+
+    best_inter_pu  = &merge.unit[best_merge_key];
+    *inter_cost    =  merge.cost[best_merge_key];
+    *inter_bitcost =  0; // TODO: Check this
+  }
+
+  if (*inter_cost == MAX_DOUBLE) {
+    // Could not find any motion vector.
+    *inter_cost = MAX_DOUBLE;
+    *inter_bitcost = MAX_INT;
+    return;
+  }
+
+  const int x_local = SUB_SCU(x);
+  const int y_local = SUB_SCU(y);
+  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
+  *cur_pu = *best_inter_pu;
+
+  kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth),
+    true, state->encoder_control->chroma_format != KVZ_CSP_400);   
+
+  if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) {
+    assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));
+  }
+
+  if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) {
+    assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]));
   }
 }
 
@@ -2067,14 +2307,24 @@ void kvz_search_cu_inter(encoder_state_t * const state,
  * \param inter_cost    Return inter cost
  * \param inter_bitcost Return inter bitcost
  */
-void kvz_search_cu_smp(encoder_state_t * const state,
+void kvz_search_cu_smp(encoder_state_t* const state,
                        int x, int y,
                        int depth,
                        part_mode_t part_mode,
                        lcu_t *lcu,
                        double *inter_cost,
-                       uint32_t *inter_bitcost)
+                       double* inter_bitcost)
 {
+  *inter_cost = MAX_DOUBLE;
+  *inter_bitcost = MAX_INT;
+
+  // Store information of L0, L1, and bipredictions.
+  // Best cost will be left at MAX_DOUBLE if no valid CU is found.
+  // These will be initialized by the following function.
+  unit_stats_map_t amvp[3];
+  unit_stats_map_t merge;
+  inter_search_info_t info;
+
   const int num_pu  = kvz_part_mode_num_parts[part_mode];
   const int width   = LCU_WIDTH >> depth;
   const int y_local = SUB_SCU(y);
@@ -2088,58 +2338,94 @@ void kvz_search_cu_smp(encoder_state_t * const state,
     const int y_pu      = PU_GET_Y(part_mode, width, y_local, i);
     const int width_pu  = PU_GET_W(part_mode, width, i);
     const int height_pu = PU_GET_H(part_mode, width, i);
-    cu_info_t *cur_pu   = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu);
 
-    cur_pu->type      = CU_INTER;
-    cur_pu->part_size = part_mode;
-    cur_pu->depth     = depth;
-    cur_pu->qp        = state->qp;
+    double cost    = MAX_DOUBLE;
+    double bitcost = MAX_INT;
 
-    double cost      = MAX_INT;
-    uint32_t bitcost = MAX_INT;
+    search_pu_inter(state, x, y, depth, part_mode, i, lcu, amvp, &merge, &info);
 
-    search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost);
+    cu_info_t* best_inter_pu = NULL;
 
-    if (cost >= MAX_INT) {
+    // Find best AMVP PU
+    for (int mv_dir = 1; mv_dir < 4; ++mv_dir) {
+
+      int best_key = amvp[mv_dir - 1].keys[0];
+
+      if (amvp[mv_dir - 1].size > 0 &&
+        amvp[mv_dir - 1].cost[best_key] < cost) {
+
+        best_inter_pu  = &amvp[mv_dir - 1].unit[best_key];
+        cost           =  amvp[mv_dir - 1].cost[best_key];
+        bitcost        =  amvp[mv_dir - 1].bits[best_key];
+      }
+    }
+
+    // Compare best AMVP against best Merge mode
+    int best_merge_key = merge.keys[0];
+
+    if (merge.size > 0 && merge.cost[best_merge_key] < cost) {
+
+      best_inter_pu = &merge.unit[best_merge_key];
+      cost          =  merge.cost[best_merge_key];
+      bitcost       =  0; // TODO: Check this
+    }
+
+    if (cost == MAX_DOUBLE) {
       // Could not find any motion vector.
-      *inter_cost    = MAX_INT;
+      *inter_cost = MAX_DOUBLE;
       *inter_bitcost = MAX_INT;
       return;
     }
 
-    *inter_cost    += cost;
+    *inter_cost += cost;
     *inter_bitcost += bitcost;
 
+    cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu);
+    *cur_pu = *best_inter_pu;
+    // Copy the chosen PU's type and motion data to every SCU it covers.
     for (int y_idx = y_pu; y_idx < y_pu + height_pu; y_idx += SCU_WIDTH) {
       for (int x_idx = x_pu; x_idx < x_pu + width_pu; x_idx += SCU_WIDTH) {
         cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x_idx, y_idx);
         scu->type = CU_INTER;
         scu->inter = cur_pu->inter;
       }
     }
+
+    if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) {
+      assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));
+    }
+
+    if (cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) {
+      assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]));
+    }
+  }
+  double smp_extra_bits = 0;
+  if (state->encoder_control->cfg.rdo < 2) {
+    //smp_extra_bits = kvz_encode_part_mode(
+    //  state,
+    //  &state->search_cabac,
+    //  LCU_GET_CU_AT_PX(lcu, x_local, y_local),
+    //  depth
+    //);
+
+    CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL, NULL)], 0, smp_extra_bits, "skip_flag");
+
+    // The transform is split for SMP and AMP blocks so we need more bits for
+    // coding the CBF.
+    smp_extra_bits += 6;
+
+    *inter_bitcost += smp_extra_bits;
   }
 
   // Calculate more accurate cost when needed
   if (state->encoder_control->cfg.rdo >= 2) {
     kvz_cu_cost_inter_rd2(state,
-      x, y, depth,
-      lcu,
-      inter_cost,
-      inter_bitcost);
+                          x, y, depth, 
+                          LCU_GET_CU_AT_PX(lcu, x_local, y_local),
+                          lcu,
+                          inter_cost,
+                          inter_bitcost);
+  } else {
+    *inter_cost += state->lambda_sqrt * smp_extra_bits;
   }
-
-  // Count bits spent for coding the partition mode.
-  int smp_extra_bits = 1; // horizontal or vertical
-  if (state->encoder_control->cfg.amp_enable) {
-    smp_extra_bits += 1; // symmetric or asymmetric
-    if (part_mode != SIZE_2NxN && part_mode != SIZE_Nx2N) {
-      smp_extra_bits += 1; // U,L or D,R
-    }
-  }
-  // The transform is split for SMP and AMP blocks so we need more bits for
-  // coding the CBF.
-  smp_extra_bits += 6;
-
-  *inter_cost += (state->encoder_control->cfg.rdo >= 2 ? state->lambda : state->lambda_sqrt) * smp_extra_bits;
-  *inter_bitcost += smp_extra_bits;
 }
diff --git a/src/search_inter.h b/src/search_inter.h
index 5aff9f7f..cc003f92 100644
--- a/src/search_inter.h
+++ b/src/search_inter.h
@@ -64,20 +64,20 @@ enum hpel_position {
   HPEL_POS_DIA = 2
 };
 
-typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state,
+typedef double kvz_mvd_cost_func(const encoder_state_t *state,
                                   int x, int y,
                                   int mv_shift,
                                   mv_t mv_cand[2][2],
                                   inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                                   int16_t num_cand,
                                   int32_t ref_idx,
-                                  uint32_t *bitcost);
+                                  double *bitcost);
 
 void kvz_search_cu_inter(encoder_state_t * const state,
                          int x, int y, int depth,
                          lcu_t *lcu,
                          double *inter_cost,
-                         uint32_t *inter_bitcost);
+                         double* inter_bitcost);
 
 void kvz_search_cu_smp(encoder_state_t * const state,
                        int x, int y,
@@ -85,12 +85,20 @@ void kvz_search_cu_smp(encoder_state_t * const state,
                        part_mode_t part_mode,
                        lcu_t *lcu,
                        double *inter_cost,
-                       uint32_t *inter_bitcost);
+                       double* inter_bitcost);
 
 
 unsigned kvz_inter_satd_cost(const encoder_state_t* state,
                              const lcu_t *lcu,
                              int x,
                              int y);
+void kvz_cu_cost_inter_rd2(encoder_state_t* const state,
+  int x, int y, int depth,
+  cu_info_t* cur_cu,
+  lcu_t* lcu,
+  double* inter_cost,
+  double* inter_bitcost);
+
+int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx);
 
 #endif // SEARCH_INTER_H_
diff --git a/src/search_intra.c b/src/search_intra.c
index 87139b93..6f7a9349 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -97,13 +97,13 @@ static double get_cost(encoder_state_t * const state,
 
     // Add the offset bit costs of signaling 'luma and chroma use trskip',
     // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
-    const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
+    const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma;
     double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
 
     
     // ToDo: Check cost
     if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-      ctx = &state->cabac.ctx.transform_skip_model_chroma;
+      ctx = &state->search_cabac.ctx.transform_skip_model_chroma;
       trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
     }
     
@@ -394,7 +394,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
   //     max_depth.
   // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->lambda;
+    split_cost = 0;
 
     split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
     if (split_cost < nosplit_cost) {
@@ -417,14 +417,15 @@ static double search_intra_trdepth(encoder_state_t * const state,
     // so this will code cbf as 0 and not code the cbf at all for descendants.
     if (state->encoder_control->chroma_format != KVZ_CSP_400) {
       const uint8_t tr_depth = depth - pred_cu->depth;
+      cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
 
-      const cabac_ctx_t* ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+      cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+        CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb");
       }
-      ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]);
+      ctx = &(cabac->ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]); // same CABAC as the Cb context
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+        CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr");
       }
     }
 
@@ -677,9 +678,8 @@ static int8_t search_intra_rough(encoder_state_t * const state,
 
   // Add prediction mode coding cost as the last thing. We don't want this
   // affecting the halving search.
-  int lambda_cost = (int)(state->lambda_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0);
+    costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0);
   }
 
   #undef PARALLEL_BLKS
@@ -771,7 +771,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
       int rdo_bitcost = kvz_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id);
 
       *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5);
-
+    
       // Mip related stuff
       // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
       // Half of the modes [16, 31] are indicated with the separate transpose flag.
@@ -818,6 +818,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
   
 
+
   // The best transform split hierarchy is not saved anywhere, so to get the
   // transform split hierarchy the search has to be performed again with the
   // best mode.
@@ -854,7 +855,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 
 double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id)
 {
-  double mode_bits = 0.0;
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
+  double mode_bits = 0;
 
   bool enable_mip = state->encoder_control->cfg.mip;
   bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false;
@@ -899,11 +901,26 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
         break;
       }
     }
+  cabac_ctx_t *ctx = &(cabac->ctx.luma_planar_model[1]);
+  CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search");
+  if (state->search_cabac.update) {
+    if(mode_in_preds) {
+      CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx");
+      if(luma_mode != intra_preds[0]) {
+        CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx");        
+      }
+    }
+    else {
+      // This value should be transformed for actual coding,
+      // but here the value does not actually matter, just that we write 5 bits
+      CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode");
+    }
+  }
 
     bool enable_mrl = state->encoder_control->cfg.mrl;
     uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0;
 
-    const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model);
+    ctx = &(cabac->ctx.intra_luma_mpm_flag_model);
 
     if (multi_ref_index == 0) {
       mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1);
@@ -911,17 +928,17 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
 
     // Add MRL bits.
     if (enable_mrl && MAX_REF_LINE_IDX > 1) {
-      ctx = &(state->cabac.ctx.multi_ref_line[0]);
+      ctx = &(cabac->ctx.multi_ref_line[0]);
       mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0);
 
       if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) {
-        ctx = &(state->cabac.ctx.multi_ref_line[1]);
+        ctx = &(cabac->ctx.multi_ref_line[1]);
         mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1);
       }
     }
 
     if (mode_in_preds != -1 || multi_ref_index != 0) {
-      ctx = &(state->cabac.ctx.luma_planar_model[0]);
+      ctx = &(cabac->ctx.luma_planar_model[0]);
       if (multi_ref_index == 0) {
         mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0);
       }
@@ -938,7 +955,8 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
 
 double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode)
 {
-  const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model);
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+  const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model);
   double mode_bits;
   if (chroma_mode == luma_mode) {
     mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
@@ -958,6 +976,13 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
     mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67);
   }
 
+  if(cabac->update) {
+    if(chroma_mode != luma_mode) {
+      // Again it does not matter what we actually write here
+      CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode");      
+    }
+  }
+
   return mode_bits;
 }
 
@@ -1045,9 +1070,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
           -1, chroma.mode, // skip luma
           NULL, cclm_params, 0, false, false, lcu);
       }
+      double bits = 0;
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
       double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
+      bits += mode_bits;
       chroma.cost += mode_bits * state->lambda;
 
       if (chroma.cost < best_chroma.cost) {
diff --git a/src/transform.c b/src/transform.c
index 09de7b2c..4c90f3f4 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -260,11 +260,9 @@ int kvz_quantize_residual_trskip(
   struct {
     kvz_pixel rec[LCU_WIDTH * LCU_WIDTH];
     coeff_t coeff[LCU_WIDTH * LCU_WIDTH];
-    uint32_t cost;
+    double cost;
     int has_coeffs;
   } skip, *best;
-
-  const int bit_cost = (int)(state->lambda + 0.5);
   
   //noskip.has_coeffs = kvz_quantize_residual(
   //    state, cur_cu, width, color, scan_order,
@@ -278,7 +276,7 @@ int kvz_quantize_residual_trskip(
     1, in_stride, width,
     ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj);
   skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, width, width);
-  skip.cost += kvz_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * bit_cost;
+  skip.cost += kvz_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * state->frame->lambda;
 
 /*  if (noskip.cost <= skip.cost) {
     *trskip_out = 0;