WIP: Merge remote-tracking branch 'remotes/kvz_github/master' into update-cabac-during-search

2024-11-23 18:14:06 +00:00 · 2022-03-21 08:42:41 +02:00 · 2022-03-21 08:42:41 +02:00 · 0be443d309
parent f8375f9bc6 6918ab80ae
commit 0be443d309
33 changed files with 1885 additions and 1019 deletions
--- a/README.md
+++ b/README.md
@ -150,11 +150,20 @@ Video structure:
                                   - frametile: Constrain within the tile.
                                   - frametilemargin: Constrain even more.
      --roi <filename>       : Use a delta QP map for region of interest.
-                               Reads an array of delta QP values from a text
-                               file. The file format is: width and height of
-                               the QP delta map followed by width*height delta
-                               QP values in raster order. The map can be of any
-                               size and will be scaled to the video size.
+                               Reads an array of delta QP values from a file.
+                               Text and binary files are supported and detected
+                               from the file extension (.txt/.bin). If a known
+                               extension is not found, the file is treated as
+                               a text file. The file can include one or many
+                               ROI frames each in the following format:
+                               width and height of the QP delta map followed
+                               by width * height delta QP values in raster
+                               order. In binary format, width and height are
+                               32-bit integers whereas the delta QP values are
+                               signed 8-bit values. The map can be of any size
+                               and will be scaled to the video size. The file
+                               reading will loop if end of the file is reached.
+                               See roi.txt in the examples folder.
      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.
                               in PPS and slice_qp_delta in slize header zero.
      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with
--- a/configure.ac
+++ b/configure.ac
@ -22,8 +22,8 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #   - Increment when making new releases and major or minor was not changed since last release.
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
-ver_major=6
-ver_minor=7
+ver_major=7
+ver_minor=2
 ver_release=0

 # Prevents configure from adding a lot of defines to the CFLAGS
--- a/doc/kvazaar.1
+++ b/doc/kvazaar.1
@ -164,11 +164,20 @@ Constrain movement vectors. [none]
 .TP
 \fB\-\-roi <filename>      
 Use a delta QP map for region of interest.
-Reads an array of delta QP values from a text
-file. The file format is: width and height of
-the QP delta map followed by width*height delta
-QP values in raster order. The map can be of any
-size and will be scaled to the video size.
+Reads an array of delta QP values from a file.
+Text and binary files are supported and detected
+from the file extension (.txt/.bin). If a known
+extension is not found, the file is treated as
+a text file. The file can include one or many
+ROI frames each in the following format:
+width and height of the QP delta map followed
+by width * height delta QP values in raster
+order. In binary format, width and height are
+32\-bit integers whereas the delta QP values are
+signed 8\-bit values. The map can be of any size
+and will be scaled to the video size. The file
+reading will loop if end of the file is reached.
+See roi.txt in the examples folder.
 .TP
 \fB\-\-set\-qp\-in\-cu        
 Set QP at CU level keeping pic_init_qp_minus26.
--- a/src/bitstream.c
+++ b/src/bitstream.c
@ -33,6 +33,7 @@
 #include "bitstream.h"

 #include <math.h>
+#include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>

--- a/src/cabac.c
+++ b/src/cabac.c
@ -70,6 +70,7 @@ void kvz_cabac_start(cabac_data_t * const data)
  data->num_buffered_bytes = 0;
  data->buffered_byte = 0xff;
  data->only_count = 0; // By default, write bits out
+  data->update = 0; 
 }

 /**
@ -349,26 +350,28 @@ void kvz_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t rem
 /**
 * \brief
 */
-void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol)
+void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, 
+  cabac_ctx_t * const ctx, 
+  uint32_t symbol,
+  const int32_t offset,
+  const uint32_t max_symbol, 
+  double* bits_out)
 {
  int8_t code_last = max_symbol > symbol;

  assert(symbol <= max_symbol);

  if (!max_symbol) return;
-
-  data->cur_ctx = ctx;
-  CABAC_BIN(data, symbol, "ums");
+  
+  CABAC_FBITS_UPDATE(data, ctx, symbol, *bits_out, "ums");

  if (!symbol) return;

  while (--symbol) {
-    //data->cur_ctx = &ctx[offset];
-    CABAC_BIN(data, 1, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums");
  }
  if (code_last) {
-    //data->cur_ctx = &ctx[offset];
-    CABAC_BIN(data, 0, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 0,*bits_out, "ums");
  }
 }

@ -405,7 +408,7 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int
 /**
 * \brief
 */
-void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
+uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
                                  cabac_data_t * const data,
                                  uint32_t symbol,
                                  uint32_t count)
@ -426,4 +429,5 @@ void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
  num_bins += count;

  CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb");
+  return num_bins;
 }
--- a/src/cabac.h
+++ b/src/cabac.h
@ -59,7 +59,8 @@ typedef struct
  uint32_t   buffered_byte;
  int32_t    num_buffered_bytes;
  int32_t    bits_left;
-  int8_t     only_count;
+  int8_t     only_count : 4;
+  int8_t     update : 4;
  bitstream_t *stream;

  // CONTEXTS
@ -140,11 +141,11 @@ void kvz_cabac_write(cabac_data_t *data);
 void kvz_cabac_finish(cabac_data_t *data);
 void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
                              uint32_t r_param, const unsigned int cutoff);
-void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
+uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
                uint32_t symbol, uint32_t count);
 void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
-                                  uint32_t symbol, int32_t offset,
-                                  uint32_t max_symbol);
+                                      uint32_t symbol, int32_t offset,
+                                      uint32_t max_symbol, double* bits_out);
 void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol);

 #define CTX_PROB_BITS 15
@ -153,6 +154,18 @@ void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol
 #define CTX_MASK_0 (~(~0u << CTX_PROB_BITS_0) << (CTX_PROB_BITS - CTX_PROB_BITS_0))
 #define CTX_MASK_1 (~(~0u << CTX_PROB_BITS_1) << (CTX_PROB_BITS - CTX_PROB_BITS_1))

+// Floating point fractional bits, derived from kvz_entropy_bits
+extern const float kvz_f_entropy_bits[512];
+#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
+
+#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \
+  if((cabac)->only_count) (bits) += kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \
+  if((cabac)->update) {\
+    (cabac)->cur_ctx = ctx;\
+    CABAC_BIN((cabac), (val), (name));\
+  } \
+} while(0)
+
 // Macros
 #define CTX_GET_STATE(ctx) ( (ctx)->state[0]+(ctx)->state[1] )
 #define CTX_STATE(ctx) ( CTX_GET_STATE(ctx)>>8 )
--- a/src/cfg.c
+++ b/src/cfg.c
@ -149,9 +149,9 @@ int kvz_config_init(kvz_config *cfg)
  cfg->gop_lp_definition.t = 1;
  cfg->open_gop = true;

-  cfg->roi.width = 0;
-  cfg->roi.height = 0;
-  cfg->roi.dqps = NULL;
+  cfg->roi.file_path = NULL;
+  cfg->roi.format = KVZ_ROI_TXT;
+
  cfg->set_qp_in_cu = false;

  cfg->erp_aqp = false;
@ -214,6 +214,9 @@ int kvz_config_init(kvz_config *cfg)

  cfg->cclm = 0;

+
+  cfg->combine_intra_cus = 1;
+  cfg->force_inter = 0;
  return 1;
 }

@ -221,11 +224,11 @@ int kvz_config_destroy(kvz_config *cfg)
 {
  if (cfg) {
    FREE_POINTER(cfg->cqmfile);
+    FREE_POINTER(cfg->roi.file_path);
    FREE_POINTER(cfg->fast_coeff_table_fn);
    FREE_POINTER(cfg->tiles_width_split);
    FREE_POINTER(cfg->tiles_height_split);
    FREE_POINTER(cfg->slice_addresses_in_ts);
-    FREE_POINTER(cfg->roi.dqps);
    FREE_POINTER(cfg->fastrd_learning_outdir_fn);
  }
  free(cfg);
@ -1295,60 +1298,29 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
  }
  else if OPT("implicit-rdpcm")
    cfg->implicit_rdpcm = (bool)atobool(value);
+
  else if OPT("roi") {
-    // The ROI description is as follows:
-    // First number is width, second number is height,
-    // then follows width * height number of dqp values.
-    FILE* f = fopen(value, "rb");
-    if (!f) {
-      fprintf(stderr, "Could not open ROI file.\n");
+    static enum kvz_roi_format const formats[] = { KVZ_ROI_TXT, KVZ_ROI_BIN };
+    static const char * const format_names[] = { "txt", "bin", NULL };
+
+    char *roi_file = strdup(value);
+    if (!roi_file) {
+      fprintf(stderr, "Failed to allocate memory for ROI file name.\n");
      return 0;
    }
+    FREE_POINTER(cfg->roi.file_path);
+    cfg->roi.file_path = roi_file;

-    int width = 0;
-    int height = 0;
-    if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) {
-      fprintf(stderr, "Failed to read ROI size.\n");
-      fclose(f);
-      return 0;
+    // Get file extension or the substring after the last dot
+    char *maybe_extension = strrchr(cfg->roi.file_path, '.');
+    if (!maybe_extension) {
+      cfg->roi.format = KVZ_ROI_TXT;
+    } else {
+      maybe_extension++;
+      int8_t format;
+      bool unknown_format = !parse_enum(maybe_extension, format_names, &format);
+      cfg->roi.format = unknown_format ? KVZ_ROI_TXT : formats[format];
    }
-
-    if (width <= 0 || height <= 0) {
-      fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height);
-      fclose(f);
-      return 0;
-    }
-
-    if (width > 10000 || height > 10000) {
-      fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
-      fclose(f);
-      return 0;
-    }
-
-    const unsigned size = width * height;
-    int8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps[0]));
-    if (!dqp_array) {
-      fprintf(stderr, "Failed to allocate memory for ROI table.\n");
-      fclose(f);
-      return 0;
-    }
-
-    FREE_POINTER(cfg->roi.dqps);
-    cfg->roi.dqps   = dqp_array;
-    cfg->roi.width  = width;
-    cfg->roi.height = height;
-
-    for (int i = 0; i < size; ++i) {
-      int number; // Need a pointer to int for fscanf
-      if (fscanf(f, "%d", &number) != 1) {
-        fprintf(stderr, "Reading ROI file failed.\n");
-        fclose(f);
-        return 0;
-      }
-      dqp_array[i] = CLIP(-51, 51, number);
-    }
-
-    fclose(f);
  }
  else if OPT("set-qp-in-cu") {
    cfg->set_qp_in_cu = (bool)atobool(value);
@ -1502,6 +1474,12 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
  else if OPT("cclm") {
    cfg->cclm = (bool)atobool(value);
  }
+  else if OPT("combine-intra-cus") {
+    cfg->combine_intra_cus = atobool(value);
+  }
+  else if OPT("force-inter") {
+    cfg->force_inter = atobool(value);
+  }
  else {
    return 0;
  }
--- a/src/cli.c
+++ b/src/cli.c
@ -145,6 +145,7 @@ static const struct option long_options[] = {
  { "force-level",        required_argument, NULL, 0 },
  { "high-tier",                no_argument, NULL, 0 },
  { "me-steps",           required_argument, NULL, 0 },
+  { "roi-file",           required_argument, NULL, 0 },
  { "fast-residual-cost", required_argument, NULL, 0 },
  { "set-qp-in-cu",             no_argument, NULL, 0 },
  { "open-gop",                 no_argument, NULL, 0 },
@ -183,6 +184,10 @@ static const struct option long_options[] = {
  { "no-amvr",                  no_argument, NULL, 0 },
  { "cclm",                     no_argument, NULL, 0 },
  { "no-cclm",                  no_argument, NULL, 0 },
+  { "combine-intra-cus",        no_argument, NULL, 0 },
+  { "no-combine-intra-cus",     no_argument, NULL, 0 },
+  { "force-inter",              no_argument, NULL, 0 },
+  { "no-force-inter",           no_argument, NULL, 0 },
  {0, 0, 0, 0}
 };

@ -504,11 +509,20 @@ void print_help(void)
    "                                   - frametile: Constrain within the tile.\n"
    "                                   - frametilemargin: Constrain even more.\n"
    "      --roi <filename>       : Use a delta QP map for region of interest.\n"
-    "                               Reads an array of delta QP values from a text\n"
-    "                               file. The file format is: width and height of\n"
-    "                               the QP delta map followed by width*height delta\n"
-    "                               QP values in raster order. The map can be of any\n"
-    "                               size and will be scaled to the video size.\n"
+    "                               Reads an array of delta QP values from a file.\n"
+    "                               Text and binary files are supported and detected\n"
+    "                               from the file extension (.txt/.bin). If a known\n"
+    "                               extension is not found, the file is treated as\n"
+    "                               a text file. The file can include one or many\n"
+    "                               ROI frames each in the following format:\n"
+    "                               width and height of the QP delta map followed\n"
+    "                               by width * height delta QP values in raster\n"
+    "                               order. In binary format, width and height are\n"
+    "                               32-bit integers whereas the delta QP values are\n"
+    "                               signed 8-bit values. The map can be of any size\n"
+    "                               and will be scaled to the video size. The file\n"
+    "                               reading will loop if end of the file is reached.\n"
+    "                               See roi.txt in the examples folder.\n"
    "      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.\n"
    "                               in PPS and slice_qp_delta in slize header zero.\n"
    "      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with\n"
@ -594,6 +608,16 @@ void print_help(void)
    "      --ml-pu-depth-intra    : Predict the pu-depth-intra using machine\n"
    "                                learning trees, overrides the\n"
    "                                --pu-depth-intra parameter. [disabled]\n"
+    "      --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n"
+    "                                   on lower depth even when search is not\n"
+    "                                   performed on said depth. Should only\n"
+    "                                   be disabled if cus absolutely must not\n"
+    "                                   be larger than limited by the search.\n"
+    "                                   [enabled]\n"
+    "      --force-inter          : Force the encoder to use inter always.\n"
+    "                               This is mostly for debugging and is not\n"
+    "                               guaranteed to produce sensible bitstream or\n"
+    "                               work at all. [disabled]\n"
    "      --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n"
    "      --(no-)bipred          : Bi-prediction [disabled]\n"
    "      --cu-split-termination <string> : CU split search termination [zero]\n"
--- a/src/encmain.c
+++ b/src/encmain.c
@ -441,6 +441,7 @@ int main(int argc, char *argv[])
  FILE *input  = NULL; //!< input file (YUV)
  FILE *output = NULL; //!< output file (HEVC NAL stream)
  FILE *recout = NULL; //!< reconstructed YUV output, --debug
+  FILE *roifile = NULL;
  clock_t start_time = clock();
  clock_t encoding_start_cpu_time;
  KVZ_CLOCK_T encoding_start_real_time;
@ -584,7 +585,7 @@ int main(int argc, char *argv[])
    // Give arguments via struct to the input thread
    input_handler_args in_args = {
      .available_input_slots = available_input_slots,
-      .filled_input_slots    = filled_input_slots,
+      .filled_input_slots = filled_input_slots,

      .input = input,
      .api = api,
@ -825,6 +826,7 @@ done:
  if (input)  fclose(input);
  if (output) fclose(output);
  if (recout) fclose(recout);
+  if (roifile) fclose(roifile);

  DBG_YUVIEW_CLEANUP();
  CHECKPOINTS_FINALIZE();
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@ -581,7 +581,7 @@ static void encode_transform_coeff(encoder_state_t * const state,

      // cu_qp_delta_abs prefix
      cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0];
-      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5);
+      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL);

      if (qp_delta_abs >= 5) {
        // cu_qp_delta_abs suffix
@ -610,17 +610,19 @@ static void encode_transform_coeff(encoder_state_t * const state,
 * \param depth           Depth from LCU.
 * \return if non-zero mvd is coded
 */
-static bool encode_inter_prediction_unit(encoder_state_t * const state,
-                                         cabac_data_t * const cabac,
-                                         const cu_info_t * const cur_cu,
-                                         int x, int y, int width, int height,
-                                         int depth)
+int kvz_encode_inter_prediction_unit(encoder_state_t * const state,
+                                      cabac_data_t * const cabac,
+                                      const cu_info_t * const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, lcu_t* lcu, double* bits_out)
 {
  // Mergeflag
  int16_t num_cand = 0;
  bool non_zero_mvd = false;
-  cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
-  CABAC_BIN(cabac, cur_cu->merged, "MergeFlag");
+  double bits = 0;
+
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag");
+
  num_cand = state->encoder_control->cfg.max_merge;
  if (cur_cu->merged) { //merge
    if (num_cand > 1) {
@ -628,10 +630,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
      for (ui = 0; ui < num_cand - 1; ui++) {
        int32_t symbol = (ui != cur_cu->merge_idx);
        if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
        } else {
          CABAC_BIN_EP(cabac,symbol,"MergeIndex");
+          if(cabac->only_count) bits += 1;
        }
        if (symbol == 0) break;
      }
@ -650,12 +652,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
      if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4
        uint32_t inter_dir_ctx = (7 - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height) + 1) >> 1));

-        cabac->cur_ctx = &(cabac->ctx.inter_dir[inter_dir_ctx]);
-        CABAC_BIN(cabac, (inter_dir == 3), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc");
      }
      if (inter_dir < 3) {
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[5]);
-        CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[5]), (inter_dir == 2), bits, "inter_pred_idc");
      }
   }

@ -674,20 +674,21 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
      if (ref_LX_size > 1) {
        // parseRefFrmIdx
        int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx];
-
-        cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-        CABAC_BIN(cabac, (ref_frame > 0), "ref_idx_lX");
+        
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");

        if (ref_frame > 0 && ref_LX_size > 2) {
          cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
-          CABAC_BIN(cabac, (ref_frame > 1), "ref_idx_lX");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), (ref_frame > 1), bits, "ref_idx_lX");

          if (ref_frame > 1 && ref_LX_size > 3) {
            for (int idx = 3; idx < ref_LX_size; idx++)
            {
              uint8_t val = (ref_frame > idx - 1) ? 1 : 0;
              CABAC_BIN_EP(cabac, val, "ref_idx_lX");
+              if (cabac->only_count) bits += 1;
              if (!val) break;
+
            }
          }
        }
@ -697,28 +698,37 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
      if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) {

        mv_t mv_cand[2][2];
-        kvz_inter_get_mv_cand_cua(
+        if (lcu) {
+          kvz_inter_get_mv_cand(
+            state, 
+            x, y, width, height,
+            mv_cand, cur_cu, 
+            lcu, ref_list_idx);
+        }
+        else {
+          kvz_inter_get_mv_cand_cua(
            state,
            x, y, width, height,
-            mv_cand, cur_cu, ref_list_idx);
+            mv_cand, cur_cu, ref_list_idx
+          );
+        }

        uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx);
        mv_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0];
        mv_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1];

        kvz_change_precision(INTERNAL_MV_PREC, kvz_g_imv_to_prec[KVZ_IMV_OFF], &mvd_hor, &mvd_ver);
-
-        kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver);
+        kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out);

        non_zero_mvd |= (mvd_hor != 0) || (mvd_ver != 0);
      }

      // Signal which candidate MV to use
-      cabac->cur_ctx = &(cabac->ctx.mvp_idx_model);
-      CABAC_BIN(cabac, CU_GET_MV_CAND(cur_cu, ref_list_idx), "mvp_flag");
+      CABAC_FBITS_UPDATE(cabac,&(cabac->ctx.mvp_idx_model), CU_GET_MV_CAND(cur_cu, ref_list_idx), bits, "mvp_flag");

    } // for ref_list
  } // if !merge
+  if(bits_out) *bits_out += bits;
  return non_zero_mvd;
 }

@ -807,7 +817,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
 static void encode_intra_coding_unit(encoder_state_t * const state,
                                     cabac_data_t * const cabac,
                                     const cu_info_t * const cur_cu,
-                                     int x, int y, int depth, lcu_coeff_t* coeff)
+                                     int x, int y, int depth, lcu_t* lcu, lcu_coeff_t* coeff, double* bits_out)
 {
  const videoframe_t * const frame = state->tile->frame;
  uint8_t intra_pred_mode_actual[4];
@ -1050,6 +1060,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,

        kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT);
      }
+      if (cabac->only_count && bits_out) *bits_out += 5;
    }
  }

@ -1057,14 +1068,17 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) {
    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
  }
+  // When only counting bits, the cost of the transform coefficients is
+  // computed separately so that the distortion is obtained at the same time.
+  if (!cabac->only_count) {
+    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);

-  encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
+    encode_mts_idx(state, cabac, cur_cu);

-  encode_mts_idx(state, cabac, cur_cu);
-
-  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
-    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
+    if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
+      encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
+      encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
+    }
  }

 }
@ -1105,32 +1119,32 @@ static void encode_part_mode(encoder_state_t * const state,
  //  log2CbSize == MinCbLog2SizeY |  0  1  2  bypass
  //  log2CbSize >  MinCbLog2SizeY |  0  1  3  bypass
  // ------------------------------+------------------
-
+  double bits = 0;
  if (cur_cu->type == CU_INTRA) {
    if (depth == MAX_DEPTH) {
      cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
      if (cur_cu->part_size == SIZE_2Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
      } else {
-        CABAC_BIN(cabac, 0, "part_mode NxN");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN");
      }
    }
  } else {

    cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
    if (cur_cu->part_size == SIZE_2Nx2N) {
-      CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
-      return;
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
+      return bits;
    }
-    CABAC_BIN(cabac, 0, "part_mode split");
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split");

    cabac->cur_ctx = &(cabac->ctx.part_size_model[1]);
    if (cur_cu->part_size == SIZE_2NxN ||
        cur_cu->part_size == SIZE_2NxnU ||
        cur_cu->part_size == SIZE_2NxnD) {
-      CABAC_BIN(cabac, 1, "part_mode vertical");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical");
    } else {
-      CABAC_BIN(cabac, 0, "part_mode horizontal");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal");
    }

    if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) {
@ -1138,19 +1152,22 @@ static void encode_part_mode(encoder_state_t * const state,

      if (cur_cu->part_size == SIZE_2NxN ||
          cur_cu->part_size == SIZE_Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode SMP");
-        return;
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP");
+        return bits;
      }
-      CABAC_BIN(cabac, 0, "part_mode AMP");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP");

      if (cur_cu->part_size == SIZE_2NxnU ||
          cur_cu->part_size == SIZE_nLx2N) {
        CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP");
+        if(cabac->only_count) bits += 1;
      } else {
        CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP");
+        if(cabac->only_count) bits += 1;
      }
    }
  }
+  return bits;
 }
 **/

@ -1191,7 +1208,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state,
  bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
  bool border = border_x || border_y; /*!< are we in any border CU */

-  if (depth <= ctrl->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
    state->must_code_qp_delta = true;
  }

@ -1456,7 +1473,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state,
      const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
      const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);

-      non_zero_mvd |= encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth);
+      non_zero_mvd |= kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL);
      DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu);
      kvz_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu);
    }
@ -1494,7 +1511,7 @@ void kvz_encode_coding_tree(encoder_state_t * const state,

    }
  } else if (cur_cu->type == CU_INTRA) {
-    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, coeff);
+    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, coeff, NULL);
  }

  else {
@ -1511,11 +1528,128 @@ end:

 }

+double kvz_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu) {
+  double bits = 0;
+  const encoder_control_t* const ctrl = state->encoder_control;
+
+  int x_local = SUB_SCU(x);
+  int y_local = SUB_SCU(y);
+
+  const int cu_width = LCU_WIDTH >> depth;
+  
+  const cu_info_t* left_cu = NULL, *above_cu = NULL;
+  if (x) {
+    left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local);
+  }
+  if (y) {
+    above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1);
+  }
+  uint8_t split_model = 0;
+
+  // Absolute coordinates
+  uint16_t abs_x = x + state->tile->offset_x;
+  uint16_t abs_y = y + state->tile->offset_y;
+
+  // Check for slice border
+  bool border_x = ctrl->in.width < abs_x + cu_width;
+  bool border_y = ctrl->in.height < abs_y + cu_width;
+  bool border = border_x || border_y; /*!< are we in any border CU */
+
+  if (depth <= state->frame->max_qp_delta_depth) {
+    state->must_code_qp_delta = true;
+  }
+
+  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
+  if (depth != MAX_DEPTH) {
+    // Implicit split flag when on border
+    if (!border) {
+      // Get left and top block split_flags and if they are present and true, increase model number
+      if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) {
+        split_model++;
+      }
+
+      if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) {
+        split_model++;
+      }
+
+      // This mocks encoding the current CU, so it should never be split
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), 0, bits, "SplitFlag");
+    }
+  }
+
+  // Encode skip flag
+  if (state->frame->slicetype != KVZ_SLICE_I) {
+    int8_t ctx_skip = 0;
+
+    if (left_cu && left_cu->skipped) {
+      ctx_skip++;
+    }
+    if (above_cu && above_cu->skipped) {
+      ctx_skip++;
+    }
+    
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag");
+
+    if (cur_cu->skipped) {
+      int16_t num_cand = state->encoder_control->cfg.max_merge;
+      if (num_cand > 1) {
+        for (int ui = 0; ui < num_cand - 1; ui++) {
+          int32_t symbol = (ui != cur_cu->merge_idx);
+          if (ui == 0) {
+            CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
+          }
+          else {
+            CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+            if(cabac->only_count) bits += 1;
+          }
+          if (symbol == 0) {
+            break;
+          }
+        }
+      }
+      return bits;
+    }
+  }
+  // Prediction mode
+  if (state->frame->slicetype != KVZ_SLICE_I && cu_width != 4) {
+
+    int8_t ctx_predmode = 0;
+
+    if ((left_cu && left_cu->type == CU_INTRA) || (above_cu && above_cu->type == CU_INTRA)) {
+      ctx_predmode = 1;
+    }
+
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model[ctx_predmode]), (cur_cu->type == CU_INTRA), bits, "PredMode");
+  }
+  
+  if (cur_cu->type == CU_INTER) {
+    const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size];
+
+    for (int i = 0; i < num_pu; ++i) {
+      const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i);
+      const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i);
+      const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i);
+      const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
+      const cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y));
+
+      kvz_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, lcu, &bits);
+    }
+  }
+  else if (cur_cu->type == CU_INTRA) {
+    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, NULL, &bits);
+  }
+  return bits;
+}
+

 void kvz_encode_mvd(encoder_state_t * const state,
                    cabac_data_t *cabac,
                    int32_t mvd_hor,
-                    int32_t mvd_ver)
+                    int32_t mvd_ver, double* bits_out)
 {
  const int8_t hor_abs_gr0 = mvd_hor != 0;
  const int8_t ver_abs_gr0 = mvd_ver != 0;
@ -1523,29 +1657,33 @@ void kvz_encode_mvd(encoder_state_t * const state,
  const uint32_t mvd_ver_abs = abs(mvd_ver);

  cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0];
-  CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
-  CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");
+  CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor");
+  CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver");

  cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1];
  if (hor_abs_gr0) {
-    CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
+    CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor");
  }
  if (ver_abs_gr0) {
-    CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
+    CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver");
  }

  if (hor_abs_gr0) {
    if (mvd_hor_abs > 1) {
-      kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      if(cabac->only_count) *bits_out += bits;
    }
    uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1;
    CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor");
+    if (cabac->only_count) *bits_out += 1;
  }
  if (ver_abs_gr0) {
    if (mvd_ver_abs > 1) {
-      kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      uint32_t bits = kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      if (cabac->only_count) *bits_out += bits;
    }
    uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1;
    CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver");
+    if (cabac->only_count) *bits_out += 1;
  }
 }
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@ -56,7 +56,22 @@ void kvz_encode_ts_residual(encoder_state_t* const state,
 void kvz_encode_mvd(encoder_state_t * const state,
                    cabac_data_t *cabac,
                    int32_t mvd_hor,
-                    int32_t mvd_ver);
+                    int32_t mvd_ver,
+                    double* bits_out);
+
+double kvz_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu);
+
+int kvz_encode_inter_prediction_unit(encoder_state_t* const state,
+                                      cabac_data_t* const cabac,
+                                      const cu_info_t* const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, 
+                                      lcu_t* lcu,
+                                      double* bits_out);

 void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
  uint8_t lastpos_x, uint8_t lastpos_y,
--- a/src/encoder.c
+++ b/src/encoder.c
@ -32,7 +32,6 @@

 #include "encoder.h"

-// This define is required for M_PI on Windows.
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <stdio.h>
@ -45,14 +44,6 @@
 #include "kvz_math.h"
 #include "fast_coeff_cost.h"

-/**
- * \brief Strength of QP adjustments when using adaptive QP for 360 video.
- *
- * Determined empirically.
- */
-static const double ERP_AQP_STRENGTH = 3.0;
-
-
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);

 static unsigned cfg_num_threads(void)
@ -136,22 +127,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder)
 }


-/**
- * \brief Return weight for 360 degree ERP video
- *
- * Returns the scaling factor of area from equirectangular projection to
- * spherical surface.
- *
- * \param y   y-coordinate of the pixel
- * \param h   height of the picture
- */
-static double ws_weight(int y, int h)
-{
-  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
-}
-
-
-
 /**
 * \brief Update ROI QPs for 360 video with equirectangular projection.
 *
@ -162,55 +137,6 @@ static double ws_weight(int y, int h)
 * \param orig_width    width of orig_roi
 * \param orig_height   height of orig_roi
 */
-static void init_erp_aqp_roi(encoder_control_t* encoder,
-                             int8_t *orig_roi,
-                             int32_t orig_width,
-                             int32_t orig_height)
-{
-  // Update ROI with WS-PSNR delta QPs.
-  int height = encoder->in.height_in_lcu;
-  int width  = orig_roi ? orig_width : 1;
-
-  int frame_height = encoder->in.real_height;
-
-  encoder->cfg.roi.width  = width;
-  encoder->cfg.roi.height = height;
-  encoder->cfg.roi.dqps   = calloc(width * height, sizeof(orig_roi[0]));
-
-  double total_weight = 0.0;
-  for (int y = 0; y < frame_height; y++) {
-    total_weight += ws_weight(y, frame_height);
-  }
-
-  for (int y_lcu = 0; y_lcu < height; y_lcu++) {
-    int y_orig = LCU_WIDTH * y_lcu;
-    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
-
-    double lcu_weight = 0.0;
-    for (int y = y_orig; y < y_orig + lcu_height; y++) {
-      lcu_weight += ws_weight(y, frame_height);
-    }
-    // Normalize.
-    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
-
-    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
-
-    if (orig_roi) {
-      // If a ROI array already exists, we copy the existing values to the
-      // new array while adding qp_delta to each.
-      int y_roi = y_lcu * orig_height / height;
-      for (int x = 0; x < width; x++) {
-        encoder->cfg.roi.dqps[x + y_lcu * width] =
-          CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta);
-      }
-
-    } else {
-      // Otherwise, simply write qp_delta to the ROI array.
-      encoder->cfg.roi.dqps[y_lcu] = qp_delta;
-    }
-  }
-}
-

 static int8_t* derive_chroma_QP_mapping_table(const kvz_config* const cfg, int i)
 {
@ -394,6 +320,16 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
    encoder->scaling_list.use_default_list = 1;
  }

+  // ROI / delta QP
+  if (cfg->roi.file_path) {
+    const char *mode[2] = { "r", "rb" };
+    encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]);
+    if (!encoder->roi_file) {
+      fprintf(stderr, "Could not open ROI file.\n");
+      goto init_failed;
+    }
+  }
+
  if (cfg->fast_coeff_table_fn) {
    FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
    if (fast_coeff_table_f == NULL) {
@ -435,32 +371,10 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
    goto init_failed;
  }

-  if (cfg->erp_aqp) {
-    init_erp_aqp_roi(encoder,
-                     cfg->roi.dqps,
-                     cfg->roi.width,
-                     cfg->roi.height);
-
-  } else if (cfg->roi.dqps) {
-    // Copy delta QP array for ROI coding.
-    const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height;
-    encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0]));
-    memcpy(encoder->cfg.roi.dqps,
-           cfg->roi.dqps,
-           roi_size * sizeof(*cfg->roi.dqps));
-
-  }
-
  // NOTE: When tr_depth_inter is equal to 0, the transform is still split
  // for SMP and AMP partition units.
  encoder->tr_depth_inter = 0;

-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
-    encoder->max_qp_delta_depth = 0;
-  } else {
-    encoder->max_qp_delta_depth = -1;
-  }
-
  //Tiles
  encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                          encoder->cfg.tiles_height_count > 1;
@ -761,7 +675,7 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)

  FREE_POINTER(encoder->tiles_tile_id);

-  FREE_POINTER(encoder->cfg.roi.dqps);
+  FREE_POINTER(encoder->cfg.roi.file_path);

  kvz_scalinglist_destroy(&encoder->scaling_list);

@ -773,6 +687,10 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)

  kvz_close_rdcost_outfiles();

+  if (encoder->roi_file) {
+    fclose(encoder->roi_file);
+  }
+
  free(encoder);
 }

--- a/src/encoder.h
+++ b/src/encoder.h
@ -130,7 +130,7 @@ typedef struct encoder_control_t
  //! Picture weights when GOP is used.
  double gop_layer_weights[MAX_GOP_LAYERS];

-  int8_t max_qp_delta_depth;
+  FILE *roi_file;

  int tr_depth_inter;

--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -805,10 +805,10 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream,
  WRITE_U(stream, 0, 1, "pps_ref_wraparound_enabled_flag");

  WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pps_init_qp_minus26");
-  WRITE_U(stream, encoder->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
-  if (encoder->max_qp_delta_depth >= 0) {
+  WRITE_U(stream, state->frame->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
+  if (state->frame->max_qp_delta_depth >= 0) {
    // Use separate QP for each LCU when rate control is enabled.    
-    WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth");
+    WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth");
  }

  WRITE_U(stream, 0,1, "pps_chroma_tool_offsets_present_flag");
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -32,6 +32,9 @@

 #include "encoderstate.h"

+ // This define is required for M_PI on Windows.
+#define _USE_MATH_DEFINES
+#include <ctype.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -53,6 +56,13 @@

 #include "strategies/strategies-picture.h"

+/**
+ * \brief Strength of QP adjustments when using adaptive QP for 360 video.
+ *
+ * Used by init_erp_aqp_roi as the multiplier of the negated base-2
+ * logarithm of the normalized LCU row weight when deriving a delta QP.
+ *
+ * Determined empirically.
+ */
+static const double ERP_AQP_STRENGTH = 3.0;
+
+

 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
  int i;
@ -572,7 +582,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
  cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y);
  const int cu_width = LCU_WIDTH >> depth;

-  if (depth <= state->encoder_control->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
    *prev_qp = -1;
  }

@ -665,7 +675,7 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)

  encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);

-  if (encoder->max_qp_delta_depth >= 0) {
+  if (state->frame->max_qp_delta_depth >= 0) {
    int last_qp = state->last_qp;
    int prev_qp = -1;
    set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
@ -716,6 +726,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
  const uint64_t existing_bits = kvz_bitstream_tell(&state->stream);

  //Encode SAO
+  state->cabac.update = 1;
  if (encoder->cfg.sao_type) {
    encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
  }
@ -771,6 +782,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
      kvz_cabac_start(&state->cabac);
    }
  }
+  state->cabac.update = 0;


  pthread_mutex_lock(&state->frame->rc_lock);
@ -1421,6 +1433,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
  }
 }

+
+/**
+ * \brief Return weight for 360 degree ERP video
+ *
+ * Returns the scaling factor of area from equirectangular projection to
+ * spherical surface.
+ *
+ * The weight is the cosine of an angle that grows linearly with y. For
+ * rows 0..h-1 the angle runs from -pi/2 + pi/(2h) to pi/2 - pi/(2h), so
+ * the weight is close to 1 for the middle rows of the picture and
+ * approaches 0 towards the top and bottom rows.
+ *
+ * \param y   y-coordinate of the pixel
+ * \param h   height of the picture (used as a divisor, expected to be > 0)
+ * \return    cos((y - 0.5 * h + 0.5) * pi / h)
+ */
+static double ws_weight(int y, int h)
+{
+  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
+}
+
+
+/**
+ * \brief Update ROI QPs for 360 video with equirectangular projection.
+ *
+ * Replaces the ROI map in frame->roi with a new map that has one row per
+ * LCU row of the picture. Every row gets a delta QP derived from the
+ * WS-PSNR weights of the pixel rows it covers.
+ *
+ * If the frame already has a ROI map, the new map keeps its width, each new
+ * row takes the values of the vertically corresponding original row with
+ * the delta QP added (clipped to [-51, 51]), and the original array is
+ * freed. Otherwise the new map is a single column of delta QPs.
+ *
+ * If memory for the new map cannot be allocated, an error is printed and
+ * frame->roi is left unchanged.
+ *
+ * \param encoder       encoder control
+ * \param frame         frame that will have the ROI map
+ */
+static void init_erp_aqp_roi(const encoder_control_t *encoder, kvz_picture *frame)
+{
+  int8_t *orig_roi    = frame->roi.roi_array;
+  int32_t orig_width  = frame->roi.width;
+  int32_t orig_height = frame->roi.height;
+
+  // Update ROI with WS-PSNR delta QPs.
+  int new_height = encoder->in.height_in_lcu;
+  int new_width = orig_roi ? orig_width : 1;
+  int8_t *new_array = calloc((size_t)new_width * (size_t)new_height, sizeof(*new_array));
+  if (!new_array) {
+    // Without this check the loop below would write through a NULL pointer.
+    fprintf(stderr, "Failed to allocate memory for ERP AQP ROI table.\n");
+    return;
+  }
+
+  int frame_height = encoder->in.real_height;
+
+  double total_weight = 0.0;
+  for (int y = 0; y < frame_height; y++) {
+    total_weight += ws_weight(y, frame_height);
+  }
+
+  for (int y_lcu = 0; y_lcu < new_height; y_lcu++) {
+    int y_orig = LCU_WIDTH * y_lcu;
+    // The last LCU row may be cut short by the bottom edge of the picture.
+    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
+
+    double lcu_weight = 0.0;
+    for (int y = y_orig; y < y_orig + lcu_height; y++) {
+      lcu_weight += ws_weight(y, frame_height);
+    }
+    // Normalize: mean weight of this LCU row relative to the mean weight
+    // of the whole picture.
+    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
+
+    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
+
+    if (orig_roi) {
+      // If a ROI array already exists, we copy the existing values to the
+      // new array while adding qp_delta to each.
+      int y_roi = y_lcu * orig_height / new_height;
+      for (int x = 0; x < new_width; x++) {
+        new_array[x + y_lcu * new_width] =
+          CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta);
+      }
+
+    } else {
+      // Otherwise, simply write qp_delta to the ROI array.
+      new_array[y_lcu] = qp_delta;
+    }
+  }
+
+  // Update new values
+  frame->roi.width = new_width;
+  frame->roi.height = new_height;
+  frame->roi.roi_array = new_array;
+  FREE_POINTER(orig_roi);
+}
+
+
+/**
+ * \brief Read the next ROI (delta QP) map from a ROI file into a picture.
+ *
+ * The ROI description is as follows: first number is width, second number
+ * is height, then follows width * height number of dqp values.
+ * In text format all numbers are decimal integers and the dqp values are
+ * clipped to [-51, 51]. In binary format width and height are read as two
+ * 32-bit integers and every dqp value as one byte, stored as read.
+ *
+ * On success the previous frame->roi.roi_array is freed and frame->roi is
+ * replaced with the map that was read. On failure an error is printed and
+ * assert(0) is raised; when assertions are compiled out the function
+ * returns with frame->roi unchanged.
+ *
+ * The stream is never closed here: it is owned by the encoder control,
+ * which closes it in kvz_encoder_control_free.
+ *
+ * \param frame   picture that receives the ROI map
+ * \param file    open ROI file positioned at the start of a ROI frame
+ * \param format  KVZ_ROI_TXT or KVZ_ROI_BIN
+ */
+static void next_roi_frame_from_file(kvz_picture *frame, FILE *file, enum kvz_roi_format format) {
+  // Rewind the (seekable) ROI file when end of file is reached.
+  // Allows a single ROI frame to be used for a whole sequence
+  // and looping with --loop-input. Skips possible whitespace.
+  if (ftell(file) != -1L) {
+    int c = fgetc(file);
+    while (format == KVZ_ROI_TXT && isspace(c)) c = fgetc(file);
+    ungetc(c, file);
+    if (c == EOF) rewind(file);
+  }
+
+  int width  = 0;
+  int height = 0;
+  bool failed = false;
+
+  if (format == KVZ_ROI_TXT) {
+    // fscanf returns EOF (which is non-zero) at end of input, so the
+    // result must be compared against the expected conversion count.
+    failed = fscanf(file, "%d", &width) != 1 || fscanf(file, "%d", &height) != 1;
+  } else if (format == KVZ_ROI_BIN) {
+    // Read into a fixed-width buffer instead of relying on the layout
+    // of frame->roi and on the size of int.
+    int32_t dims[2] = { 0, 0 };
+    failed = fread(dims, sizeof(dims[0]), 2, file) != 2;
+    width  = dims[0];
+    height = dims[1];
+  }
+
+  if (failed) {
+    fprintf(stderr, "Failed to read ROI size.\n");
+    assert(0);
+    return;
+  }
+
+  if (width <= 0 || height <= 0) {
+    fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height);
+    assert(0);
+    return;
+  }
+
+  if (width > 10000 || height > 10000) {
+    fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
+    assert(0);
+    return;
+  }
+
+  const size_t size = (size_t)width * (size_t)height;
+  int8_t *dqp_array = calloc(size, sizeof(*dqp_array));
+  if (!dqp_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    assert(0);
+    return;
+  }
+
+  if (format == KVZ_ROI_TXT) {
+    for (size_t i = 0; i < size; ++i) {
+      int number; // Need a pointer to int for fscanf
+      if (fscanf(file, "%d", &number) != 1) {
+        failed = true;
+        break;
+      }
+      dqp_array[i] = CLIP(-51, 51, number);
+    }
+  } else if (format == KVZ_ROI_BIN) {
+    failed = fread(dqp_array, 1, size, file) != size;
+  }
+
+  if (failed) {
+    fprintf(stderr, "Reading ROI file failed.\n");
+    free(dqp_array);
+    assert(0);
+    return;
+  }
+
+  // Replace the old map only after the new one has been read completely.
+  FREE_POINTER(frame->roi.roi_array);
+  frame->roi.width = width;
+  frame->roi.height = height;
+  frame->roi.roi_array = dqp_array;
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
  assert(state->type == ENCODER_STATE_TYPE_MAIN);

@ -1437,6 +1597,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
    memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu);
  }

+  // ROI / delta QP maps
+  if (frame->roi.roi_array && cfg->roi.file_path) {
+    assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified.");
+  }
+
+  // Read frame from the file. If no file is specified,
+  // ROI data should be already set by the application.
+  if (cfg->roi.file_path) {
+    next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format);
+  }
+  
+  if (cfg->erp_aqp) {
+    init_erp_aqp_roi(state->encoder_control, state->tile->frame->source);
+  }
+
  // Variance adaptive quantization
  if (cfg->vaq) {
    const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
@ -1523,6 +1698,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
  }
  // Variance adaptive quantization - END

+  if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) {
+    state->frame->max_qp_delta_depth = 0;
+  } else {
+    state->frame->max_qp_delta_depth = -1;
+  }
+
  // Use this flag to handle closed gop irap picture selection.
  // If set to true, irap is already set and we avoid
  // setting it based on the intra period
@ -1834,10 +2015,9 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y)

 int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp)
 {
-  const encoder_control_t *ctrl = state->encoder_control;
  const cu_array_t *cua = state->tile->frame->cu_array;
  // Quantization group width
-  const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth);
+  const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth);

  // Coordinates of the top-left corner of the quantization group
  const int x_qg = x & ~(qg_width - 1);
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@ -179,6 +179,8 @@ typedef struct encoder_state_config_frame_t {
  */
  double *aq_offsets;

+  int8_t max_qp_delta_depth;
+
  /**
   * \brief Whether next NAL is the first NAL in the access unit.
   */
@ -320,6 +322,7 @@ typedef struct encoder_state_t {
  
  bitstream_t stream;
  cabac_data_t cabac;
+  cabac_data_t search_cabac;

  uint32_t stats_bitstream_length; //Bitstream length written in bytes

@ -402,10 +405,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
 */
 static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) return false;
+  if (state->frame->max_qp_delta_depth < 0) return false;

  const int cu_width = LCU_WIDTH >> depth;
-  const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth;
+  const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
  const int right  = x + cu_width;
  const int bottom = y + cu_width;
  return (right % qg_width == 0 || right >= state->tile->frame->width) &&
--- a/src/fast_coeff_cost.c
+++ b/src/fast_coeff_cost.c
@ -40,7 +40,7 @@ static uint16_t to_q88(float f)
  return (uint16_t)(f * 256.0f + 0.5f);
 }

-static uint64_t to_4xq88(const float f[4])
+static uint64_t to_4xq88(const double f[4])
 {
  int i;
  uint64_t result = 0;
@ -58,9 +58,9 @@ int kvz_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_
  uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp;

  for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) {
-    float curr_wts[4];
+    double curr_wts[4];

-    if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0,
+    if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0,
                                                    curr_wts + 1,
                                                    curr_wts + 2,
                                                    curr_wts + 3) != 4) {
--- a/src/fast_coeff_cost.h
+++ b/src/fast_coeff_cost.h
@ -45,7 +45,7 @@ typedef struct {

 // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from
 // 0 to MAX_FAST_COEFF_COST_QP
-static const float default_fast_coeff_cost_wts[][4] = {
+static const double default_fast_coeff_cost_wts[][4] = {
  // Just extend it by stretching the first actual values..
  {0.164240f, 4.161530f, 3.509033f, 6.928047f},
  {0.164240f, 4.161530f, 3.509033f, 6.928047f},
--- a/src/filter.c
+++ b/src/filter.c
@ -339,7 +339,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir)

 static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) {
+  if (state->frame->max_qp_delta_depth < 0) {
    return state->qp;
  }

--- a/src/image.c
+++ b/src/image.c
@ -106,6 +106,10 @@ kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_

  im->interlacing = KVZ_INTERLACING_NONE;

+  im->roi.roi_array = NULL;
+  im->roi.width = 0;
+  im->roi.height = 0;
+
  return im;
 }

@ -132,6 +136,7 @@ void kvz_image_free(kvz_picture *const im)
    kvz_image_free(im->base_image);
  } else {
    free(im->fulldata_buf);
+    if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array);
  }

  // Make sure freed data won't be used.
@ -192,6 +197,8 @@ kvz_picture *kvz_image_make_subimage(kvz_picture *const orig_image,
  im->pts = 0;
  im->dts = 0;

+  im->roi = orig_image->roi;
+
  return im;
 }

--- a/src/inter.c
+++ b/src/inter.c
@ -1290,7 +1290,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
                                        int32_t width,
                                        int32_t height,
                                        const merge_candidates_t *merge_cand,
-                                        const cu_info_t *cur_cu,
+                                        const cu_info_t * const cur_cu,
                                        int8_t reflist,
                                        mv_t mv_cand[2][2])
 {
@ -1396,7 +1396,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                           int32_t width,
                           int32_t height,
                           mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t  * const cur_cu,
                           lcu_t *lcu,
                           int8_t reflist)
 {
--- a/src/inter.h
+++ b/src/inter.h
@ -96,7 +96,7 @@ void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                           int32_t width,
                           int32_t height,
                           mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t* cur_cu,
                           lcu_t *lcu,
                           int8_t reflist);

--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@ -267,6 +267,12 @@ enum kvz_amvr_resolution
  KVZ_IMV_HPEL    = 3
 };

+/**
+ * \brief File format of a ROI (delta QP) file.
+ *
+ * The numeric values are used as an index into the table of fopen modes
+ * when the ROI file is opened, so they must stay 0 and 1.
+ */
+enum kvz_roi_format
+{
+  KVZ_ROI_TXT = 0, /*!< \brief Text file, opened with mode "r". */
+  KVZ_ROI_BIN = 1  /*!< \brief Binary file, opened with mode "rb". */
+};
+
 // Map from input format to chroma format.
 #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)format)

@ -410,10 +416,9 @@ typedef struct kvz_config
  int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */

  struct {
-    int32_t width;
-    int32_t height;
-    int8_t *dqps;
-  } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */
+    char *file_path;
+    enum kvz_roi_format format;
+  } roi; /*!< \brief Specify delta QPs for region of interest coding. */

  unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */

@ -526,6 +531,12 @@ typedef struct kvz_config
  int8_t cclm;

  int8_t amvr; /* \brief Adaptive motion vector resolution parameter */
+
+  /** \brief Whether to try combining intra CUs at the lower depth when search
+   *         is not performed at said depth. */
+  uint8_t combine_intra_cus;
+
+  uint8_t force_inter;
 } kvz_config;

 /**
@ -557,6 +568,14 @@ typedef struct kvz_picture {
  enum kvz_chroma_format chroma_format;

  int32_t ref_pocs[16];
+
+  struct
+  {
+    int width;
+    int height;
+    int8_t *roi_array;
+  } roi;
+
 } kvz_picture;

 /**
@ -782,6 +801,9 @@ typedef struct kvz_api {
   * the bitstream, length of the bitstream, the reconstructed frame, the
   * original frame and frame info in data_out, len_out, pic_out, src_out and
   * info_out, respectively. Otherwise, set the output parameters to NULL.
+   * 
+   * Region of interest (ROI) / delta QP map can be specified in the input
+   * picture's ROI field but only when a ROI file is not used.
   *
   * After passing all of the input frames, the caller should keep calling this
   * function with pic_in set to NULL, until no more data is returned in the
--- a/src/rate_control.c
+++ b/src/rate_control.c
@ -1088,17 +1088,20 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
  const encoder_control_t * const ctrl = state->encoder_control;
  lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y);

-  if (ctrl->cfg.roi.dqps != NULL) {
-    vector2d_t lcu = {
+  if (state->tile->frame->source->roi.roi_array) {
+    vector2d_t lcu_vec = {
      pos.x + state->tile->lcu_offset_x,
      pos.y + state->tile->lcu_offset_y
    };
    vector2d_t roi = {
-      lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu,
-      lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu
+      lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu,
+      lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu
    };
-    int roi_index = roi.x + roi.y * ctrl->cfg.roi.width;
-    int dqp = ctrl->cfg.roi.dqps[roi_index];
+    int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width;
+    int dqp = state->tile->frame->source->roi.roi_array[roi_index];
+    if(dqp != 0) {
+      pos.x = 0;
+    }
    state->qp = CLIP_TO_QP(state->frame->QP + dqp);
    state->lambda = qp_to_lambda(state, state->qp);
    state->lambda_sqrt = sqrt(state->lambda);
--- a/src/rdo.c
+++ b/src/rdo.c
@ -315,12 +315,12 @@ static INLINE uint32_t get_coeff_cabac_cost(
  // Take a copy of the CABAC so that we don't overwrite the contexts when
  // counting the bits.
  cabac_data_t cabac_copy;
-  memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));

  // Clear bytes and bits and set mode to "count"
  cabac_copy.only_count = 1;
-  cabac_copy.num_buffered_bytes = 0;
-  cabac_copy.bits_left = 23;
+  int num_buffered_bytes = cabac_copy.num_buffered_bytes;
+  int bits_left = cabac_copy.bits_left;

  // Execute the coding function.
  // It is safe to drop the const modifier since state won't be modified
@ -343,8 +343,10 @@ static INLINE uint32_t get_coeff_cabac_cost(
      type,
      scan_mode);
  }
-
-  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
+  if(cabac_copy.update) {
+    memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy));
+  }
+  return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3);
 }

 static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
@ -1741,37 +1743,33 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
 /**
 * Calculate cost of actual motion vectors using CABAC coding
 */
-uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       const int32_t mvd_hor,
-                                       const int32_t mvd_ver)
+double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     const int32_t mvd_hor,
+                                     const int32_t mvd_ver)
 {
  cabac_data_t cabac_copy = *cabac;
  cabac_copy.only_count = 1;
-
+  double bits = 0;
  // It is safe to drop const here because cabac->only_count is set.
-  kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
+  kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits);

-  uint32_t bitcost =
-    ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
-    ((23 - cabac->bits_left)     + (cabac->num_buffered_bytes << 3));
-
-  return bitcost;
+  return bits;
 }

 /** MVD cost calculation with CABAC
 * \returns int
 * Calculates Motion Vector cost and related costs using CABAC coding
 */
-uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
-                                 int x,
-                                 int y,
-                                 int mv_shift,
-                                 mv_t mv_cand[2][2],
-                                 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                                 int16_t num_cand,
-                                 int32_t ref_idx,
-                                 uint32_t *bitcost)
+double kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
+                               int x,
+                               int y,
+                               int mv_shift,
+                               mv_t mv_cand[2][2],
+                               inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                               int16_t num_cand,
+                               int32_t ref_idx,
+                               double* bitcost)
 {
  cabac_data_t state_cabac_copy;
  cabac_data_t* cabac;
@ -1798,14 +1796,13 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
  }

  // Store cabac state and contexts
-  memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
+  memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t));

  // Clear bytes and bits and set mode to "count"
  state_cabac_copy.only_count = 1;
-  state_cabac_copy.num_buffered_bytes = 0;
-  state_cabac_copy.bits_left = 23;

  cabac = &state_cabac_copy;
+  double bits = 0;

  if (!merged) {
    vector2d_t mvd1 = {
@ -1820,8 +1817,8 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
    kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1);
    kvz_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2);

-    uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
-    uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
+    double cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
+    double cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);

    // Select candidate 1 if it has lower cost
    if (cand2_cost < cand1_cost) {
@ -1834,7 +1831,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,

  cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);

-  CABAC_BIN(cabac, merged, "MergeFlag");
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag");
  num_cand = state->encoder_control->cfg.max_merge;
  if (merged) {
    if (num_cand > 1) {
@ -1842,10 +1839,10 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
      for (ui = 0; ui < num_cand - 1; ui++) {
        int32_t symbol = (ui != merge_idx);
        if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
        } else {
          CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+          bits += 1;
        }
        if (symbol == 0) break;
      }
@ -1868,24 +1865,23 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
        if (ref_list[ref_list_idx] > 1) {
          // parseRefFrmIdx
          int32_t ref_frame = ref_idx;
-
-          cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-          CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
+          
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");

          if (ref_frame > 0) {
            int32_t i;
            int32_t ref_num = ref_list[ref_list_idx] - 2;
-
-            cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
+            
            ref_frame--;

            for (i = 0; i < ref_num; ++i) {
              const uint32_t symbol = (i == ref_frame) ? 0 : 1;

              if (i == 0) {
-                CABAC_BIN(cabac, symbol, "ref_idx_lX");
+                CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX");
              } else {
                CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
+                bits += 1;
              }
              if (symbol == 0) break;
            }
@ -1895,7 +1891,7 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
        // ToDo: Bidir vector support
        if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) {
          // It is safe to drop const here because cabac->only_count is set.
-          kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y);
+          kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits);
        }

        // Signal which candidate MV to use
@ -1905,10 +1901,10 @@ uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
    }
  }

-  *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
+  *bitcost = bits;

  // Store bitcost before restoring cabac
-  return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
+  return *bitcost * state->lambda_sqrt;
 }

 void kvz_close_rdcost_outfiles(void)
--- a/src/rdo.h
+++ b/src/rdo.h
@ -77,10 +77,10 @@ uint32_t kvz_get_coded_level(encoder_state_t * state, double* coded_cost, double

 kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;

-uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       int32_t mvd_hor,
-                                       int32_t mvd_ver);
+double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     int32_t mvd_hor,
+                                     int32_t mvd_ver);

 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
@ -90,8 +90,5 @@ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
 extern const uint32_t kvz_entropy_bits[512];
 #define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]

-// Floating point fractional bits, derived from kvz_entropy_bits
-extern const float kvz_f_entropy_bits[512];
-#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]

 #endif
--- a/src/sao.c
+++ b/src/sao.c
@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) {
 }


-static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
+/**
+ * \brief Estimate the bits needed to signal "no SAO" for a CTU.
+ *
+ * Adds one merge flag bin (value 0) for each available neighbour and one
+ * type index bin (value 0). The bins are counted through CABAC_FBITS_UPDATE
+ * on state->search_cabac, which is accessed through a non-const pointer.
+ *
+ * \param state     encoder state whose search_cabac contexts are used
+ * \param sao_top   SAO info of the CTU above, or NULL if not available
+ * \param sao_left  SAO info of the CTU to the left, or NULL if not available
+ * \return estimated number of bits
+ */
+static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  // The const qualifier of state is cast away here.
+  // NOTE(review): presumably CABAC_FBITS_UPDATE may modify the context
+  // state when cabac->update is set -- confirm against the macro definition.
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  if (sao_left != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }
  if (sao_top != NULL) {    
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }

  // TR coded type_idx_, none = 0
  ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+  CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type");

  return mode_bits;
 }

-static float sao_mode_bits_merge(const encoder_state_t * const state,
+/**
+ * \brief Estimate the bits needed to signal an SAO merge candidate.
+ *
+ * Counts one merge flag bin telling whether merge_cand is 1. If it is not,
+ * a second bin telling whether merge_cand is 2 is counted with the same
+ * context model.
+ *
+ * \param state       encoder state whose search_cabac contexts are used
+ * \param merge_cand  merge candidate; the values 1 and 2 are tested
+ * \return estimated number of bits
+ */
+static double sao_mode_bits_merge(const encoder_state_t * const state,
                                 int8_t merge_cand) {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  // The const qualifier of state is cast away here.
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  ctx = &(cabac->ctx.sao_merge_flag_model);

-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag");
  if (merge_cand == 1) return mode_bits;
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag");
  return mode_bits;
 }


-static float sao_mode_bits_edge(const encoder_state_t * const state,
+static double sao_mode_bits_edge(const encoder_state_t * const state,
                              int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES],
                              sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  if (sao_left != NULL) {
-    ctx = &(cabac->ctx.sao_merge_flag_model);   
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    ctx = &(cabac->ctx.sao_merge_flag_model);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }
  if (sao_top != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }

  // TR coded type_idx_, edge = 2 = cMax
  ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;

  // TR coded offsets.
  for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) {
@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state,
 }


-static float sao_mode_bits_band(const encoder_state_t * const state,
+static double sao_mode_bits_band(const encoder_state_t * const state,
                              int band_position[2], int offsets[10],
                              sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  if (sao_left != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }
  if (sao_top != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }

  // TR coded sao_type_idx_, band = 1
  ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;

  // TR coded offsets and possible FL coded offset signs.
  for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++)
@ -552,7 +554,8 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_
  // Choose between SAO and doing nothing, taking into account the
  // rate-distortion cost of coding do nothing.
  {
-    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5);
+    float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left);
+    int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5);
    if (sao_out->ddistortion >= cost_of_nothing) {
      sao_out->type = SAO_TYPE_NONE;
      merge_cost[0] = cost_of_nothing;
--- a/src/search.c
+++ b/src/search.c
@ -37,6 +37,7 @@

 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "imagelist.h"
 #include "inter.h"
 #include "intra.h"
@ -59,14 +60,6 @@
 // Cost threshold for doing intra search in inter frames with --rd=0.
 static const int INTRA_THRESHOLD = 8;

-// Modify weight of luma SSD.
-#ifndef LUMA_MULT
-# define LUMA_MULT 0.8
-#endif
-// Modify weight of chroma SSD.
-#ifndef CHROMA_MULT
-# define CHROMA_MULT 1.5
-#endif

 static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
@ -225,16 +218,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
  const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);

  double ssd = 0.0;
-  ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+  ssd += KVZ_LUMA_MULT * kvz_pixels_calc_ssd(
    &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
    LCU_WIDTH, LCU_WIDTH, cu_width
    );
  if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) {
-    ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+    ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd(
      &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
      LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
      );
-    ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+    ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd(
      &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
      LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
      );
@ -294,11 +287,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
 * prediction unit data needs to be coded.
 */
 double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu)
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu)
 {
  const int width = LCU_WIDTH >> depth;
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;

  // cur_cu is used for TU parameters.
  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
@ -324,14 +319,36 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
    return sum + tr_tree_bits * state->lambda;
  }

+
+  if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) {
+    // Because these need to be coded before the luma cbf they also need to be counted
+    // before the cabac state changes. However, since this branch is only executed when
+    // calculating the last RD cost it is not problem to include the chroma cbf costs in
+    // luma, because the chroma cost is calculated right after the luma cost.
+    // However, if we have different tr_depth, the bits cannot be written in correct
+    // order anyways so do not touch the chroma cbf here.
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+      cabac->cur_ctx = cr_ctx;
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
+      cr_ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
+    }
+  }
+
  // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
  if (pred_cu->type == CU_INTRA ||
-      tr_depth > 0 ||
+      is_tr_split ||
      cbf_is_set(tr_cu->cbf, depth, COLOR_U) ||
      cbf_is_set(tr_cu->cbf, depth, COLOR_V))
  {
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[0]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y));
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
+    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
  }

  // SSD between reconstruction and original
@ -343,7 +360,8 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
                                        width);
  }

-  {
+
+  if (!skip_residual_coding) {
    int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
    const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];

@ -351,18 +369,19 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
  }

  double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LUMA_MULT + bits * state->lambda;
+  return (double)ssd * KVZ_LUMA_MULT + bits * state->lambda;
 }


 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu)
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu)
 {
  const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
  const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);

  double tr_tree_bits = 0;
  double joint_cbcr_tr_tree_bits = 0;
@ -378,22 +397,27 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
    return 0;
  }

-  if (depth < MAX_PU_DEPTH) {
+  // See luma for why the second condition
+  if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) {
    const int tr_depth = depth - pred_cu->depth;
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+    cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+    cabac->cur_ctx = ctx;
    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
    }
    if(state->encoder_control->cfg.jccr) {
      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1);
    }
    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
-    ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]);
+    ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]);
    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
    }
    if(state->encoder_control->cfg.jccr) {
-      ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
+      ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1);
    }
  }
@ -401,7 +425,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,

  if (tr_cu->tr_depth > depth) {
    int offset = LCU_WIDTH >> (depth + 1);
-    int sum = 0;
+    double sum = 0;

    sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
    sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
@ -448,6 +472,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
    }
  }

+  if (!skip_residual_coding)
  {
    int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
    const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
@ -464,8 +489,8 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
  double bits = tr_tree_bits + coeff_bits;
  double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits;

-  double cost = (double)ssd + bits * state->c_lambda;
-  double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda;
+  double cost = (double)ssd * KVZ_CHROMA_MULT + bits * state->c_lambda;
+  double joint_cost = (double)joint_ssd * KVZ_CHROMA_MULT + joint_bits * state->c_lambda;
  if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) {
    pred_cu->joint_cb_cr = 0;
    return cost;    
@ -485,6 +510,117 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
  return joint_cost;
 }

+/**
+ * \brief Calculate the RD cost of the transform tree of a CU.
+ *
+ * The cost is the weighted luma and chroma SSD between lcu->ref and lcu->rec
+ * plus lambda times the estimated bits of the coded block flags and the
+ * coefficients. When the transform tree is split below the given depth, the
+ * function recurses into the four quadrants and only adds the flag bits of
+ * the current level on top of their sum.
+ *
+ * The flag bits are counted with CABAC_FBITS_UPDATE on state->search_cabac,
+ * which is accessed through a non-const pointer.
+ *
+ * \param state    encoder state
+ * \param x_px     x-coordinate inside the LCU in luma pixels
+ * \param y_px     y-coordinate inside the LCU in luma pixels
+ * \param depth    depth of the block being evaluated
+ * \param pred_cu  CU providing type, cbf, skip/merge flags and intra modes
+ * \param lcu      LCU holding reference, reconstruction and coefficients
+ * \return luma SSD * KVZ_LUMA_MULT + chroma SSD * KVZ_CHROMA_MULT +
+ *         bits * state->lambda
+ */
+static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
+                                           const int x_px, const int y_px, const int depth,
+                                           const cu_info_t* const pred_cu,
+                                           lcu_t* const lcu) {
+  const int width = LCU_WIDTH >> depth;
+
+  // No residual is coded when the CU is skipped or when it is an inter CU
+  // with no cbf bits set.
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  // tr_cu is used for TU parameters.
+  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+
+  double coeff_bits = 0;
+  double tr_tree_bits = 0;
+
+  // Check that the coordinates are inside the LCU.
+  assert(x_px >= 0 && x_px < LCU_WIDTH);
+  assert(y_px >= 0 && y_px < LCU_WIDTH);
+
+  // Number of transform split levels remaining below this depth.
+  const uint8_t tr_depth = tr_cu->tr_depth - depth;
+
+  const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U);
+  const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V);
+
+  // The const qualifier of state is cast away here.
+  // NOTE(review): presumably CABAC_FBITS_UPDATE may modify the context
+  // state when cabac->update is set -- confirm against the macro definition.
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+
+  {
+    int cbf = cbf_is_set_any(pred_cu->cbf, depth);
+    // Only need to signal coded block flag if not skipped or merged
+    // skip = no coded residual, merge = coded residual
+    if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf");
+    }
+
+  }
+
+  // Chroma cbf flags are counted at the depth of the CU itself, and at
+  // deeper levels only when the flag of the level above is set.
+  if(state->encoder_control->chroma_format != KVZ_CSP_400 && !skip_residual_coding) {
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
+    } 
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr");
+    } 
+  }
+
+  // The transform tree is split further: the distortion and coefficient
+  // bits come from the four quadrants, this level only adds its flag bits.
+  if (tr_depth > 0) {
+    int offset = LCU_WIDTH >> (depth + 1);
+    double sum = 0;
+
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    return sum + tr_tree_bits * state->lambda;
+  }
+  const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ;
+
+  // Add transform_tree cbf_luma bit cost.
+  // is_tr_split is nonzero when this block is deeper than the CU itself.
+  const int is_tr_split = depth - tr_cu->depth;
+  if ((pred_cu->type == CU_INTRA ||
+    is_tr_split ||
+    cb_flag_u ||
+    cb_flag_v) 
+      && !skip_residual_coding)
+  {
+    // Context index is 1 at the depth of the CU and 0 for deeper blocks.
+    cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
+  }
+  // SSD between reconstruction and original
+  unsigned luma_ssd = 0;
+  if (!state->encoder_control->cfg.lossless) {
+    int index = y_px * LCU_WIDTH + x_px;
+    luma_ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
+      LCU_WIDTH, LCU_WIDTH,
+      width);
+  }
+
+  // Luma coefficient bits are added regardless of skip_residual_coding.
+  // NOTE(review): kvz_cu_rd_cost_luma guards the same call with
+  // !skip_residual_coding -- confirm that the difference is intended.
+  {
+    int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+    const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
+
+    coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode, tr_cu->tr_skip);
+  }
+
+  // Chroma is only processed for blocks whose position is a multiple of
+  // 8 luma pixels; the chroma coordinates are half of the luma coordinates.
+  unsigned chroma_ssd = 0;
+  if(state->encoder_control->chroma_format != KVZ_CSP_400 && x_px % 8 == 0 && y_px % 8 == 0) {
+    const vector2d_t lcu_px = { x_px / 2, y_px / 2 };
+    const int chroma_width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+    if (!state->encoder_control->cfg.lossless) {
+      int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
+      unsigned ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        chroma_width);
+      unsigned ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        chroma_width);
+      chroma_ssd = ssd_u + ssd_v;
+    }
+
+     {
+      int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+      const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
+
+      coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0);
+      coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0);
+    }
+  }
+
+  double bits = tr_tree_bits + coeff_bits;
+  return luma_ssd * KVZ_LUMA_MULT + chroma_ssd * KVZ_CHROMA_MULT + bits * state->lambda;
+}
+

 // Return estimate of bits used to code prediction mode of cur_cu.
 static double calc_mode_bits(const encoder_state_t *state,
@ -518,6 +654,7 @@ static double calc_mode_bits(const encoder_state_t *state,
 }


+// TODO: replace usages of this by the kvz_sort_keys_by_cost function.
 /**
 * \brief Sort modes and costs to ascending order according to costs.
 */
@ -567,6 +704,23 @@ void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict traf
  }
 }

+/**
+ * \brief Sort keys (indices) to ascending order according to costs.
+ *
+ * Reorders the first map->size entries of map->keys in place so that the
+ * costs they index in map->cost are in ascending order. The map->cost array
+ * itself is not modified. Keys with equal cost keep their relative order.
+ *
+ * \param map  map whose keys are sorted
+ */
+void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map)
+{
+  // Size of sorted arrays is expected to be "small". No need for faster algorithm.
+  // Insertion sort: keys[0..i-1] are already in order, insert keys[i].
+  for (uint8_t i = 1; i < map->size; ++i) {
+    const int8_t cur_indx = map->keys[i];
+    const double cur_cost = map->cost[cur_indx];
+    uint8_t j = i;
+    // Shift keys with a strictly higher cost one position to the right.
+    while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) {
+      map->keys[j] = map->keys[j - 1];
+      --j;
+    }
+    map->keys[j] = cur_indx;
+  }
+}


 static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth)
@ -592,10 +746,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
  const encoder_control_t* ctrl = state->encoder_control;
  const videoframe_t * const frame = state->tile->frame;
  int cu_width = LCU_WIDTH >> depth;
-  double cost = MAX_INT;
-  double inter_zero_coeff_cost = MAX_INT;
-  uint32_t inter_bitcost = MAX_INT;
+  double cost = MAX_DOUBLE;
+  double inter_zero_coeff_cost = MAX_DOUBLE;
+  double inter_bitcost = MAX_INT;
  cu_info_t *cur_cu;
+  cabac_data_t pre_search_cabac;
+  memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac));

  const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
  const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
@ -626,7 +782,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

  // Assign correct depth limit
  constraint_t* constr = state->constraint;
- if(constr->ml_intra_depth_ctu) {
+  if(constr->ml_intra_depth_ctu) {
    pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8];
    pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8];
  }
@ -670,7 +826,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

    if (can_use_inter) {
      double mode_cost;
-      uint32_t mode_bitcost;
+      double mode_bitcost;
      kvz_search_cu_inter(state,
                          x, y,
                          depth,
@ -721,12 +877,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

    int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max;
    bool can_use_intra =
-        WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
+      (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
        // When the split was forced because the CTU is partially outside
        // the frame, we permit intra coding even if pu_depth_intra would
        // otherwise forbid it.
        (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width ||
-        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height;
+        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) &&
+      !(state->encoder_control->cfg.force_inter && state->frame->slicetype != KVZ_SLICE_I);

    if (can_use_intra && !skip_intra) {
      int8_t intra_mode;
@ -737,6 +894,16 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
      bool mip_transposed = false;
      kvz_search_cu_intra(state, x, y, depth, lcu,
                          &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed);
+#ifdef COMPLETE_PRED_MODE_BITS
+      // Technically counting these bits would be correct, however counting
+      // them universally degrades quality so this block is disabled by default
+      if(state->frame->slicetype != KVZ_SLICE_I) {
+        double pred_mode_type_bits = 0;
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag");
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[kvz_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag");
+        intra_cost += pred_mode_type_bits * state->lambda;
+      }
+#endif
      if (intra_cost < cost) {
        cost = intra_cost;
        cur_cu->type = CU_INTRA;
@ -828,9 +995,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
          cur_cu->merged = 0;
          cur_cu->skipped = 1;
          // Selecting skip reduces bits needed to code the CU
-          if (inter_bitcost > 1) {
-            inter_bitcost -= 1;
-          }
+          int skip_ctx = kvz_get_skip_context(x, y, lcu, NULL, NULL);
+          inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1);
+          inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0);
+          inter_bitcost += cur_cu->merge_idx;        
        }
      }
      lcu_fill_inter(lcu, x_local, y_local, cu_width);
@ -839,20 +1007,26 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
  }

  if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
-    cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-      cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
+    double bits = 0;
+    cabac_data_t* cabac  = &state->search_cabac;
+    cabac->update = 1;
+
+    if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) {
+      bits += kvz_mock_encode_coding_unit(
+        state,
+        cabac,
+        x, y, depth,
+        lcu,
+        cur_cu);
    }
-
-    double mode_bits;
-    if (cur_cu->type == CU_INTRA) {
-      mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
-    } else {
-      mode_bits = inter_bitcost;
+    else {
+      assert(0);
    }
+    
+    cost = bits * state->lambda;

-    cost += mode_bits * state->lambda;
-
+    cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+    
    if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) {
      cost = inter_zero_coeff_cost;

@ -874,13 +1048,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
      cur_cu->cbf = 0;
      lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu);
    }
-  }
+    cabac->update = 0;
+  } 

  bool can_split_cu =
    // If the CU is partially outside the frame, we need to split it even
    // if pu_depth_intra and pu_depth_inter would not permit it.
    cur_cu->type == CU_NOTSET ||
-    depth < pu_depth_intra.max ||
+    (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != KVZ_SLICE_I)) ||
    (state->frame->slicetype != KVZ_SLICE_I &&
      depth < pu_depth_inter.max);

@ -889,21 +1064,27 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    int half_cu = cu_width / 2;
    double split_cost = 0.0;
    int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+    cabac_data_t post_seach_cabac;
+    memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+    memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac));
+    state->search_cabac.update = 1;
+
+    double split_bits = 0;

    if (depth < MAX_DEPTH) {
      // Add cost of cu_split_flag.
      uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-      cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-      split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;
+      cabac_ctx_t *ctx = &(state->search_cabac.ctx.split_flag_model[split_model]);
+      CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, split_bits, "split_search");
    }

    if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) {
      // Add cost of intra part_size.
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]);
-      cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;  // 2Nx2N
-      split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;  // NxN
+      cabac_ctx_t *ctx = &(state->search_cabac.ctx.part_size_model[0]);
+      CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, split_bits, "split_search");
    }
+    state->search_cabac.update = 0;
+    split_cost += split_bits * state->lambda;

    // If skip mode was selected for the block, skip further search.
    // Skip mode means there's no coefficients in the block, so splitting
@ -925,13 +1106,29 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // searching.
    
    if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height 
+        && state->encoder_control->cfg.combine_intra_cus)
    {
+
      cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);

      // If the best CU in depth+1 is intra and the biggest it can be, try it.
      if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) {
+        cabac_data_t temp_cabac;
+        memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac));
+        memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac));
        cost = 0;
+        double bits = 0;
+        if (depth < MAX_DEPTH) {
+          uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
+          cabac_ctx_t* ctx = &(state->search_cabac.ctx.split_flag_model[split_model]);
+          CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 0, bits, "no_split_search");
+        }
+        else if (depth == MAX_DEPTH && cur_cu->type == CU_INTRA) {
+          // Add cost of intra part_size.
+          cabac_ctx_t* ctx = &(state->search_cabac.ctx.part_size_model[0]);
+          CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, bits, "no_split_search");
+        }

        cur_cu->intra = cu_d1->intra;
        cur_cu->type = CU_INTRA;
@ -952,19 +1149,13 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                           NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
                           lcu);

-        cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-        if (has_chroma) {
-          cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
-        }
-
-        // Add the cost of coding no-split.
-        uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-        const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-        cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-
-        // Add the cost of coding intra mode only once.
-        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
+        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits;
        cost += mode_bits * state->lambda;
+
+        cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+
+        memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+        memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac));
      }
    }

@ -978,6 +1169,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    } else if (depth > 0) {
      // Copy this CU's mode all the way down for use in adjacent CUs mode
      // search.
+      memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac));
      work_tree_copy_down(x_local, y_local, depth, work_tree);
      downsample_cclm_rec(
        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
@ -1167,6 +1359,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i
 */
 void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf, lcu_coeff_t *coeff)
 {
+  memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t));
+  state->search_cabac.only_count = 1;
  assert(x % LCU_WIDTH == 0);
  assert(y % LCU_WIDTH == 0);

--- a/src/search.h
+++ b/src/search.h
@ -44,22 +44,53 @@
 #include "image.h"
 #include "constraint.h"

+#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS)
+
+ // Modify weight of luma SSD.
+#ifndef KVZ_LUMA_MULT
+# define KVZ_LUMA_MULT 0.8
+#endif
+// Modify weight of chroma SSD.
+#ifndef KVZ_CHROMA_MULT
+# define KVZ_CHROMA_MULT 1.5
+#endif
+
+ /**
+  *  \brief Data collected during search processes.
+  *
+  *         The intended use is to collect statistics of the
+  *         searched coding/prediction units. The arrays are
+  *         parallel: entry i of every array describes the same
+  *         unit. They should be indexed by elements of the "keys"
+  *         array that will be sorted by the RD costs of the units.
+  */
+typedef struct unit_stats_map_t {
+
+  cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units
+  double    cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs
+  double    bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs
+  int8_t    keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays
+  int       size;                    //!< number of active elements in the lists
+} unit_stats_map_t;
+
 /**
  * \brief Number of MIP modes for a block of the given size.
  *
  * 32 for 4x4 blocks, 16 when either dimension is 4 or the block is 8x8,
  * and 12 otherwise. The whole expansion is parenthesized so that the
  * conditional cannot capture operators written after the macro call.
  */
 #define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : (((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8)) ? 16 : 12))
 /**
  * \brief Half of NUM_MIP_MODES_FULL for the same block size.
  *
  * The shift must apply to the complete result of NUM_MIP_MODES_FULL;
  * without the parentheses it would bind only to the last branch of the
  * conditional and 4x4 blocks would yield 32 instead of 16.
  */
 #define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1)

 void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
 void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length);

+void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map);
+
 void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);

 double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu);
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu);
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu);
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu);
 void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);

 void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
--- a/src/search_inter.c
+++ b/src/search_inter.c
--- a/src/search_inter.h
+++ b/src/search_inter.h
@ -64,20 +64,20 @@ enum hpel_position {
  HPEL_POS_DIA = 2
 };

-typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state,
+typedef double kvz_mvd_cost_func(const encoder_state_t *state,
                                  int x, int y,
                                  int mv_shift,
                                  mv_t mv_cand[2][2],
                                  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                                  int16_t num_cand,
                                  int32_t ref_idx,
-                                  uint32_t *bitcost);
+                                  double *bitcost);

 void kvz_search_cu_inter(encoder_state_t * const state,
                         int x, int y, int depth,
                         lcu_t *lcu,
                         double *inter_cost,
-                         uint32_t *inter_bitcost);
+                         double* inter_bitcost);

 void kvz_search_cu_smp(encoder_state_t * const state,
                       int x, int y,
@ -85,12 +85,20 @@ void kvz_search_cu_smp(encoder_state_t * const state,
                       part_mode_t part_mode,
                       lcu_t *lcu,
                       double *inter_cost,
-                       uint32_t *inter_bitcost);
+                       double* inter_bitcost);


 unsigned kvz_inter_satd_cost(const encoder_state_t* state,
                             const lcu_t *lcu,
                             int x,
                             int y);
+void kvz_cu_cost_inter_rd2(encoder_state_t* const state,
+  int x, int y, int depth,
+  cu_info_t* cur_cu,
+  lcu_t* lcu,
+  double* inter_cost,
+  double* inter_bitcost);
+
+int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx);

 #endif // SEARCH_INTER_H_
--- a/src/search_intra.c
+++ b/src/search_intra.c
@ -97,13 +97,13 @@ static double get_cost(encoder_state_t * const state,

    // Add the offset bit costs of signaling 'luma and chroma use trskip',
    // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
-    const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
+    const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma;
    double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);

    
    // ToDo: Check cost
    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-      ctx = &state->cabac.ctx.transform_skip_model_chroma;
+      ctx = &state->search_cabac.ctx.transform_skip_model_chroma;
      trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
    }
    
@ -394,7 +394,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
  //     max_depth.
  // - Min transform size hasn't been reached (MAX_PU_DEPTH).
  if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->lambda;
+    split_cost = 0;

    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
    if (split_cost < nosplit_cost) {
@ -417,14 +417,15 @@ static double search_intra_trdepth(encoder_state_t * const state,
    // so this will code cbf as 0 and not code the cbf at all for descendants.
    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
      const uint8_t tr_depth = depth - pred_cu->depth;
+      cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;

-      const cabac_ctx_t* ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+      cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+        CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb");
      }
      ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]);
      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+        CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr");
      }
    }

@ -677,9 +678,8 @@ static int8_t search_intra_rough(encoder_state_t * const state,

  // Add prediction mode coding cost as the last thing. We don't want this
  // affecting the halving search.
-  int lambda_cost = (int)(state->lambda_sqrt + 0.5);
  for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0);
+    costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0);
  }

  #undef PARALLEL_BLKS
@ -771,7 +771,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
      int rdo_bitcost = kvz_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id);

      *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5);
-
+    
      // Mip related stuff
      // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
      // Half of the modes [16, 31] are indicated with the separate transpose flag.
@ -818,6 +818,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
  }
  

+
  // The best transform split hierarchy is not saved anywhere, so to get the
  // transform split hierarchy the search has to be performed again with the
  // best mode.
@ -854,7 +855,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state,

 double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id)
 {
-  double mode_bits = 0.0;
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
+  double mode_bits = 0;

  bool enable_mip = state->encoder_control->cfg.mip;
  bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false;
@ -899,11 +901,26 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
        break;
      }
    }
+  cabac_ctx_t *ctx = &(cabac->ctx.luma_planar_model[1]);
+  CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search");
+  if (state->search_cabac.update) {
+    if(mode_in_preds) {
+      CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx");
+      if(luma_mode != intra_preds[0]) {
+        CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx");        
+      }
+    }
+    else {
+      // This value should be transformed for actual coding,
+      // but here the value does not actually matter, just that we write 5 bits
+      CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode");
+    }
+  }

    bool enable_mrl = state->encoder_control->cfg.mrl;
    uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0;

-    const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model);
+    ctx = &(cabac->ctx.intra_luma_mpm_flag_model);

    if (multi_ref_index == 0) {
      mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1);
@ -911,17 +928,17 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const

    // Add MRL bits.
    if (enable_mrl && MAX_REF_LINE_IDX > 1) {
-      ctx = &(state->cabac.ctx.multi_ref_line[0]);
+      ctx = &(cabac->ctx.multi_ref_line[0]);
      mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0);

      if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) {
-        ctx = &(state->cabac.ctx.multi_ref_line[1]);
+        ctx = &(cabac->ctx.multi_ref_line[1]);
        mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1);
      }
    }

    if (mode_in_preds != -1 || multi_ref_index != 0) {
-      ctx = &(state->cabac.ctx.luma_planar_model[0]);
+      ctx = &(cabac->ctx.luma_planar_model[0]);
      if (multi_ref_index == 0) {
        mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0);
      }
@ -938,7 +955,8 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const

 double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode)
 {
-  const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model);
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+  const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model);
  double mode_bits;
  if (chroma_mode == luma_mode) {
    mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
@ -958,6 +976,13 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
    mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67);
  }

+  if(cabac->update) {
+    if(chroma_mode != luma_mode) {
+      // Again it does not matter what we actually write here
+      CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode");      
+    }
+  }
+
  return mode_bits;
 }

@ -1045,9 +1070,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
          -1, chroma.mode, // skip luma
          NULL, cclm_params, 0, false, false, lcu);
      }
+      double bits = 0;
      chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);

      double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
+      bits += mode_bits;
      chroma.cost += mode_bits * state->lambda;

      if (chroma.cost < best_chroma.cost) {
--- a/src/transform.c
+++ b/src/transform.c
@ -260,11 +260,9 @@ int kvz_quantize_residual_trskip(
  struct {
    kvz_pixel rec[LCU_WIDTH * LCU_WIDTH];
    coeff_t coeff[LCU_WIDTH * LCU_WIDTH];
-    uint32_t cost;
+    double cost;
    int has_coeffs;
  } skip, *best;
-
-  const int bit_cost = (int)(state->lambda + 0.5);
  
  //noskip.has_coeffs = kvz_quantize_residual(
  //    state, cur_cu, width, color, scan_order,
@ -278,7 +276,7 @@ int kvz_quantize_residual_trskip(
    1, in_stride, width,
    ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj);
  skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, width, width);
-  skip.cost += kvz_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * bit_cost;
+  skip.cost += kvz_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * state->frame->lambda;

 /*  if (noskip.cost <= skip.cost) {
    *trskip_out = 0;