diff --git a/README.md b/README.md
index 5027547d..0058a2c7 100644
--- a/README.md
+++ b/README.md
@@ -145,11 +145,20 @@ Video structure:
                                    - frametile: Constrain within the tile.
                                    - frametilemargin: Constrain even more.
       --roi <filename>       : Use a delta QP map for region of interest.
-                               Reads an array of delta QP values from a text
-                               file. The file format is: width and height of
-                               the QP delta map followed by width*height delta
-                               QP values in raster order. The map can be of any
-                               size and will be scaled to the video size.
+                               Reads an array of delta QP values from a file.
+                               Text and binary files are supported and detected
+                               from the file extension (.txt/.bin). If a known
+                               extension is not found, the file is treated as
+                               a text file. The file can include one or many
+                               ROI frames each in the following format:
+                               width and height of the QP delta map followed
+                               by width * height delta QP values in raster
+                               order. In binary format, width and height are
+                               32-bit integers whereas the delta QP values are
+                               signed 8-bit values. The map can be of any size
+                               and will be scaled to the video size. The file
+                               reading will loop if end of the file is reached.
+                               See roi.txt in the examples folder.
       --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.
                                in PPS and slice_qp_delta in slize header zero.
       --(no-)erp-aqp         : Use adaptive QP for 360 degree video with
diff --git a/doc/uvg266.1 b/doc/uvg266.1
index e3657ea5..7a4319f8 100644
--- a/doc/uvg266.1
+++ b/doc/uvg266.1
@@ -164,11 +164,20 @@ Constrain movement vectors. [none]
 .TP
 \fB\-\-roi <filename>      
 Use a delta QP map for region of interest.
-Reads an array of delta QP values from a text
-file. The file format is: width and height of
-the QP delta map followed by width*height delta
-QP values in raster order. The map can be of any
-size and will be scaled to the video size.
+Reads an array of delta QP values from a file.
+Text and binary files are supported and detected
+from the file extension (.txt/.bin). If a known
+extension is not found, the file is treated as
+a text file. The file can include one or many
+ROI frames each in the following format:
+width and height of the QP delta map followed
+by width * height delta QP values in raster
+order. In binary format, width and height are
+32\-bit integers whereas the delta QP values are
+signed 8\-bit values. The map can be of any size
+and will be scaled to the video size. The file
+reading will loop if end of the file is reached.
+See roi.txt in the examples folder.
 .TP
 \fB\-\-set\-qp\-in\-cu        
 Set QP at CU level keeping pic_init_qp_minus26.
diff --git a/src/alf.c b/src/alf.c
index 7793c483..ff312627 100644
--- a/src/alf.c
+++ b/src/alf.c
@@ -1236,19 +1236,19 @@ static void code_alf_ctu_filter_index(encoder_state_t * const state,
       assert(filter_set_idx < num_available_filt_sets); //"temporal non-latest set"
       if (num_aps > 1)
       {
-        uvg_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS);
+        uvg_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS, NULL);
       }
     }
     else
     {
       assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //"fixed set larger than temporal"
-      uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS);
+      uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL);
     }
   }
   else
   {
     assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //Fixed set numavail < num_fixed
-    uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS);
+    uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL);
   }
 }
 
diff --git a/src/bitstream.c b/src/bitstream.c
index ce243f51..9d6de07e 100644
--- a/src/bitstream.c
+++ b/src/bitstream.c
@@ -33,6 +33,7 @@
 #include "bitstream.h"
 
 #include <math.h>
+#include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 
diff --git a/src/cabac.c b/src/cabac.c
index 794e4de5..9f33b503 100644
--- a/src/cabac.c
+++ b/src/cabac.c
@@ -70,6 +70,7 @@ void uvg_cabac_start(cabac_data_t * const data)
   data->num_buffered_bytes = 0;
   data->buffered_byte = 0xff;
   data->only_count = 0; // By default, write bits out
+  data->update = 0; 
 }
 
 /**
@@ -199,7 +200,7 @@ void uvg_cabac_encode_bin_trm(cabac_data_t * const data, const uint8_t bin_value
 /**
  * \brief encode truncated binary code
  */
-void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value) {
+void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value, double* bits_out) {
   int thresh;
   int symbol = bin_value;
   if (max_value > 256) {
@@ -219,9 +220,11 @@ void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_va
   int b = max_value - val;
   if (symbol < val - b) {
     CABAC_BINS_EP(data, symbol, thresh, "TruncSymbols");
+    if (bits_out) *bits_out += thresh;
   } else {
     symbol += val - b;
     CABAC_BINS_EP(data, symbol, thresh + 1, "TruncSymbols");
+    if (bits_out) *bits_out += thresh + 1;
   }
 }
 
@@ -349,26 +352,30 @@ void uvg_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t rem
 /**
  * \brief
  */
-void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol)
+void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx,
+                                      uint32_t symbol, const int32_t offset,
+                                      const uint32_t max_symbol, double* bits_out)
 {
   int8_t code_last = max_symbol > symbol;
 
   assert(symbol <= max_symbol);
 
   if (!max_symbol) return;
-
-  data->cur_ctx = ctx;
-  CABAC_BIN(data, symbol, "ums");
+
+  // bits_out may be NULL when the caller needs no estimate; use a local sink then.
+  double unused_bits = 0;
+  double * const bits = bits_out ? bits_out : &unused_bits;
+  CABAC_FBITS_UPDATE(data, ctx, symbol != 0, *bits, "ums"); // first bin: symbol != 0
 
   if (!symbol) return;
 
   data->cur_ctx = &ctx[offset];
 
   while (--symbol) {
-    CABAC_BIN(data, 1, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits, "ums");
   }
   if (code_last) {
-    CABAC_BIN(data, 0, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 0, *bits, "ums");
   }
 }
 
@@ -405,7 +412,7 @@ void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int
 /**
  * \brief
  */
-void uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state,
+uint32_t uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state,
                                   cabac_data_t * const data,
                                   uint32_t symbol,
                                   uint32_t count)
@@ -426,4 +433,5 @@ void uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state,
   num_bins += count;
 
   CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb");
+  return num_bins;
 }
diff --git a/src/cabac.h b/src/cabac.h
index d642787f..0088d5d9 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -59,7 +59,8 @@ typedef struct
   uint32_t   buffered_byte;
   int32_t    num_buffered_bytes;
   int32_t    bits_left;
-  int8_t     only_count;
+  int8_t     only_count : 4;
+  int8_t     update : 4;
   bitstream_t *stream;
 
   // CONTEXTS
@@ -133,18 +134,18 @@ extern const uint8_t uvg_g_auc_renorm_table[32];
 void uvg_cabac_start(cabac_data_t *data);
 void uvg_cabac_encode_bin(cabac_data_t *data, uint32_t bin_value);
 void uvg_cabac_encode_bin_ep(cabac_data_t *data, uint32_t bin_value);
-void uvg_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value);
+void uvg_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value, double* bits_out);
 void uvg_cabac_encode_bins_ep(cabac_data_t *data, uint32_t bin_values, int num_bins);
 void uvg_cabac_encode_bin_trm(cabac_data_t *data, uint8_t bin_value);
 void uvg_cabac_write(cabac_data_t *data);
 void uvg_cabac_finish(cabac_data_t *data);
 void uvg_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
                               uint32_t r_param, const unsigned int cutoff);
-void uvg_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
+uint32_t uvg_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
                 uint32_t symbol, uint32_t count);
 void uvg_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
-                                  uint32_t symbol, int32_t offset,
-                                  uint32_t max_symbol);
+                                      uint32_t symbol, int32_t offset,
+                                      uint32_t max_symbol, double* bits_out);
 void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol);
 
 #define CTX_PROB_BITS 15
@@ -153,6 +154,18 @@ void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol
 #define CTX_MASK_0 (~(~0u << CTX_PROB_BITS_0) << (CTX_PROB_BITS - CTX_PROB_BITS_0))
 #define CTX_MASK_1 (~(~0u << CTX_PROB_BITS_1) << (CTX_PROB_BITS - CTX_PROB_BITS_1))
 
+// Floating point fractional bits, derived from kvz_entropy_bits
+extern const float uvg_f_entropy_bits[512];
+#define CTX_ENTROPY_FBITS(ctx, val) uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
+// If only_count is set, add the table cost of bin val (0 or 1) in ctx to bits; if update is set, code the bin.
+#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \
+  if((cabac)->only_count) (bits) += CTX_ENTROPY_FBITS((ctx), (val)); \
+  if((cabac)->update) {\
+    (cabac)->cur_ctx = (ctx);\
+    CABAC_BIN((cabac), (val), (name));\
+  } \
+} while(0)
+
 // Macros
 #define CTX_GET_STATE(ctx) ( (ctx)->state[0]+(ctx)->state[1] )
 #define CTX_STATE(ctx) ( CTX_GET_STATE(ctx)>>8 )
@@ -185,23 +198,23 @@ extern uint32_t uvg_cabac_bins_count;
 extern bool uvg_cabac_bins_verbose;
 #define CABAC_BIN(data, value, name) { \
     uint32_t prev_state = CTX_STATE(data->cur_ctx); \
-    if(uvg_cabac_bins_verbose && !data->only_count) {printf("%d %d  [%d:%d]  %s = %u, range = %u LPS = %u state = %u -> ", \
-           uvg_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS(data->cur_ctx,(data)->range), CTX_LPS(data->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS(data->cur_ctx,(data)->range), prev_state); }\
+    if(uvg_cabac_bins_verbose && !(data)->only_count) {printf("%d %d  [%d:%d]  %s = %u, range = %u LPS = %u state = %u -> ", \
+           uvg_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS((data)->cur_ctx,(data)->range), CTX_LPS((data)->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS((data)->cur_ctx,(data)->range), prev_state); }\
     uvg_cabac_encode_bin((data), (value)); \
-    if(uvg_cabac_bins_verbose && !data->only_count) printf("%u\n", CTX_STATE(data->cur_ctx)); }
+    if(uvg_cabac_bins_verbose && !(data)->only_count) printf("%u\n", CTX_STATE((data)->cur_ctx)); }
     
 
   #define CABAC_BINS_EP(data, value, bins, name) { \
-    uint32_t prev_state = CTX_STATE(data->cur_ctx); \
+    uint32_t prev_state = (!(data)->only_count) ? CTX_STATE((data)->cur_ctx) : 0; \
     uvg_cabac_encode_bins_ep((data), (value), (bins)); \
     if(uvg_cabac_bins_verbose && !data->only_count) { printf("%d %s = %u(%u bins), state = %u -> %u\n", \
-           uvg_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE(data->cur_ctx));  uvg_cabac_bins_count+=bins;}}
+           uvg_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE((data)->cur_ctx));  uvg_cabac_bins_count+=(bins);}}
 
   #define CABAC_BIN_EP(data, value, name) { \
-    uint32_t prev_state = CTX_STATE(data->cur_ctx); \
+    uint32_t prev_state = (!(data)->only_count) ? CTX_STATE((data)->cur_ctx) : 0; \
     uvg_cabac_encode_bin_ep((data), (value)); \
-    if(uvg_cabac_bins_verbose && !data->only_count) {printf("%d %s = %u, state = %u -> %u\n", \
-           uvg_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE(data->cur_ctx)); }}
+    if(uvg_cabac_bins_verbose && !(data)->only_count) {printf("%d %s = %u, state = %u -> %u\n", \
+           uvg_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE((data)->cur_ctx)); }}
 #else
   #define CABAC_BIN(data, value, name) \
     uvg_cabac_encode_bin((data), (value));
diff --git a/src/cfg.c b/src/cfg.c
index 8147bcdb..96a24bb1 100644
--- a/src/cfg.c
+++ b/src/cfg.c
@@ -147,9 +147,9 @@ int uvg_config_init(uvg_config *cfg)
   cfg->gop_lp_definition.t = 1;
   cfg->open_gop = true;
 
-  cfg->roi.width = 0;
-  cfg->roi.height = 0;
-  cfg->roi.dqps = NULL;
+  cfg->roi.file_path = NULL;
+  cfg->roi.format = UVG_ROI_TXT;
+
   cfg->set_qp_in_cu = false;
 
   cfg->erp_aqp = false;
@@ -212,6 +212,9 @@ int uvg_config_init(uvg_config *cfg)
 
   cfg->cclm = 0;
 
+
+  cfg->combine_intra_cus = 1;
+  cfg->force_inter = 0;
   return 1;
 }
 
@@ -219,11 +222,11 @@ int uvg_config_destroy(uvg_config *cfg)
 {
   if (cfg) {
     FREE_POINTER(cfg->cqmfile);
+    FREE_POINTER(cfg->roi.file_path);
     FREE_POINTER(cfg->fast_coeff_table_fn);
     FREE_POINTER(cfg->tiles_width_split);
     FREE_POINTER(cfg->tiles_height_split);
     FREE_POINTER(cfg->slice_addresses_in_ts);
-    FREE_POINTER(cfg->roi.dqps);
     FREE_POINTER(cfg->fastrd_learning_outdir_fn);
   }
   free(cfg);
@@ -1269,60 +1272,29 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
   }
   else if OPT("implicit-rdpcm")
     cfg->implicit_rdpcm = (bool)atobool(value);
+
   else if OPT("roi") {
-    // The ROI description is as follows:
-    // First number is width, second number is height,
-    // then follows width * height number of dqp values.
-    FILE* f = fopen(value, "rb");
-    if (!f) {
-      fprintf(stderr, "Could not open ROI file.\n");
+    static enum uvg_roi_format const formats[] = { UVG_ROI_TXT, UVG_ROI_BIN };
+    static const char * const format_names[] = { "txt", "bin", NULL };
+
+    char *roi_file = strdup(value);
+    if (!roi_file) {
+      fprintf(stderr, "Failed to allocate memory for ROI file name.\n");
       return 0;
     }
+    FREE_POINTER(cfg->roi.file_path);
+    cfg->roi.file_path = roi_file;
 
-    int width = 0;
-    int height = 0;
-    if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) {
-      fprintf(stderr, "Failed to read ROI size.\n");
-      fclose(f);
-      return 0;
+    // Get file extension or the substring after the last dot
+    char *maybe_extension = strrchr(cfg->roi.file_path, '.');
+    if (!maybe_extension) {
+      cfg->roi.format = UVG_ROI_TXT;
+    } else {
+      maybe_extension++;
+      int8_t format;
+      bool unknown_format = !parse_enum(maybe_extension, format_names, &format);
+      cfg->roi.format = unknown_format ? UVG_ROI_TXT : formats[format];
     }
-
-    if (width <= 0 || height <= 0) {
-      fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height);
-      fclose(f);
-      return 0;
-    }
-
-    if (width > 10000 || height > 10000) {
-      fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
-      fclose(f);
-      return 0;
-    }
-
-    const unsigned size = width * height;
-    int8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps[0]));
-    if (!dqp_array) {
-      fprintf(stderr, "Failed to allocate memory for ROI table.\n");
-      fclose(f);
-      return 0;
-    }
-
-    FREE_POINTER(cfg->roi.dqps);
-    cfg->roi.dqps   = dqp_array;
-    cfg->roi.width  = width;
-    cfg->roi.height = height;
-
-    for (int i = 0; i < size; ++i) {
-      int number; // Need a pointer to int for fscanf
-      if (fscanf(f, "%d", &number) != 1) {
-        fprintf(stderr, "Reading ROI file failed.\n");
-        fclose(f);
-        return 0;
-      }
-      dqp_array[i] = CLIP(-51, 51, number);
-    }
-
-    fclose(f);
   }
   else if OPT("set-qp-in-cu") {
     cfg->set_qp_in_cu = (bool)atobool(value);
@@ -1476,6 +1448,12 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
   else if OPT("cclm") {
     cfg->cclm = (bool)atobool(value);
   }
+  else if OPT("combine-intra-cus") {
+    cfg->combine_intra_cus = atobool(value);
+  }
+  else if OPT("force-inter") {
+    cfg->force_inter = atobool(value);
+  }
   else {
     return 0;
   }
diff --git a/src/cli.c b/src/cli.c
index b4f920bc..9fd36359 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -141,6 +141,7 @@ static const struct option long_options[] = {
   { "force-level",        required_argument, NULL, 0 },
   { "high-tier",                no_argument, NULL, 0 },
   { "me-steps",           required_argument, NULL, 0 },
+  { "roi-file",           required_argument, NULL, 0 },
   { "fast-residual-cost", required_argument, NULL, 0 },
   { "set-qp-in-cu",             no_argument, NULL, 0 },
   { "open-gop",                 no_argument, NULL, 0 },
@@ -179,6 +180,10 @@ static const struct option long_options[] = {
   { "no-amvr",                  no_argument, NULL, 0 },
   { "cclm",                     no_argument, NULL, 0 },
   { "no-cclm",                  no_argument, NULL, 0 },
+  { "combine-intra-cus",        no_argument, NULL, 0 },
+  { "no-combine-intra-cus",     no_argument, NULL, 0 },
+  { "force-inter",              no_argument, NULL, 0 },
+  { "no-force-inter",           no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -499,11 +504,20 @@ void print_help(void)
     "                                   - frametile: Constrain within the tile.\n"
     "                                   - frametilemargin: Constrain even more.\n"
     "      --roi <filename>       : Use a delta QP map for region of interest.\n"
-    "                               Reads an array of delta QP values from a text\n"
-    "                               file. The file format is: width and height of\n"
-    "                               the QP delta map followed by width*height delta\n"
-    "                               QP values in raster order. The map can be of any\n"
-    "                               size and will be scaled to the video size.\n"
+    "                               Reads an array of delta QP values from a file.\n"
+    "                               Text and binary files are supported and detected\n"
+    "                               from the file extension (.txt/.bin). If a known\n"
+    "                               extension is not found, the file is treated as\n"
+    "                               a text file. The file can include one or many\n"
+    "                               ROI frames each in the following format:\n"
+    "                               width and height of the QP delta map followed\n"
+    "                               by width * height delta QP values in raster\n"
+    "                               order. In binary format, width and height are\n"
+    "                               32-bit integers whereas the delta QP values are\n"
+    "                               signed 8-bit values. The map can be of any size\n"
+    "                               and will be scaled to the video size. The file\n"
+    "                               reading will loop if end of the file is reached.\n"
+    "                               See roi.txt in the examples folder.\n"
     "      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.\n"
     "                               in PPS and slice_qp_delta in slize header zero.\n"
     "      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with\n"
@@ -587,6 +601,16 @@ void print_help(void)
     "      --ml-pu-depth-intra    : Predict the pu-depth-intra using machine\n"
     "                                learning trees, overrides the\n"
     "                                --pu-depth-intra parameter. [disabled]\n"
+    "      --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n"
+    "                                   on lower depth even when search is not\n"
+    "                                   performed on said depth. Should only\n"
+    "                                   be disabled if cus absolutely must not\n"
+    "                                   be larger than limited by the search.\n"
+    "                                   [enabled]\n"
+    "      --force-inter          : Force the encoder to use inter always.\n"
+    "                               This is mostly for debugging and is not\n"
+    "                               guaranteed to produce sensible bitstream or\n"
+    "                               work at all. [disabled]\n"
     "      --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n"
     "      --(no-)bipred          : Bi-prediction [disabled]\n"
     "      --cu-split-termination <string> : CU split search termination [zero]\n"
diff --git a/src/cu.h b/src/cu.h
index d1d3ae6d..496c73ac 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -148,7 +148,7 @@ typedef struct
   uint8_t merge_idx   : 3; //!< \brief merge index
   uint8_t tr_skip     : 1; //!< \brief transform skip flag
   uint8_t tr_idx      : 3; //!< \brief transform index
-  uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding 
+  uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding 
 
   uint16_t cbf;
 
@@ -183,6 +183,16 @@ typedef struct
   };
 } cu_info_t;
 
+typedef struct {
+  int16_t x;
+  int16_t y;
+  int8_t width;
+  int8_t height;
+  int8_t chroma_width;
+  int8_t chroma_height;
+} cu_loc_t;
+
+
 #define CU_GET_MV_CAND(cu_info_ptr, reflist) \
   (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1)
 
diff --git a/src/encmain.c b/src/encmain.c
index 2ca3bdc3..b04edd9d 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -441,6 +441,7 @@ int main(int argc, char *argv[])
   FILE *input  = NULL; //!< input file (YUV)
   FILE *output = NULL; //!< output file (HEVC NAL stream)
   FILE *recout = NULL; //!< reconstructed YUV output, --debug
+  FILE *roifile = NULL;
   clock_t start_time = clock();
   clock_t encoding_start_cpu_time;
   UVG_CLOCK_T encoding_start_real_time;
@@ -587,7 +588,7 @@ int main(int argc, char *argv[])
     // Give arguments via struct to the input thread
     input_handler_args in_args = {
       .available_input_slots = available_input_slots,
-      .filled_input_slots    = filled_input_slots,
+      .filled_input_slots = filled_input_slots,
 
       .input = input,
       .api = api,
@@ -828,6 +829,7 @@ done:
   if (input)  fclose(input);
   if (output) fclose(output);
   if (recout) fclose(recout);
+  if (roifile) fclose(roifile);
 
   DBG_YUVIEW_CLEANUP();
   CHECKPOINTS_FINALIZE();
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 65d7ab24..f63a8bef 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -352,8 +352,8 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac,
 }
 
 static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, uint8_t joint_chroma) {
-  int x_local = (x >> 1) % LCU_WIDTH_C;
-  int y_local = (y >> 1) % LCU_WIDTH_C;
+  int x_local = ((x & ~7) >> 1) % LCU_WIDTH_C;
+  int y_local = ((y & ~7) >> 1) % LCU_WIDTH_C;
   cabac_data_t* const cabac = &state->cabac;
   *scan_idx = uvg_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth);
   if(!joint_chroma){
@@ -367,7 +367,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
         // TODO: transform skip for chroma blocks
         CABAC_BIN(cabac, 0, "transform_skip_flag");
       }
-      uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 1, *scan_idx, NULL, false);
+      uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, NULL, false);
     }
 
     if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
@@ -375,7 +375,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
         cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
         CABAC_BIN(cabac, 0, "transform_skip_flag");
       }
-      uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false);
+      uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, NULL, false);
     }
   }
   else {
@@ -384,7 +384,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
       cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
       CABAC_BIN(cabac, 0, "transform_skip_flag");
     }
-    uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, 2, *scan_idx, NULL, false);
+    uvg_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, COLOR_V, *scan_idx, NULL, false);
     
   }
 }
@@ -444,8 +444,6 @@ static void encode_transform_unit(encoder_state_t * const state,
     } else {
       // Time to to code the chroma transform blocks. Move to the top-left
       // corner of the block.
-      x -= 4;
-      y -= 4;
       cur_pu = uvg_cu_array_at_const((const cu_array_t *)frame->cu_array, x, y);
     }
   }
@@ -485,7 +483,7 @@ static void encode_transform_coeff(encoder_state_t * const state,
   // containing CU.
   const int x_cu = 8 * (x / 8);
   const int y_cu = 8 * (y / 8);
-  const cu_info_t *cur_cu = uvg_cu_array_at_const(frame->cu_array, x_cu, y_cu);
+  const cu_info_t *cur_cu = uvg_cu_array_at_const(frame->cu_array, x, y);
 
   // NxN signifies implicit transform split at the first transform level.
   // There is a similar implicit split for inter, but it is only used when
@@ -507,8 +505,8 @@ static void encode_transform_coeff(encoder_state_t * const state,
  
 
   const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y);
-  const int cb_flag_u = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U);
-  const int cb_flag_v = cur_pu->joint_cb_cr ? ((cur_pu->joint_cb_cr & 2) >> 1) : cbf_is_set(cur_cu->cbf, depth, COLOR_V);
+  const int cb_flag_u = cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U);
+  const int cb_flag_v = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V);
 
   // The split_transform_flag is not signaled when:
   // - transform size is greater than 32 (depth == 0)
@@ -580,7 +578,7 @@ static void encode_transform_coeff(encoder_state_t * const state,
       cabac_data_t* cabac    = &state->cabac;
 
       // cu_qp_delta_abs prefix
-      uvg_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5);
+      uvg_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL);
 
       if (qp_delta_abs >= 5) {
         // cu_qp_delta_abs suffix
@@ -593,7 +591,13 @@ static void encode_transform_coeff(encoder_state_t * const state,
 
       state->must_code_qp_delta = false;
     }
-    if((cb_flag_u || cb_flag_v ) && (depth != 4 || only_chroma) && state->encoder_control->cfg.jccr) {
+    if((
+        ((cb_flag_u || cb_flag_v ) 
+          && cur_cu->type == CU_INTRA)
+        || (cb_flag_u && cb_flag_v)) 
+      && (depth != 4 || only_chroma) 
+      && state->encoder_control->cfg.jccr
+      ) {
       cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1];
       CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag");
     }
@@ -609,17 +613,19 @@ static void encode_transform_coeff(encoder_state_t * const state,
  * \param depth           Depth from LCU.
  * \return if non-zero mvd is coded
  */
-static bool encode_inter_prediction_unit(encoder_state_t * const state,
-                                         cabac_data_t * const cabac,
-                                         const cu_info_t * const cur_cu,
-                                         int x, int y, int width, int height,
-                                         int depth)
+int uvg_encode_inter_prediction_unit(encoder_state_t * const state,
+                                      cabac_data_t * const cabac,
+                                      const cu_info_t * const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, lcu_t* lcu, double* bits_out)
 {
   // Mergeflag
   int16_t num_cand = 0;
   bool non_zero_mvd = false;
-  cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
-  CABAC_BIN(cabac, cur_cu->merged, "MergeFlag");
+  double bits = 0;
+
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag");
+
   num_cand = state->encoder_control->cfg.max_merge;
   if (cur_cu->merged) { //merge
     if (num_cand > 1) {
@@ -627,10 +633,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       for (ui = 0; ui < num_cand - 1; ui++) {
         int32_t symbol = (ui != cur_cu->merge_idx);
         if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
         } else {
           CABAC_BIN_EP(cabac,symbol,"MergeIndex");
+          if(cabac->only_count) bits += 1;
         }
         if (symbol == 0) break;
       }
@@ -649,12 +655,10 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4
         uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height) + 1) >> 1));
 
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[inter_dir_ctx]);
-        CABAC_BIN(cabac, (inter_dir == 3), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc");
       }
       if (inter_dir < 3) {
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[5]);
-        CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[5]), (inter_dir == 2), bits, "inter_pred_idc");
       }
    }
 
@@ -673,20 +677,21 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       if (ref_LX_size > 1) {
         // parseRefFrmIdx
         int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx];
-
-        cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-        CABAC_BIN(cabac, (ref_frame > 0), "ref_idx_lX");
+        
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");
 
         if (ref_frame > 0 && ref_LX_size > 2) {
           cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
-          CABAC_BIN(cabac, (ref_frame > 1), "ref_idx_lX");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), (ref_frame > 1), bits, "ref_idx_lX");
 
           if (ref_frame > 1 && ref_LX_size > 3) {
             for (int idx = 3; idx < ref_LX_size; idx++)
             {
               uint8_t val = (ref_frame > idx - 1) ? 1 : 0;
               CABAC_BIN_EP(cabac, val, "ref_idx_lX");
+              if (cabac->only_count) bits += 1;
               if (!val) break;
+
             }
           }
         }
@@ -696,39 +701,45 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
       if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) {
 
         mv_t mv_cand[2][2];
-        uvg_inter_get_mv_cand_cua(
+        if (lcu) {
+          uvg_inter_get_mv_cand(
+            state, 
+            x, y, width, height,
+            mv_cand, cur_cu, 
+            lcu, ref_list_idx);
+        }
+        else {
+          uvg_inter_get_mv_cand_cua(
             state,
             x, y, width, height,
-            mv_cand, cur_cu, ref_list_idx);
+            mv_cand, cur_cu, ref_list_idx
+          );
+        }
 
         uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx);
         mv_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0];
         mv_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1];
 
         uvg_change_precision(INTERNAL_MV_PREC, uvg_g_imv_to_prec[UVG_IMV_OFF], &mvd_hor, &mvd_ver);
-
-        uvg_encode_mvd(state, cabac, mvd_hor, mvd_ver);
+        uvg_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out);
 
         non_zero_mvd |= (mvd_hor != 0) || (mvd_ver != 0);
       }
 
       // Signal which candidate MV to use
-      cabac->cur_ctx = &(cabac->ctx.mvp_idx_model);
-      CABAC_BIN(cabac, CU_GET_MV_CAND(cur_cu, ref_list_idx), "mvp_flag");
+      CABAC_FBITS_UPDATE(cabac,&(cabac->ctx.mvp_idx_model), CU_GET_MV_CAND(cur_cu, ref_list_idx), bits, "mvp_flag");
 
     } // for ref_list
   } // if !merge
+  if(bits_out) *bits_out += bits;
   return non_zero_mvd;
 }
 
-static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width, const int cclm_enabled) {
+static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, const int cclm_enabled) {
   unsigned pred_mode = 0;
   unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83};
-  const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0);
-  const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, 0);
-  const cu_info_t *first_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y);
-  int8_t chroma_intra_dir = first_pu->intra.mode_chroma;
-  int8_t luma_intra_dir = first_pu->intra.mode;
+  int8_t chroma_intra_dir = cur_cu->intra.mode_chroma;
+  int8_t luma_intra_dir = cur_cu->intra.mode;
 
 
   bool derived_mode = chroma_intra_dir == luma_intra_dir;
@@ -803,19 +814,20 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
   }
 }
 
-static void encode_intra_coding_unit(encoder_state_t * const state,
+void uvg_encode_intra_luma_coding_unit(const encoder_state_t * const state,
                                      cabac_data_t * const cabac,
                                      const cu_info_t * const cur_cu,
-                                     int x, int y, int depth, lcu_coeff_t* coeff)
+                                     int x, int y, int depth, const lcu_t* lcu, double* bits_out)
 {
   const videoframe_t * const frame = state->tile->frame;
-  uint8_t intra_pred_mode_actual[4];
-  uint8_t *intra_pred_mode = intra_pred_mode_actual;
+  uint8_t intra_pred_mode_actual;
+  uint8_t *intra_pred_mode = &intra_pred_mode_actual;
 
   //uint8_t intra_pred_mode_chroma = cur_cu->intra.mode_chroma;
-  int8_t intra_preds[4][INTRA_MPM_COUNT] = {{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1},{-1, -1, -1, -1, -1, -1}};
-  int8_t mpm_preds[4] = {-1, -1, -1, -1};
-  uint32_t flag[4];
+  int8_t intra_preds[INTRA_MPM_COUNT] = {-1, -1, -1, -1, -1, -1};
+  int8_t mpm_preds = -1;
+  uint32_t flag;
+  double bits = 0;
 
   /*
   if ((cur_cu->type == CU_INTRA && (LCU_WIDTH >> cur_cu->depth <= 32))) {
@@ -839,8 +851,6 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
     CABAC_BIN(cabac, 0, "bdpcm_mode");
   }
   */
-
-  const int num_pred_units = uvg_part_mode_num_parts[cur_cu->part_size];
   
   // Intra Subpartition mode
   uint32_t width = (LCU_WIDTH >> depth);
@@ -878,15 +888,17 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) {
     const int cu_width = LCU_WIDTH >> depth;
     const int cu_height = cu_width; // TODO: height for non-square blocks
-    uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, NULL, frame->cu_array);
+    uint8_t ctx_id = uvg_get_mip_flag_context(x, y, cu_width, cu_height, lcu, lcu ? NULL : frame->cu_array);
 
     // Write MIP flag
-    cabac->cur_ctx = &(cabac->ctx.mip_flag[ctx_id]);
-    CABAC_BIN(cabac, mip_flag, "mip_flag");
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mip_flag[ctx_id]), mip_flag, bits, "mip_flag");
     if (mip_flag) {
       // Write MIP transpose flag & mode
       CABAC_BIN_EP(cabac, mip_transpose, "mip_transposed");
-      uvg_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes);
+      if (cabac->only_count) bits += 1;
+      uvg_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes, bits_out);
+      if (cabac->only_count && bits_out) *bits_out += bits;
+      return;
     }
   }
 
@@ -900,172 +912,155 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
   if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) {
     if (MAX_REF_LINE_IDX > 1) {
-      cabac->cur_ctx = &(cabac->ctx.multi_ref_line[0]);
-      CABAC_BIN(cabac, multi_ref_idx != 0, "multi_ref_line");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.multi_ref_line[0]), multi_ref_idx != 0, bits, "multi_ref_line");
       if (MAX_REF_LINE_IDX > 2 && multi_ref_idx != 0) {
-        cabac->cur_ctx = &(cabac->ctx.multi_ref_line[1]);
-        CABAC_BIN(cabac, multi_ref_idx != 1, "multi_ref_line")
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.multi_ref_line[1]), multi_ref_idx != 1, bits, "multi_ref_line");
       }
     }
   }
 
 
   // ToDo: update real usage, these if clauses as such don't make any sense
-  if (isp_mode != 0 && multi_ref_idx == 0 && !mip_flag) {
+  if (isp_mode != 0 && multi_ref_idx == 0) {
     if (isp_mode) {
-      cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]);
-      CABAC_BIN(cabac, 0, "intra_subPartitions");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]),  0, bits, "intra_subPartitions");
     } else {
-      cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]);
-      CABAC_BIN(cabac, 1, "intra_subPartitions");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), 1, bits, "intra_subPartitions");
       // ToDo: complete this if-clause
       if (isp_mode == 3) {
-        cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[1]);
-        CABAC_BIN(cabac, allow_isp - 1, "intra_subPart_ver_hor");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_subpart_model[0]), allow_isp - 1, bits, "intra_subPart_ver_hor");
       }
     }
   }
 
   const int cu_width = LCU_WIDTH >> depth;
-  // If MIP is used, skip writing normal intra modes
-  if (!mip_flag) {
     // PREDINFO CODING
     // If intra prediction mode is found from the predictors,
     // it can be signaled with two EP's. Otherwise we can send
     // 5 EP bins with the full predmode
     // ToDo: fix comments for VVC
     
-    cabac->cur_ctx = &(cabac->ctx.intra_luma_mpm_flag_model);
-    for (int j = 0; j < num_pred_units; ++j) {
-      const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j);
-      const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j);
-      const cu_info_t* cur_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y);
+  const cu_info_t* cur_pu = cur_cu; // uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y);
 
-      const cu_info_t* left_pu = NULL;
-      const cu_info_t* above_pu = NULL;
+  const cu_info_t* left_pu = NULL;
+  const cu_info_t* above_pu = NULL;
 
-      if (pu_x > 0) {
-        assert(pu_x >> 2 > 0);
-        left_pu = uvg_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
-      }
-      // Don't take the above PU across the LCU boundary.
-      if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
-        assert(pu_y >> 2 > 0);
-        above_pu = uvg_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
-      }
+  if (x > 0) {
+    assert(x >> 2 > 0);
+    left_pu = lcu ?
+                LCU_GET_CU_AT_PX(
+                  lcu,
+                  SUB_SCU(x - 1),
+                  SUB_SCU(y + cu_width - 1)) :
+                uvg_cu_array_at_const(
+                  frame->cu_array,
+                  x - 1,
+                  y + cu_width - 1);
+  }
+  // Don't take the above PU across the LCU boundary.
+  if (y % LCU_WIDTH > 0 && y > 0) {
+    assert(y >> 2 > 0);
+    above_pu = lcu ?
+                 LCU_GET_CU_AT_PX(
+                   lcu,
+                   SUB_SCU(x + cu_width - 1),
+                   SUB_SCU(y -1)) :
+                 uvg_cu_array_at_const(
+                   frame->cu_array,
+                   x + cu_width - 1,
+                   y - 1);
+  }
+  
+  uvg_intra_get_dir_luma_predictor(x, y,
+    intra_preds,
+    cur_pu,
+    left_pu, above_pu);
 
+  intra_pred_mode_actual = cur_pu->intra.mode;
 
-      uvg_intra_get_dir_luma_predictor(pu_x, pu_y,
-        intra_preds[j],
-        cur_pu,
-        left_pu, above_pu);
-
-
-      intra_pred_mode_actual[j] = cur_pu->intra.mode;
-
-      for (int i = 0; i < INTRA_MPM_COUNT; i++) {
-        if (intra_preds[j][i] == intra_pred_mode[j]) {
-          mpm_preds[j] = (int8_t)i;
-          break;
-        }
-      }
-      // Is the mode in the MPM array or not
-      flag[j] = (mpm_preds[j] == -1) ? 0 : 1;
-      if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) {
-        CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag");
-      }
+  for (int i = 0; i < INTRA_MPM_COUNT; i++) {
+    if (intra_preds[i] == *intra_pred_mode) {
+      mpm_preds = (int8_t)i;
+      break;
+    }
+  }
+  // Is the mode in the MPM array or not
+  flag = (mpm_preds == -1) ? 0 : 1;
+  if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) {
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.intra_luma_mpm_flag_model), flag, bits, "prev_intra_luma_pred_flag");
+  }
+    
+  // Signal index of the prediction mode in the prediction list, if it is there
+  if (flag) {
+    
+    const cu_info_t* cur_pu = cur_cu;
+    if (cur_pu->intra.multi_ref_idx == 0) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]), (mpm_preds > 0 ? 1 : 0), bits, "mpm_idx_luma_planar");
     }
 
-    for (int j = 0; j < num_pred_units; ++j) {
-      // TODO: this loop is unnecessary in VVC. Remove in future
-      assert(j == 0 && "In VVC this loop should be run only once.");
+    if (mpm_preds > 0) {
+      CABAC_BIN_EP(cabac, (mpm_preds > 1 ? 1 : 0), "mpm_idx");
+      if (cabac->only_count) bits += 1;
+    }
+    if (mpm_preds > 1) {
+      CABAC_BIN_EP(cabac, (mpm_preds > 2 ? 1 : 0), "mpm_idx");
+      if (cabac->only_count) bits += 1;
+    }
+    if (mpm_preds > 2) {
+      CABAC_BIN_EP(cabac, (mpm_preds > 3 ? 1 : 0), "mpm_idx");
+      if (cabac->only_count) bits += 1;
+    }
+    if (mpm_preds > 3) {
+      CABAC_BIN_EP(cabac, (mpm_preds > 4 ? 1 : 0), "mpm_idx");
+      if (cabac->only_count) bits += 1;
+    }
+  }
+  else {
+    // Signal the actual prediction mode.
+    int32_t tmp_pred = *intra_pred_mode;
 
-      // Signal index of the prediction mode in the prediction list, if it is there
-      if (flag[j]) {
+    uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2];
+    memcpy(intra_preds_temp, intra_preds, sizeof(int8_t) * 3);
+    memcpy(intra_preds_temp + 4, &intra_preds[3], sizeof(int8_t) * 3);
+    intra_preds_temp[3] = 255;
+    intra_preds_temp[7] = 255;
 
-        const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j);
-        const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j);
-        const cu_info_t* cur_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y);
-        cabac->cur_ctx = &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]);
-        if (cur_pu->intra.multi_ref_idx == 0) {
-          CABAC_BIN(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx_luma_planar");
-        }
-        //CABAC_BIN_EP(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx");
-        if (mpm_preds[j] > 0) {
-          CABAC_BIN_EP(cabac, (mpm_preds[j] > 1 ? 1 : 0), "mpm_idx");
-        }
-        if (mpm_preds[j] > 1) {
-          CABAC_BIN_EP(cabac, (mpm_preds[j] > 2 ? 1 : 0), "mpm_idx");
-        }
-        if (mpm_preds[j] > 2) {
-          CABAC_BIN_EP(cabac, (mpm_preds[j] > 3 ? 1 : 0), "mpm_idx");
-        }
-        if (mpm_preds[j] > 3) {
-          CABAC_BIN_EP(cabac, (mpm_preds[j] > 4 ? 1 : 0), "mpm_idx");
-        }
+    // Improvised merge sort
+    // Sort prediction list from lowest to highest.
+    if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t);
+    if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t);
+    if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t);
+
+    if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t);
+    if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t);
+    if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t);
+
+    // Merge two subarrays
+    int32_t array1 = 0;
+    int32_t array2 = 4;
+    for (int item = 0; item < INTRA_MPM_COUNT; item++) {
+      if (intra_preds_temp[array1] < intra_preds_temp[array2]) {
+        intra_preds[item] = intra_preds_temp[array1];
+        array1++;
       }
       else {
-        // Signal the actual prediction mode.
-        int32_t tmp_pred = intra_pred_mode[j];
-
-        uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2];
-        memcpy(intra_preds_temp, intra_preds[j], sizeof(int8_t) * 3);
-        memcpy(intra_preds_temp + 4, &intra_preds[j][3], sizeof(int8_t) * 3);
-        intra_preds_temp[3] = 255;
-        intra_preds_temp[7] = 255;
-
-        // Improvised merge sort
-        // Sort prediction list from lowest to highest.
-        if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t);
-        if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t);
-        if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t);
-
-        if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t);
-        if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t);
-        if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t);
-
-        // Merge two subarrays
-        int32_t array1 = 0;
-        int32_t array2 = 4;
-        for (int item = 0; item < INTRA_MPM_COUNT; item++) {
-          if (intra_preds_temp[array1] < intra_preds_temp[array2]) {
-            intra_preds[j][item] = intra_preds_temp[array1];
-            array1++;
-          }
-          else {
-            intra_preds[j][item] = intra_preds_temp[array2];
-            array2++;
-          }
-        }
-
-        // Reduce the index of the signaled prediction mode according to the
-        // prediction list, as it has been already signaled that it's not one
-        // of the prediction modes.
-        for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) {
-          if (tmp_pred > intra_preds[j][i]) {
-            tmp_pred--;
-          }
-        }
-
-        uvg_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT);
+        intra_preds[item] = intra_preds_temp[array2];
+        array2++;
       }
     }
-  }
 
-  // Code chroma prediction mode.
-  if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
-  }
-
-  encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
-
-  encode_mts_idx(state, cabac, cur_cu);
-
-  if (state->encoder_control->chroma_format != UVG_CSP_400 && depth == 4 && x % 8 && y % 8) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
-    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
-  }
+    // Reduce the index of the signaled prediction mode according to the
+    // prediction list, as it has been already signaled that it's not one
+    // of the prediction modes.
+    for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) {
+      if (tmp_pred > intra_preds[i]) {
+        tmp_pred--;
+      }
+    }
 
+    uvg_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT, bits_out);
+  }    
+  if (cabac->only_count && bits_out) *bits_out += bits;
 }
 
 /**
@@ -1104,32 +1099,32 @@ static void encode_part_mode(encoder_state_t * const state,
   //  log2CbSize == MinCbLog2SizeY |  0  1  2  bypass
   //  log2CbSize >  MinCbLog2SizeY |  0  1  3  bypass
   // ------------------------------+------------------
-
+  double bits = 0;
   if (cur_cu->type == CU_INTRA) {
     if (depth == MAX_DEPTH) {
       cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
       if (cur_cu->part_size == SIZE_2Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
       } else {
-        CABAC_BIN(cabac, 0, "part_mode NxN");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode NxN");
       }
     }
   } else {
 
     cabac->cur_ctx = &(cabac->ctx.part_size_model[0]);
     if (cur_cu->part_size == SIZE_2Nx2N) {
-      CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
-      return;
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 1, bits, "part_mode 2Nx2N");
+      return bits;
     }
-    CABAC_BIN(cabac, 0, "part_mode split");
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[0]), 0, bits, "part_mode split");
 
     cabac->cur_ctx = &(cabac->ctx.part_size_model[1]);
     if (cur_cu->part_size == SIZE_2NxN ||
         cur_cu->part_size == SIZE_2NxnU ||
         cur_cu->part_size == SIZE_2NxnD) {
-      CABAC_BIN(cabac, 1, "part_mode vertical");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 1, bits, "part_mode vertical");
     } else {
-      CABAC_BIN(cabac, 0, "part_mode horizontal");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[1]), 0, bits, "part_mode horizontal");
     }
 
     if (state->encoder_control->cfg.amp_enable && depth < MAX_DEPTH) {
@@ -1137,22 +1132,134 @@ static void encode_part_mode(encoder_state_t * const state,
 
       if (cur_cu->part_size == SIZE_2NxN ||
           cur_cu->part_size == SIZE_Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode SMP");
-        return;
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 1, bits, "part_mode SMP");
+        return bits;
       }
-      CABAC_BIN(cabac, 0, "part_mode AMP");
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.part_size_model[3]), 0, bits, "part_mode AMP");
 
       if (cur_cu->part_size == SIZE_2NxnU ||
           cur_cu->part_size == SIZE_nLx2N) {
         CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP");
+        if(cabac->only_count) bits += 1;
       } else {
         CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP");
+        if(cabac->only_count) bits += 1;
       }
     }
   }
+  return bits;
 }
 **/
 
+// Codes the split flag(s) of the CU at tile-relative (x, y) into cabac; left_cu/above_cu may be NULL. The local bits total (passed to CABAC_FBITS_UPDATE) is added to *bits_out when bits_out is non-NULL. Returns split_flag, forced on when the CU reaches past ctrl->in.width/height.
+bool uvg_write_split_flag(const encoder_state_t * const state, cabac_data_t* cabac,
+  const cu_info_t * left_cu, const cu_info_t * above_cu,
+  uint8_t split_flag,
+  int depth, int cu_width, int x, int y, double* bits_out)
+{
+  uint16_t abs_x = x + state->tile->offset_x; // absolute position = given position + tile offset
+  uint16_t abs_y = y + state->tile->offset_y;
+  double bits = 0;
+  const encoder_control_t* const ctrl = state->encoder_control;
+  // Implicit split flag when on border
+  // Exception made in VVC with flag not being implicit if the BT can be used for
+  // horizontal or vertical split, then this flag tells if QT or BT is used
+
+  bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split;
+  no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true;
+  if (depth > MAX_DEPTH) allow_qt = false;
+  // ToDo: update this when btt is actually used
+  bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH
+  
+
+  uint8_t implicit_split_mode = UVG_NO_SPLIT;
+  //bool implicit_split = border;
+  bool bottom_left_available = ((abs_y + cu_width - 1) < ctrl->in.height); // last CU row index is below ctrl->in.height
+  bool top_right_available = ((abs_x + cu_width - 1) < ctrl->in.width); // last CU column index is below ctrl->in.width
+
+  if (!bottom_left_available && !top_right_available && allow_qt) {
+    implicit_split_mode = UVG_QUAD_SPLIT;
+  }
+  else if (!bottom_left_available && allow_btt) {
+    implicit_split_mode = UVG_HORZ_SPLIT;
+  }
+  else if (!top_right_available && allow_btt) {
+    implicit_split_mode = UVG_VERT_SPLIT;
+  }
+  else if (!bottom_left_available || !top_right_available) {
+    implicit_split_mode = UVG_QUAD_SPLIT;
+  }
+  
+  // Check split conditions
+  if (implicit_split_mode != UVG_NO_SPLIT) {
+    no_split = th_split = tv_split = false;
+    bh_split = (implicit_split_mode == UVG_HORZ_SPLIT);
+    bv_split = (implicit_split_mode == UVG_VERT_SPLIT);
+  }
+
+  if (!allow_btt) {
+    bh_split = bv_split = th_split = tv_split = false;
+  }
+
+  bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split;
+
+  split_flag |= implicit_split_mode != UVG_NO_SPLIT; // an implicit split always forces the flag on
+
+  int split_model = 0;
+  if (no_split && allow_split) { // no_split is cleared above whenever an implicit split mode was chosen
+    // Increase the model number for each present neighbour whose PU height (left) or PU width (above) is smaller than this CU's width
+    // ToDo: should use height and width to increase model, PU_GET_W() ?
+    if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) {
+      split_model++;
+    }
+
+    if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) {
+      split_model++;
+    }
+
+    uint32_t split_num = 0;
+    if (allow_qt) split_num += 2;
+    if (bh_split) split_num++;
+    if (bv_split) split_num++;
+    if (th_split) split_num++;
+    if (tv_split) split_num++;
+
+    if (split_num > 0) split_num--;
+
+    split_model += 3 * (split_num >> 1);
+
+    cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]);
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag, bits, "split_flag");
+  }
+
+  bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT;
+
+  if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) { // allow_btt is constant false above, so this is currently never coded
+    split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3);
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag");
+  }
+
+  // Only signal split when it is not implicit, currently only Qt split supported
+  if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) {
+    // NOTE(review): unreachable as written - an implicit split mode sets split_flag above, which makes qt_split true; confirm the intended condition.
+    split_model = 0;
+
+    // Get left and top block split_flags and if they are present and true, increase model number
+    if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) {
+      split_model++;
+    }
+
+    if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) {
+      split_model++;
+    }
+
+    split_model += (depth > 2 ? 0 : 3);
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), split_flag, bits, "split_cu_mode");
+  }
+  if (bits_out) *bits_out += bits;
+  return split_flag;
+}
+
 void uvg_encode_coding_tree(encoder_state_t * const state,
                             uint16_t x,
                             uint16_t y,
@@ -1176,8 +1283,6 @@ void uvg_encode_coding_tree(encoder_state_t * const state,
     above_cu = uvg_cu_array_at_const((const cu_array_t*)frame->cu_array, x, y - 1);
   }
 
-  uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
-  uint8_t split_model = 0;
 
   // Absolute coordinates
   uint16_t abs_x = x + state->tile->offset_x;
@@ -1190,123 +1295,15 @@ void uvg_encode_coding_tree(encoder_state_t * const state,
   bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
   bool border = border_x || border_y; /*!< are we in any border CU */
 
-  if (depth <= ctrl->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
     state->must_code_qp_delta = true;
   }
 
   // When not in MAX_DEPTH, insert split flag and split the blocks if needed
   if (depth != MAX_DEPTH) {
 
-    // Implisit split flag when on border
-    // Exception made in VVC with flag not being implicit if the BT can be used for
-    // horizontal or vertical split, then this flag tells if QT or BT is used
-
-    bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split;
-    no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true;
-    if(depth > MAX_DEPTH) allow_qt = false;
-    // ToDo: update this when btt is actually used
-    bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH
-
+    const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, GET_SPLITDATA(cur_cu, depth), depth, cu_width, x, y, NULL);
     
-
-    uint8_t implicit_split_mode = UVG_NO_SPLIT;
-    //bool implicit_split = border;
-    bool bottom_left_available = ((abs_y + cu_width - 1) < ctrl->in.height);
-    bool top_right_available = ((abs_x + cu_width - 1) < ctrl->in.width);
-
-    /*
-    if((depth >= 1 && (border_x != border_y))) implicit_split = false;
-    if (state->frame->slicetype != UVG_SLICE_I) {
-      if (border_x != border_y) implicit_split = false;
-      if (!bottom_left_available && top_right_available) implicit_split = false;
-      if (!top_right_available && bottom_left_available) implicit_split = false;
-    }
-    */
-
-
-    if (!bottom_left_available && !top_right_available && allow_qt) {
-      implicit_split_mode = UVG_QUAD_SPLIT;
-    } else if (!bottom_left_available && allow_btt) {
-      implicit_split_mode = UVG_HORZ_SPLIT;
-    } else if (!top_right_available && allow_btt) {
-      implicit_split_mode = UVG_VERT_SPLIT;
-    } else if (!bottom_left_available || !top_right_available) {
-      implicit_split_mode = UVG_QUAD_SPLIT;
-    }
-
-    //split_flag = implicit_split_mode != UVG_NO_SPLIT;
-
-    // Check split conditions
-    if (implicit_split_mode != UVG_NO_SPLIT) {
-      no_split = th_split = tv_split = false;
-      bh_split = (implicit_split_mode == UVG_HORZ_SPLIT);
-      bv_split = (implicit_split_mode == UVG_VERT_SPLIT);
-    }
-
-    if (!allow_btt) {
-      bh_split = bv_split = th_split = tv_split = false;
-    }
-
-    bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split;
-
-    split_flag |= implicit_split_mode != UVG_NO_SPLIT;
-
-    if (no_split && allow_split) {
-      split_model = 0;
-      
-      // Get left and top block split_flags and if they are present and true, increase model number
-      // ToDo: should use height and width to increase model, PU_GET_W() ?
-      if (left_cu && PU_GET_H(left_cu->part_size,LCU_WIDTH>>left_cu->depth,0) < LCU_WIDTH>>depth) {
-        split_model++;
-      }
-
-      if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) {
-        split_model++;
-      }
-
-      uint32_t split_num = 0;
-      if (allow_qt) split_num+=2;
-      if (bh_split) split_num++;
-      if (bv_split) split_num++;
-      if (th_split) split_num++;
-      if (tv_split) split_num++;
-
-      if (split_num > 0) split_num--;
-
-      split_model += 3 * (split_num >> 1);
-
-      cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]);
-      CABAC_BIN(cabac, split_flag, "SplitFlag");
-      //fprintf(stdout, "split_model=%d  %d / %d / %d / %d / %d\n", split_model, allow_qt, bh_split, bv_split, th_split, tv_split);
-    }
-
-    bool qt_split = split_flag || implicit_split_mode == UVG_QUAD_SPLIT;
-
-    if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) {
-      split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3);
-      cabac->cur_ctx = &(cabac->ctx.qt_split_flag_model[split_model]);
-      CABAC_BIN(cabac, qt_split, "QT_SplitFlag");
-    }
-
-    // Only signal split when it is not implicit, currently only Qt split supported
-    if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) {
-
-      split_model = 0;
-
-      // Get left and top block split_flags and if they are present and true, increase model number
-      if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) {
-        split_model++;
-      }
-
-      if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) {
-        split_model++;
-      }
-      split_model += (depth > 2 ? 0 : 3);
-
-      cabac->cur_ctx = &(cabac->ctx.qt_split_flag_model[split_model]);
-      CABAC_BIN(cabac, split_flag, "split_cu_mode");
-    }
-
     if (split_flag || border) {
       // Split blocks and remember to change x and y block positions
       uvg_encode_coding_tree(state, x, y, depth + 1, coeff);
@@ -1455,7 +1452,7 @@ void uvg_encode_coding_tree(encoder_state_t * const state,
       const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
       const cu_info_t *cur_pu = uvg_cu_array_at_const(frame->cu_array, pu_x, pu_y);
 
-      non_zero_mvd |= encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth);
+      non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL);
       DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu);
       uvg_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu);
     }
@@ -1493,7 +1490,22 @@ void uvg_encode_coding_tree(encoder_state_t * const state,
 
     }
   } else if (cur_cu->type == CU_INTRA) {
-    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth, coeff);
+    uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, NULL, NULL);
+
+    // Code chroma prediction mode.
+    if (state->encoder_control->chroma_format != UVG_CSP_400 && depth != 4) {
+      encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm);
+    }
+
+    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
+
+    encode_mts_idx(state, cabac, cur_cu);
+
+    // For 4x4 the chroma PU/TU is coded after the last 4x4 luma CU of the 8x8 area (x % 8 and y % 8 both non-zero)
+    if (state->encoder_control->chroma_format != UVG_CSP_400 && depth == 4 && x % 8 && y % 8) {
+      encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm);
+      encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);    
+    }
   }
 
   else {
@@ -1510,11 +1522,111 @@ end:
 
 }
 
+double uvg_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu) {
+  double bits = 0;
+  const encoder_control_t* const ctrl = state->encoder_control;
+  // Passes the CU-level syntax below through cabac and returns the total gathered in bits.
+  int x_local = SUB_SCU(x);
+  int y_local = SUB_SCU(y);
+
+  const int cu_width = LCU_WIDTH >> depth;
+  // Left/above neighbours are looked up in lcu only when x / y is non-zero; otherwise they stay NULL.
+  const cu_info_t* left_cu = NULL, *above_cu = NULL;
+  if (x) {
+    left_cu = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local);
+  }
+  if (y) {
+    above_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local-1);
+  }
+  // Side effect on state: request QP delta coding for CUs within the frame's QP delta depth.
+  if (depth <= state->frame->max_qp_delta_depth) {
+    state->must_code_qp_delta = true;
+  }
+
+  // When not in MAX_DEPTH, account for a split flag with the value 0
+  if (depth != MAX_DEPTH) {
+    uvg_write_split_flag(state, cabac, left_cu, above_cu, 0, depth, cu_width, x, y, &bits);
+  }
+
+  // Encode skip flag
+  if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) {
+    int8_t ctx_skip = 0;
+
+    if (left_cu && left_cu->skipped) {
+      ctx_skip++;
+    }
+    if (above_cu && above_cu->skipped) {
+      ctx_skip++;
+    }
+    // The context index is the number of skipped neighbours (0..2).
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_skip_flag_model[ctx_skip]), cur_cu->skipped, bits, "SkipFlag");
+    // For a skipped CU only the merge index follows; the function returns right after it.
+    if (cur_cu->skipped) {
+      int16_t num_cand = state->encoder_control->cfg.max_merge;
+      if (num_cand > 1) {
+        for (int ui = 0; ui < num_cand - 1; ui++) {
+          int32_t symbol = (ui != cur_cu->merge_idx);
+          if (ui == 0) {
+            CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
+          }
+          else {
+            CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+            if(cabac->only_count) bits += 1;
+          }
+          if (symbol == 0) {
+            break;
+          }
+        }
+      }
+      return bits;
+    }
+  }
+  // Prediction mode
+  if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) {
+
+    int8_t ctx_predmode = 0;
+
+    if ((left_cu && left_cu->type == CU_INTRA) || (above_cu && above_cu->type == CU_INTRA)) {
+      ctx_predmode = 1;
+    }
+
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_pred_mode_model[ctx_predmode]), (cur_cu->type == CU_INTRA), bits, "PredMode");
+  }
+  // imv_mode is fixed to UVG_IMV_OFF below, so the nested imv_flag branches are never taken.
+  if (cur_cu->type == CU_INTER) {
+    const uint8_t imv_mode = UVG_IMV_OFF;
+    const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits);
+    if (ctrl->cfg.amvr && non_zero_mvd) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag");
+      if (imv_mode > UVG_IMV_OFF) {
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[4]), imv_mode, bits, "imv_flag");
+        if (imv_mode < UVG_IMV_HPEL) {
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[1]), imv_mode, bits, "imv_flag"); // 1 indicates 4PEL, 0 FPEL
+        }
+      }
+    }
+  }
+  else if (cur_cu->type == CU_INTRA) {
+    uvg_encode_intra_luma_coding_unit(state, cabac, cur_cu, x, y, depth, lcu, &bits);
+    if((depth != 4 || (x % 8 != 0 && y % 8 != 0)) && state->encoder_control->chroma_format != UVG_CSP_400) {
+      encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm);
+    }
+  }
+  else {
+    assert(0 && "Unset cu type");
+  }
+  return bits;
+}
+
 
 void uvg_encode_mvd(encoder_state_t * const state,
                     cabac_data_t *cabac,
                     int32_t mvd_hor,
-                    int32_t mvd_ver)
+                    int32_t mvd_ver, double* bits_out)
 {
   const int8_t hor_abs_gr0 = mvd_hor != 0;
   const int8_t ver_abs_gr0 = mvd_ver != 0;
@@ -1522,29 +1634,33 @@ void uvg_encode_mvd(encoder_state_t * const state,
   const uint32_t mvd_ver_abs = abs(mvd_ver);
 
   cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0];
-  CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
-  CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");
+  CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_hor != 0), *bits_out, "abs_mvd_greater0_flag_hor");
+  CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[0], (mvd_ver != 0), *bits_out, "abs_mvd_greater0_flag_ver");
 
   cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1];
   if (hor_abs_gr0) {
-    CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
+    CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_hor_abs>1), *bits_out,"abs_mvd_greater1_flag_hor");
   }
   if (ver_abs_gr0) {
-    CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
+    CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_mvd_model[1], (mvd_ver_abs>1), *bits_out, "abs_mvd_greater1_flag_ver");
   }
 
   if (hor_abs_gr0) {
     if (mvd_hor_abs > 1) {
-      uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      uint32_t bits = uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      if(cabac->only_count) *bits_out += bits;
     }
     uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1;
     CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor");
+    if (cabac->only_count) *bits_out += 1;
   }
   if (ver_abs_gr0) {
     if (mvd_ver_abs > 1) {
-      uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      uint32_t bits = uvg_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      if (cabac->only_count) *bits_out += bits;
     }
     uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1;
     CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver");
+    if (cabac->only_count) *bits_out += 1;
   }
 }
diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h
index 8141d19b..92e46e04 100644
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@@ -56,7 +56,33 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
 void uvg_encode_mvd(encoder_state_t * const state,
                     cabac_data_t *cabac,
                     int32_t mvd_hor,
-                    int32_t mvd_ver);
+                    int32_t mvd_ver,
+                    double* bits_out);
+
+double uvg_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu);
+
+int uvg_encode_inter_prediction_unit(encoder_state_t* const state,
+                                      cabac_data_t* const cabac,
+                                      const cu_info_t* const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, 
+                                      lcu_t* lcu,
+                                      double* bits_out);
+
+void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state,
+  cabac_data_t* const cabac,
+  const cu_info_t* const cur_cu,
+  int x, int y, int depth, const lcu_t* lcu, double* bits_out);
+
+
+bool uvg_write_split_flag(const encoder_state_t* const state, cabac_data_t* cabac,
+  const cu_info_t* left_cu, const cu_info_t* above_cu,
+  uint8_t split_flag,
+  int depth, int cu_width, int x, int y, double* bits_out);
 
 void uvg_encode_last_significant_xy(cabac_data_t * const cabac,
   uint8_t lastpos_x, uint8_t lastpos_y,
diff --git a/src/encoder.c b/src/encoder.c
index daaa717e..86259ad9 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -32,7 +32,6 @@
 
 #include "encoder.h"
 
-// This define is required for M_PI on Windows.
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <stdio.h>
@@ -45,14 +44,6 @@
 #include "uvg_math.h"
 #include "fast_coeff_cost.h"
 
-/**
- * \brief Strength of QP adjustments when using adaptive QP for 360 video.
- *
- * Determined empirically.
- */
-static const double ERP_AQP_STRENGTH = 3.0;
-
-
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
 
 static unsigned cfg_num_threads(void)
@@ -136,22 +127,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder)
 }
 
 
-/**
- * \brief Return weight for 360 degree ERP video
- *
- * Returns the scaling factor of area from equirectangular projection to
- * spherical surface.
- *
- * \param y   y-coordinate of the pixel
- * \param h   height of the picture
- */
-static double ws_weight(int y, int h)
-{
-  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
-}
-
-
-
 /**
  * \brief Update ROI QPs for 360 video with equirectangular projection.
  *
@@ -162,55 +137,6 @@ static double ws_weight(int y, int h)
  * \param orig_width    width of orig_roi
  * \param orig_height   height of orig_roi
  */
-static void init_erp_aqp_roi(encoder_control_t* encoder,
-                             int8_t *orig_roi,
-                             int32_t orig_width,
-                             int32_t orig_height)
-{
-  // Update ROI with WS-PSNR delta QPs.
-  int height = encoder->in.height_in_lcu;
-  int width  = orig_roi ? orig_width : 1;
-
-  int frame_height = encoder->in.real_height;
-
-  encoder->cfg.roi.width  = width;
-  encoder->cfg.roi.height = height;
-  encoder->cfg.roi.dqps   = calloc(width * height, sizeof(orig_roi[0]));
-
-  double total_weight = 0.0;
-  for (int y = 0; y < frame_height; y++) {
-    total_weight += ws_weight(y, frame_height);
-  }
-
-  for (int y_lcu = 0; y_lcu < height; y_lcu++) {
-    int y_orig = LCU_WIDTH * y_lcu;
-    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
-
-    double lcu_weight = 0.0;
-    for (int y = y_orig; y < y_orig + lcu_height; y++) {
-      lcu_weight += ws_weight(y, frame_height);
-    }
-    // Normalize.
-    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
-
-    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
-
-    if (orig_roi) {
-      // If a ROI array already exists, we copy the existing values to the
-      // new array while adding qp_delta to each.
-      int y_roi = y_lcu * orig_height / height;
-      for (int x = 0; x < width; x++) {
-        encoder->cfg.roi.dqps[x + y_lcu * width] =
-          CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta);
-      }
-
-    } else {
-      // Otherwise, simply write qp_delta to the ROI array.
-      encoder->cfg.roi.dqps[y_lcu] = qp_delta;
-    }
-  }
-}
-
 
 static int8_t* derive_chroma_QP_mapping_table(const uvg_config* const cfg, int i)
 {
@@ -394,6 +320,16 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
     encoder->scaling_list.use_default_list = 1;
   }
 
+  // ROI / delta QP
+  if (cfg->roi.file_path) {
+    const char *mode[2] = { "r", "rb" };
+    encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]);
+    if (!encoder->roi_file) {
+      fprintf(stderr, "Could not open ROI file.\n");
+      goto init_failed;
+    }
+  }
+
   if (cfg->fast_coeff_table_fn) {
     FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
     if (fast_coeff_table_f == NULL) {
@@ -435,32 +371,10 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
     goto init_failed;
   }
 
-  if (cfg->erp_aqp) {
-    init_erp_aqp_roi(encoder,
-                     cfg->roi.dqps,
-                     cfg->roi.width,
-                     cfg->roi.height);
-
-  } else if (cfg->roi.dqps) {
-    // Copy delta QP array for ROI coding.
-    const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height;
-    encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0]));
-    memcpy(encoder->cfg.roi.dqps,
-           cfg->roi.dqps,
-           roi_size * sizeof(*cfg->roi.dqps));
-
-  }
-
   // NOTE: When tr_depth_inter is equal to 0, the transform is still split
   // for SMP and AMP partition units.
   encoder->tr_depth_inter = 0;
 
-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
-    encoder->max_qp_delta_depth = 0;
-  } else {
-    encoder->max_qp_delta_depth = -1;
-  }
-
   //Tiles
   encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                           encoder->cfg.tiles_height_count > 1;
@@ -761,7 +675,7 @@ void uvg_encoder_control_free(encoder_control_t *const encoder)
 
   FREE_POINTER(encoder->tiles_tile_id);
 
-  FREE_POINTER(encoder->cfg.roi.dqps);
+  FREE_POINTER(encoder->cfg.roi.file_path);
 
   uvg_scalinglist_destroy(&encoder->scaling_list);
 
@@ -773,6 +687,10 @@ void uvg_encoder_control_free(encoder_control_t *const encoder)
 
   uvg_close_rdcost_outfiles();
 
+  if (encoder->roi_file) {
+    fclose(encoder->roi_file);
+  }
+
   free(encoder);
 }
 
diff --git a/src/encoder.h b/src/encoder.h
index 86bf2529..02dc26b7 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -130,7 +130,7 @@ typedef struct encoder_control_t
   //! Picture weights when GOP is used.
   double gop_layer_weights[MAX_GOP_LAYERS];
 
-  int8_t max_qp_delta_depth;
+  FILE *roi_file;
 
   int tr_depth_inter;
 
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index ae346526..402ec559 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -805,7 +805,7 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream,
   WRITE_U(stream, 0, 1, "pps_ref_wraparound_enabled_flag");
 
   WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pps_init_qp_minus26");
-  WRITE_U(stream, encoder->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
+  WRITE_U(stream, state->frame->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
 
   WRITE_U(stream, 0,1, "pps_chroma_tool_offsets_present_flag");
   /* // If chroma_tool_offsets_present
@@ -1037,8 +1037,8 @@ static void uvg_encoder_state_write_bitstream_picture_header(
   const int poc_lsb = state->frame->poc & ((1 << encoder->poc_lsb_bits) - 1);
   WRITE_U(stream, poc_lsb, encoder->poc_lsb_bits, "ph_pic_order_cnt_lsb");
 
-  if (encoder->max_qp_delta_depth >= 0) {
-    WRITE_UE(stream, encoder->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_intra_slice");
+  if (state->frame->max_qp_delta_depth >= 0) {
+    WRITE_UE(stream, state->frame->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_intra_slice");
   }
 
   // alf enable flags and aps IDs
@@ -1118,8 +1118,8 @@ static void uvg_encoder_state_write_bitstream_picture_header(
     || state->frame->pictype == UVG_NAL_IDR_N_LP) {
   }
   else {
-    if (encoder->max_qp_delta_depth >= 0) {
-      WRITE_UE(stream, encoder->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_inter_slice");
+    if (state->frame->max_qp_delta_depth >= 0) {
+      WRITE_UE(stream, state->frame->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_inter_slice");
     }
     if (state->encoder_control->cfg.tmvp_enable) {
       WRITE_U(stream, state->encoder_control->cfg.tmvp_enable, 1, "ph_pic_temporal_mvp_enabled_flag");
@@ -1128,7 +1128,7 @@ static void uvg_encoder_state_write_bitstream_picture_header(
   }
 
   if (encoder->cfg.jccr) {
-    WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag");
+    WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag");
   }
   // END PICTURE HEADER
 
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 32d86d65..5a99e588 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -32,6 +32,9 @@
 
 #include "encoderstate.h"
 
+ // This define is required for M_PI on Windows.
+#define _USE_MATH_DEFINES
+#include <ctype.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -53,6 +56,12 @@
 
 #include "strategies/strategies-picture.h"
 
+/**
+ * \brief Strength of QP adjustments when using adaptive QP for 360 video.
+ *
+ * Determined empirically.
+ */
+static const double ERP_AQP_STRENGTH = 3.0;
 
 int uvg_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
   int i;
@@ -572,7 +581,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
   cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y);
   const int cu_width = LCU_WIDTH >> depth;
 
-  if (depth <= state->encoder_control->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
     *prev_qp = -1;
   }
 
@@ -624,6 +633,38 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
   }
 }
 
+
+// Sets frame->jccr_sign: true for 4:0:0 input, otherwise true only when the
+// high-pass filtered (i.e., zero-mean) Cb and Cr planes correlate negatively.
+static void set_joint_cb_cr_modes(encoder_state_t* state, uvg_picture* pic)
+{
+  if (state->encoder_control->chroma_format == UVG_CSP_400) {
+    state->frame->jccr_sign = true;
+    return;
+  }
+
+  // Chroma planes are addressed with half of the picture width/height/stride.
+  const int last_col = pic->width / 2 - 1;
+  const int last_row = pic->height / 2 - 1;
+  const int cs = pic->stride / 2;
+  int64_t correlation = 0;
+
+  // Border samples are skipped because the filter reads all 8 neighbours.
+  for (int row = 1; row < last_row; row++) {
+    const uvg_pixel* cb_row = pic->u + row * cs;
+    const uvg_pixel* cr_row = pic->v + row * cs;
+    for (int col = 1; col < last_col; col++) {
+      const int cb_cross = (int)cb_row[col - 1] + (int)cb_row[col + 1] + (int)cb_row[col - cs] + (int)cb_row[col + cs];
+      const int cb_diag = (int)cb_row[col - 1 - cs] + (int)cb_row[col + 1 - cs] + (int)cb_row[col - 1 + cs] + (int)cb_row[col + 1 + cs];
+      const int cr_cross = (int)cr_row[col - 1] + (int)cr_row[col + 1] + (int)cr_row[col - cs] + (int)cr_row[col + cs];
+      const int cr_diag = (int)cr_row[col - 1 - cs] + (int)cr_row[col + 1 - cs] + (int)cr_row[col - 1 + cs] + (int)cr_row[col + 1 + cs];
+      correlation += (12 * (int)cb_row[col] - 2 * cb_cross - cb_diag) * (12 * (int)cr_row[col] - 2 * cr_cross - cr_diag);
+    }
+  }
+
+  state->frame->jccr_sign = (correlation < 0);
+}
+
 static void encoder_state_worker_encode_lcu_bitstream(void* opaque);
 
 static void encoder_state_worker_encode_lcu_search(void * opaque)
@@ -665,7 +706,7 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
 
   encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);
 
-  if (encoder->max_qp_delta_depth >= 0) {
+  if (state->frame->max_qp_delta_depth >= 0) {
     int last_qp = state->last_qp;
     int prev_qp = -1;
     set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
@@ -716,6 +757,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
   const uint64_t existing_bits = uvg_bitstream_tell(&state->stream);
 
   //Encode SAO
+  state->cabac.update = 1;
   if (encoder->cfg.sao_type) {
     encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
   }
@@ -771,6 +813,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
       uvg_cabac_start(&state->cabac);
     }
   }
+  state->cabac.update = 0;
 
 
   pthread_mutex_lock(&state->frame->rc_lock);
@@ -1421,6 +1464,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
   }
 }
 
+
+/**
+ * \brief Area weight of a pixel row in 360 degree ERP video
+ *
+ * Scaling factor of area from equirectangular projection to the sphere.
+ *
+ * \param y   y-coordinate of the pixel
+ * \param h   height of the picture
+ */
+static double ws_weight(int y, int h)
+{
+  const double angle = (y - 0.5 * h + 0.5) * (M_PI / h);
+  return cos(angle);
+}
+
+
+/**
+ * \brief Update ROI QPs for 360 video with equirectangular projection.
+ *
+ * Replaces frame->roi with a map that has one row per LCU row. Each row gets
+ * a delta QP derived from ws_weight(); an existing ROI map is resampled
+ * row-wise with the delta added. frame->roi is untouched if allocation fails.
+ *
+ * \param encoder       encoder control
+ * \param frame         frame that will have the ROI map
+ */
+static void init_erp_aqp_roi(const encoder_control_t *encoder, uvg_picture *frame)
+{
+  int8_t *orig_roi    = frame->roi.roi_array;
+  int32_t orig_width  = frame->roi.width;
+  int32_t orig_height = frame->roi.height;
+
+  // Update ROI with WS-PSNR delta QPs.
+  int new_height = encoder->in.height_in_lcu;
+  int new_width = orig_roi ? orig_width : 1;
+  int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0]));
+  if (!new_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    assert(0);
+    return;
+  }
+
+  int frame_height = encoder->in.real_height;
+  double total_weight = 0.0;
+  for (int y = 0; y < frame_height; y++) {
+    total_weight += ws_weight(y, frame_height);
+  }
+
+  for (int y_lcu = 0; y_lcu < new_height; y_lcu++) {
+    int y_orig = LCU_WIDTH * y_lcu;
+    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
+    double lcu_weight = 0.0;
+    for (int y = y_orig; y < y_orig + lcu_height; y++) {
+      lcu_weight += ws_weight(y, frame_height);
+    }
+    // Normalize.
+    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
+    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
+    if (orig_roi) {
+      // Copy the existing ROI values to the new array, adding qp_delta to each.
+      int y_roi = y_lcu * orig_height / new_height;
+      for (int x = 0; x < new_width; x++) {
+        new_array[x + y_lcu * new_width] =
+          CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta);
+      }
+    } else {
+      // Otherwise, simply write qp_delta to the ROI array.
+      new_array[y_lcu] = qp_delta;
+    }
+  }
+
+  frame->roi.width = new_width;
+  frame->roi.height = new_height;
+  frame->roi.roi_array = new_array;
+  FREE_POINTER(orig_roi);
+}
+
+
+/**
+ * \brief Read the next ROI frame from an open ROI file into frame->roi.
+ *
+ * An ROI frame is the map width and height followed by width * height delta
+ * QP values. A seekable file is rewound when only (text mode) whitespace is
+ * left, so a single ROI frame can serve a whole sequence and --loop-input.
+ */
+static void next_roi_frame_from_file(uvg_picture *frame, FILE *file, enum uvg_roi_format format) {
+  if (ftell(file) != -1L) {
+    int c = fgetc(file);
+    while (format == UVG_ROI_TXT && isspace(c)) c = fgetc(file);
+    ungetc(c, file);
+    if (c == EOF) rewind(file);
+  }
+
+  int *width  = &frame->roi.width;
+  int *height = &frame->roi.height;
+  bool failed = false;
+  // fscanf returns EOF (non-zero) at end of input, so require one conversion.
+  if (format == UVG_ROI_TXT) failed = fscanf(file, "%d", width) != 1 || fscanf(file, "%d", height) != 1;
+  // Read the 32-bit dimensions one by one, independent of the roi layout.
+  if (format == UVG_ROI_BIN) failed = fread(width, 4, 1, file) != 1 || fread(height, 4, 1, file) != 1;
+
+  if (failed) {
+    fprintf(stderr, "Failed to read ROI size.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  if (*width <= 0 || *height <= 0) {
+    fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height);
+    fclose(file);
+    assert(0);
+  }
+
+  if (*width > 10000 || *height > 10000) {
+    fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  const unsigned size = (*width) * (*height);
+  int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0]));
+  if (!dqp_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  FREE_POINTER(frame->roi.roi_array);
+  frame->roi.roi_array = dqp_array;
+
+  if (format == UVG_ROI_TXT) {
+    for (unsigned i = 0; i < size; ++i) {
+      int number = 0; // Need a pointer to int for fscanf
+      if (fscanf(file, "%d", &number) != 1) {
+        fprintf(stderr, "Reading ROI file failed.\n");
+        fclose(file);
+        assert(0);
+      }
+      dqp_array[i] = CLIP(-51, 51, number);
+    }
+  } else if (format == UVG_ROI_BIN) {
+    if (fread(dqp_array, 1, size, file) != size) {
+      fprintf(stderr, "Reading ROI file failed.\n");
+      assert(0);
+    }
+  }
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_picture* frame) {
   assert(state->type == ENCODER_STATE_TYPE_MAIN);
 
@@ -1437,6 +1628,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict
     memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu);
   }
 
+  // ROI / delta QP maps
+  if (frame->roi.roi_array && cfg->roi.file_path) {
+    assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified.");
+  }
+
+  // Read frame from the file. If no file is specified,
+  // ROI data should be already set by the application.
+  if (cfg->roi.file_path) {
+    next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format);
+  }
+  
+  if (cfg->erp_aqp) {
+    init_erp_aqp_roi(state->encoder_control, state->tile->frame->source);
+  }
+
   // Variance adaptive quantization
   if (cfg->vaq) {
     const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
@@ -1523,6 +1729,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict
   }
   // Variance adaptive quantization - END
 
+  if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) {
+    state->frame->max_qp_delta_depth = 0;
+  } else {
+    state->frame->max_qp_delta_depth = -1;
+  }
+
   // Use this flag to handle closed gop irap picture selection.
   // If set to true, irap is already set and we avoid
   // setting it based on the intra period
@@ -1689,6 +1901,7 @@ void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame)
 
 
   encoder_state_init_new_frame(state, frame);
+  if(state->encoder_control->cfg.jccr) set_joint_cb_cr_modes(state, frame);
   
   // Create a separate job for ALF done after everything else, and only then do final bitstream writing (for ALF parameters)
   if (state->encoder_control->cfg.alf_type && state->encoder_control->cfg.wpp) {
@@ -1834,10 +2047,9 @@ lcu_stats_t* uvg_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y)
 
 int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp)
 {
-  const encoder_control_t *ctrl = state->encoder_control;
   const cu_array_t *cua = state->tile->frame->cu_array;
   // Quantization group width
-  const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth);
+  const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth);
 
   // Coordinates of the top-left corner of the quantization group
   const int x_qg = x & ~(qg_width - 1);
diff --git a/src/encoderstate.h b/src/encoderstate.h
index 620af515..40e1dc24 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -179,6 +179,8 @@ typedef struct encoder_state_config_frame_t {
   */
   double *aq_offsets;
 
+  int8_t max_qp_delta_depth;
+
   /**
    * \brief Whether next NAL is the first NAL in the access unit.
    */
@@ -193,6 +195,7 @@ typedef struct encoder_state_config_frame_t {
 
   cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row
   uint8_t* hmvp_size; //!< \brief HMVP LUT size
+  bool jccr_sign; 
 
 } encoder_state_config_frame_t;
 
@@ -320,6 +323,7 @@ typedef struct encoder_state_t {
   
   bitstream_t stream;
   cabac_data_t cabac;
+  cabac_data_t search_cabac;
 
   uint32_t stats_bitstream_length; //Bitstream length written in bytes
 
@@ -402,10 +406,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
  */
 static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) return false;
+  if (state->frame->max_qp_delta_depth < 0) return false;
 
   const int cu_width = LCU_WIDTH >> depth;
-  const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth;
+  const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
   const int right  = x + cu_width;
   const int bottom = y + cu_width;
   return (right % qg_width == 0 || right >= state->tile->frame->width) &&
diff --git a/src/fast_coeff_cost.c b/src/fast_coeff_cost.c
index f077ec21..d708fbfd 100644
--- a/src/fast_coeff_cost.c
+++ b/src/fast_coeff_cost.c
@@ -40,7 +40,7 @@ static uint16_t to_q88(float f)
   return (uint16_t)(f * 256.0f + 0.5f);
 }
 
-static uint64_t to_4xq88(const float f[4])
+static uint64_t to_4xq88(const double f[4])
 {
   int i;
   uint64_t result = 0;
@@ -58,9 +58,9 @@ int uvg_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_
   uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp;
 
   for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) {
-    float curr_wts[4];
+    double curr_wts[4];
 
-    if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0,
+    if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0,
                                                     curr_wts + 1,
                                                     curr_wts + 2,
                                                     curr_wts + 3) != 4) {
diff --git a/src/fast_coeff_cost.h b/src/fast_coeff_cost.h
index 0639a34c..5c53fdf1 100644
--- a/src/fast_coeff_cost.h
+++ b/src/fast_coeff_cost.h
@@ -45,7 +45,7 @@ typedef struct {
 
 // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from
 // 0 to MAX_FAST_COEFF_COST_QP
-static const float default_fast_coeff_cost_wts[][4] = {
+static const double default_fast_coeff_cost_wts[][4] = {
   // Just extend it by stretching the first actual values..
   {0.164240f, 4.161530f, 3.509033f, 6.928047f},
   {0.164240f, 4.161530f, 3.509033f, 6.928047f},
diff --git a/src/filter.c b/src/filter.c
index 656b7889..1641109d 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -339,7 +339,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir)
 
 static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) {
+  if (state->frame->max_qp_delta_depth < 0) {
     return state->qp;
   }
 
diff --git a/src/image.c b/src/image.c
index 48a1e958..ff960f26 100644
--- a/src/image.c
+++ b/src/image.c
@@ -106,6 +106,10 @@ uvg_picture * uvg_image_alloc(enum uvg_chroma_format chroma_format, const int32_
 
   im->interlacing = UVG_INTERLACING_NONE;
 
+  im->roi.roi_array = NULL;
+  im->roi.width = 0;
+  im->roi.height = 0;
+
   return im;
 }
 
@@ -132,6 +136,7 @@ void uvg_image_free(uvg_picture *const im)
     uvg_image_free(im->base_image);
   } else {
     free(im->fulldata_buf);
+    if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array);
   }
 
   // Make sure freed data won't be used.
@@ -192,6 +197,8 @@ uvg_picture *uvg_image_make_subimage(uvg_picture *const orig_image,
   im->pts = 0;
   im->dts = 0;
 
+  im->roi = orig_image->roi;
+
   return im;
 }
 
diff --git a/src/inter.c b/src/inter.c
index d28d7002..7333a3cf 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -624,7 +624,9 @@ void uvg_inter_pred_pu(const encoder_state_t * const state,
                        int i_pu)
 
 {
-  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+  const int x_scu = SUB_SCU(x);
+  const int y_scu = SUB_SCU(y);
+  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu);
   const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu);
   const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu);
   const int pu_w = PU_GET_W(cu->part_size, width, i_pu);
@@ -673,6 +675,12 @@ void uvg_inter_pred_pu(const encoder_state_t * const state,
       NULL,
       predict_luma, predict_chroma);
   }
+
+  if (predict_chroma && state->encoder_control->cfg.jccr) {
+    const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
+    uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+    uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+  }
 }
 
 /**
@@ -1290,7 +1298,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
                                         int32_t width,
                                         int32_t height,
                                         const merge_candidates_t *merge_cand,
-                                        const cu_info_t *cur_cu,
+                                        const cu_info_t * const cur_cu,
                                         int8_t reflist,
                                         mv_t mv_cand[2][2])
 {
@@ -1396,7 +1404,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t width,
                            int32_t height,
                            mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t  * const cur_cu,
                            lcu_t *lcu,
                            int8_t reflist)
 {
diff --git a/src/inter.h b/src/inter.h
index 3d3ae797..45f5e5ea 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -96,7 +96,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t width,
                            int32_t height,
                            mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t* cur_cu,
                            lcu_t *lcu,
                            int8_t reflist);
 
diff --git a/src/intra.c b/src/intra.c
index 8f1d9aab..97702498 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -82,6 +82,17 @@ static const uint8_t num_ref_pixels_left[16][16] = {
   { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
 };
 
+
+static void mip_predict(
+  const encoder_state_t* const state,
+  const uvg_intra_references* const refs,
+  const uint16_t pred_block_width,
+  const uint16_t pred_block_height,
+  uvg_pixel* dst,
+  const int mip_mode,
+  const bool mip_transp);
+
+
 int8_t uvg_intra_get_dir_luma_predictor(
   const uint32_t x,
   const uint32_t y,
@@ -452,7 +463,7 @@ static void get_cclm_parameters(
   }
 }
 
-static void linear_transform_cclm(cclm_parameters_t* cclm_params, uvg_pixel * src, uvg_pixel * dst, int stride, int height) {
+static void linear_transform_cclm(const cclm_parameters_t* cclm_params, uvg_pixel * src, uvg_pixel * dst, int stride, int height) {
   int scale = cclm_params->a;
   int shift = cclm_params->shift;
   int offset = cclm_params->b;
@@ -468,7 +479,7 @@ static void linear_transform_cclm(cclm_parameters_t* cclm_params, uvg_pixel * sr
 }
 
 
-void uvg_predict_cclm(
+static void predict_cclm(
   encoder_state_t const* const state,
   const color_t color,
   const int8_t width,
@@ -477,7 +488,7 @@ void uvg_predict_cclm(
   const int16_t y0,
   const int16_t stride,
   const int8_t mode,
-  lcu_t* const lcu,
+  const lcu_t* const lcu,
   uvg_intra_references* chroma_ref,
   uvg_pixel* dst,
   cclm_parameters_t* cclm_params
@@ -498,6 +509,7 @@ void uvg_predict_cclm(
 
 
   uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;
+  const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA);
 
   // Essentially what this does is that it uses 6-tap filtering to downsample
   // the luma intra references down to match the resolution of the chroma channel.
@@ -508,12 +520,12 @@ void uvg_predict_cclm(
   if (y0) {
     for (; available_above_right < width / 2; available_above_right++) {
       int x_extension = x_scu + width * 2 + 4 * available_above_right;
-      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
+      const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
       if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
     }
     if(y_scu == 0) {
       if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
-      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(uvg_pixel) * (width + available_above_right * 2));
+      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride2 / 2)], sizeof(uvg_pixel) * (width + available_above_right * 2));
     }
     else {
       for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
@@ -533,16 +545,16 @@ void uvg_predict_cclm(
   if(x0) {
     for (; available_left_below < height / 2; available_left_below++) {
       int y_extension = y_scu + height * 2 + 4 * available_left_below;
-      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
+      const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
       if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
       if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
     }
     for(int i = 0; i < height + available_left_below * 2; i++) {
-      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1];
+      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1];
     }    
   }
 
-  uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width);
+  uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride2) / 4], sampled_luma, width, height, stride2 / 2, width);
 
   int16_t a, b, shift;
   get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
@@ -727,12 +739,17 @@ void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int*
 }
 
 
+
 /** \brief Matrix weighted intra prediction.
 */
-void uvg_mip_predict(encoder_state_t const* const state, uvg_intra_references* const refs,
-                     const uint16_t pred_block_width, const uint16_t pred_block_height,
-                     uvg_pixel* dst,
-                     const int mip_mode, const bool mip_transp)
+static void mip_predict(
+  const encoder_state_t* const state,
+  const uvg_intra_references* const refs,
+  const uint16_t pred_block_width,
+  const uint16_t pred_block_height,
+  uvg_pixel* dst,
+  const int mip_mode,
+  const bool mip_transp)
 {
   // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative
   
@@ -875,14 +892,13 @@ void uvg_mip_predict(encoder_state_t const* const state, uvg_intra_references* c
 }
 
 
-void uvg_intra_predict(
-  encoder_state_t *const state,
+static void intra_predict_regular(
+  const encoder_state_t* const state,
   uvg_intra_references *refs,
   int_fast8_t log2_width,
   int_fast8_t mode,
   color_t color,
   uvg_pixel *dst,
-  bool filter_boundary,
   const uint8_t multi_ref_idx)
 {
   const int_fast8_t width = 1 << log2_width;
@@ -1350,18 +1366,66 @@ void uvg_intra_build_reference(
   }
 }
 
+
+void uvg_intra_predict(
+  const encoder_state_t* const state,
+  uvg_intra_references* const refs,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
+  uvg_pixel* dst,
+  const intra_search_data_t* data,
+  const lcu_t* lcu
+  )
+{
+  const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA);
+  // TODO: what is this used for?
+  // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
+  bool use_mip = false;
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
+  int8_t intra_mode = color == COLOR_Y ? data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma;
+  if (data->pred_cu.intra.mip_flag) {
+    if (color == COLOR_Y) {
+      use_mip = true;
+    }
+    else {
+      use_mip = state->encoder_control->chroma_format == UVG_CSP_444;
+      intra_mode = use_mip ? intra_mode : 0;
+    }
+  }
+  if (intra_mode < 68) {
+    if (use_mip) {
+      assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
+      mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
+    }
+    else {
+      intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx);
+    }
+  }
+  else {
+    uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width);
+    if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) {
+      predict_cclm(
+        state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, 
+        (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1]);
+    }
+    else {
+      linear_transform_cclm(&data->cclm_parameters[color == COLOR_U ? 0 : 1], dst, dst, width, width);
+    }
+  }
+}
+
+
 static void intra_recon_tb_leaf(
-  encoder_state_t *const state,
+  encoder_state_t* const state,
   int x,
   int y,
   int depth,
-  int8_t intra_mode,
-  cclm_parameters_t *cclm_params,
   lcu_t *lcu,
   color_t color,
-  uint8_t multi_ref_idx,
-  bool mip_flag,
-  bool mip_transp)
+  const intra_search_data_t* search_data)
 {
   const uvg_config *cfg = &state->encoder_control->cfg;
   const int shift = color == COLOR_Y ? 0 : 1;
@@ -1383,7 +1447,7 @@ static void intra_recon_tb_leaf(
   int x_scu = SUB_SCU(x);
   int y_scu = SUB_SCU(y);
   const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };
-  uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0;
+  uint8_t multi_ref_index = color == COLOR_Y ? search_data->pred_cu.intra.multi_ref_idx: 0;
 
   uvg_intra_references refs;
   // Extra reference lines for use with MRL. Extra lines needed only for left edge.
@@ -1406,42 +1470,14 @@ static void intra_recon_tb_leaf(
   uvg_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index);
 
   uvg_pixel pred[32 * 32];
-  int stride = state->tile->frame->source->stride;
-  const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
-  bool use_mip = false;
-  if (mip_flag) {
-    if (color == COLOR_Y) {
-      use_mip = true;
-    } else {
-      // MIP can be used for chroma if the chroma scheme is 444
-      if (state->encoder_control->chroma_format == UVG_CSP_444) {
-        use_mip = true;
-      } else {
-        // If MIP cannot be used for chroma, set mode to planar
-        intra_mode = 0;
-      }
-    }
-  }
 
-  if(intra_mode < 68) {
-    if (use_mip) {
-      assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
-      uvg_mip_predict(state, &refs, width, height, pred, intra_mode, mip_transp);
-    }
-    else {
-      uvg_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index);
-    }
-  } else {
-    uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width);
-    if(cclm_params == NULL) {
-      cclm_parameters_t temp_params;
-      uvg_predict_cclm(
-        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
-    }
-    else {
-      linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
-    }
-  }
+  cu_loc_t loc = {
+    x, y,
+    width, height,
+    width, height,
+  };
+
+  uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu);
 
   const int index = lcu_px.x + lcu_px.y * lcu_width;
   uvg_pixel *block = NULL;
@@ -1483,17 +1519,12 @@ static void intra_recon_tb_leaf(
  * \param lcu           containing LCU
  */
 void uvg_intra_recon_cu(
-  encoder_state_t *const state,
+  encoder_state_t* const state,
   int x,
   int y,
   int depth,
-  int8_t mode_luma,
-  int8_t mode_chroma,
+  intra_search_data_t* search_data,
   cu_info_t *cur_cu,
-  cclm_parameters_t *cclm_params,
-  uint8_t multi_ref_idx,
-  bool mip_flag,
-  bool mip_transp,
   lcu_t *lcu)
 {
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
@@ -1501,12 +1532,16 @@ void uvg_intra_recon_cu(
   if (cur_cu == NULL) {
     cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
-  uint8_t multi_ref_index = multi_ref_idx;
-  bool use_mip = mip_flag;
-  bool mip_transposed = mip_transp;
+  const int8_t mode_luma = search_data->pred_cu.intra.mode;
+  const int8_t mode_chroma= search_data->pred_cu.intra.mode_chroma;
+
+  if(mode_chroma != -1 && mode_luma == -1) {
+    x &= ~7;
+    y &= ~7;
+  }
   
   if (mode_luma != -1 && mode_chroma != -1) {
-    if (use_mip) {
+    if (search_data->pred_cu.intra.mip_flag) {
       assert(mode_luma == mode_chroma && "Chroma mode must be derived from luma mode if block uses MIP.");
     }
   }
@@ -1527,10 +1562,10 @@ void uvg_intra_recon_cu(
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    uvg_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
-    uvg_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
-    uvg_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
-    uvg_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
+    uvg_intra_recon_cu(state, x,   y,   depth + 1, search_data, NULL, lcu);
+    uvg_intra_recon_cu(state, x2,  y,   depth + 1, search_data, NULL, lcu);
+    uvg_intra_recon_cu(state, x,   y2,  depth + 1, search_data, NULL, lcu);
+    uvg_intra_recon_cu(state, x2,  y2,  depth + 1, search_data, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
@@ -1552,13 +1587,15 @@ void uvg_intra_recon_cu(
    
     // Process a leaf TU.
     if (has_luma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data);
     }
     if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data);
+      intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data);
     }
 
-    uvg_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
+    uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3),
+      search_data->pred_cu.joint_cb_cr != 4 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0),
+      x, y, depth, cur_cu, lcu, false);
   }
 }
diff --git a/src/intra.h b/src/intra.h
index dd90a87b..7f6c04d0 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -63,6 +63,18 @@ typedef struct
   int16_t b;
 } cclm_parameters_t;
 
+typedef struct {
+  cu_info_t pred_cu;
+  cclm_parameters_t cclm_parameters[2];
+  double cost;
+  double bits;
+  double coeff_bits;
+  double distortion;
+} intra_search_data_t ;
+
+
+#define UVG_NUM_INTRA_MODES 67
+
 /**
 * \brief Function for deriving intra luma predictions
 * \param x          x-coordinate of the PU in pixels
@@ -114,53 +126,22 @@ void uvg_intra_build_reference(
  * \param filter_boundary Whether to filter the boundary on modes 10 and 26.
  */
 void uvg_intra_predict(
-  encoder_state_t *const state,
-  uvg_intra_references *refs,
-  int_fast8_t log2_width,
-  int_fast8_t mode,
-  color_t color,
-  uvg_pixel *dst,
-  bool filter_boundary,
-  const uint8_t multi_ref_idx);
+  const encoder_state_t* const state,
+  uvg_intra_references* const refs,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
+  uvg_pixel* dst,
+  const intra_search_data_t* data,
+  const lcu_t* lcu
+);
 
 void uvg_intra_recon_cu(
-  encoder_state_t *const state,
+  encoder_state_t* const state,
   int x,
   int y,
   int depth,
-  int8_t mode_luma,
-  int8_t mode_chroma,
+  intra_search_data_t* search_data,
   cu_info_t *cur_cu,
-  cclm_parameters_t* cclm_params,
-  uint8_t multi_ref_idx,
-  bool mip_flag,
-  bool mip_transp,
   lcu_t *lcu);
 
-
-void uvg_predict_cclm(
-  encoder_state_t const* const state,
-  const color_t color,
-  const int8_t width,
-  const int8_t height,
-  const int16_t x0,
-  const int16_t y0,
-  const int16_t stride,
-  const int8_t mode,
-  lcu_t* const lcu,
-  uvg_intra_references* chroma_ref,
-  uvg_pixel* dst,
-  cclm_parameters_t* cclm_params
-);
-
 int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a);
-
-void uvg_mip_predict(
-  encoder_state_t const * const state,
-  uvg_intra_references * refs,
-  const uint16_t width,
-  const uint16_t height,
-  uvg_pixel* dst,
-  const int mip_mode,
-  const bool mip_transp
-);
\ No newline at end of file
diff --git a/src/rate_control.c b/src/rate_control.c
index 27cc86ba..ca2215a5 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -1088,17 +1088,18 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
   const encoder_control_t * const ctrl = state->encoder_control;
   lcu_stats_t *lcu = uvg_get_lcu_stats(state, pos.x, pos.y);
 
-  if (ctrl->cfg.roi.dqps != NULL) {
-    vector2d_t lcu = {
+  if (state->tile->frame->source->roi.roi_array) {
+    vector2d_t lcu_vec = {
       pos.x + state->tile->lcu_offset_x,
       pos.y + state->tile->lcu_offset_y
     };
+    // Map the frame-level LCU position onto the ROI delta QP grid of the source picture.
     vector2d_t roi = {
-      lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu,
-      lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu
+      lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu,
+      lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu
     };
-    int roi_index = roi.x + roi.y * ctrl->cfg.roi.width;
-    int dqp = ctrl->cfg.roi.dqps[roi_index];
+    int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width;
+    int dqp = state->tile->frame->source->roi.roi_array[roi_index];
     state->qp = CLIP_TO_QP(state->frame->QP + dqp);
     state->lambda = qp_to_lambda(state, state->qp);
     state->lambda_sqrt = sqrt(state->lambda);
diff --git a/src/rdo.c b/src/rdo.c
index 29bbdc97..8bad55a5 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -315,12 +315,12 @@ static INLINE uint32_t get_coeff_cabac_cost(
   // Take a copy of the CABAC so that we don't overwrite the contexts when
   // counting the bits.
   cabac_data_t cabac_copy;
-  memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
 
-  // Clear bytes and bits and set mode to "count"
+  // Set mode to "count" and remember the starting byte and bit counters
   cabac_copy.only_count = 1;
-  cabac_copy.num_buffered_bytes = 0;
-  cabac_copy.bits_left = 23;
+  int num_buffered_bytes = cabac_copy.num_buffered_bytes;
+  int bits_left = cabac_copy.bits_left;
 
   // Execute the coding function.
   // It is safe to drop the const modifier since state won't be modified
@@ -343,8 +343,10 @@ static INLINE uint32_t get_coeff_cabac_cost(
       type,
       scan_mode);
   }
-
-  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
+  if(cabac_copy.update) {
+    memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy));
+  }
+  return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3);
 }
 
 static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
@@ -1741,37 +1743,33 @@ void uvg_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
 /**
  * Calculate cost of actual motion vectors using CABAC coding
  */
-uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       const int32_t mvd_hor,
-                                       const int32_t mvd_ver)
+double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     const int32_t mvd_hor,
+                                     const int32_t mvd_ver)
 {
   cabac_data_t cabac_copy = *cabac;
   cabac_copy.only_count = 1;
-
+  double bits = 0;
   // It is safe to drop const here because cabac->only_count is set.
-  uvg_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
+  uvg_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits);
 
-  uint32_t bitcost =
-    ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
-    ((23 - cabac->bits_left)     + (cabac->num_buffered_bytes << 3));
-
-  return bitcost;
+  return bits;
 }
 
 /** MVD cost calculation with CABAC
-* \returns int
+* \returns double: the bit cost multiplied by state->lambda_sqrt
 * Calculates Motion Vector cost and related costs using CABAC coding
 */
-uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
-                                 int x,
-                                 int y,
-                                 int mv_shift,
-                                 mv_t mv_cand[2][2],
-                                 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                                 int16_t num_cand,
-                                 int32_t ref_idx,
-                                 uint32_t *bitcost)
+double uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
+                               int x,
+                               int y,
+                               int mv_shift,
+                               mv_t mv_cand[2][2],
+                               inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                               int16_t num_cand,
+                               int32_t ref_idx,
+                               double* bitcost)
 {
   cabac_data_t state_cabac_copy;
   cabac_data_t* cabac;
@@ -1798,14 +1796,13 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
   }
 
   // Store cabac state and contexts
-  memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
+  memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t));
 
-  // Clear bytes and bits and set mode to "count"
+  // Set mode to "count"
   state_cabac_copy.only_count = 1;
-  state_cabac_copy.num_buffered_bytes = 0;
-  state_cabac_copy.bits_left = 23;
 
   cabac = &state_cabac_copy;
+  double bits = 0;
 
   if (!merged) {
     vector2d_t mvd1 = {
@@ -1820,8 +1817,8 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
     uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1);
     uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2);
 
-    uint32_t cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
-    uint32_t cand2_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
+    double cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
+    double cand2_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
 
     // Select candidate 1 if it has lower cost
     if (cand2_cost < cand1_cost) {
@@ -1834,7 +1831,7 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
 
   cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
 
-  CABAC_BIN(cabac, merged, "MergeFlag");
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag");
   num_cand = state->encoder_control->cfg.max_merge;
   if (merged) {
     if (num_cand > 1) {
@@ -1842,10 +1839,10 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
       for (ui = 0; ui < num_cand - 1; ui++) {
         int32_t symbol = (ui != merge_idx);
         if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
         } else {
           CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+          bits += 1;
         }
         if (symbol == 0) break;
       }
@@ -1868,24 +1865,23 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
         if (ref_list[ref_list_idx] > 1) {
           // parseRefFrmIdx
           int32_t ref_frame = ref_idx;
-
-          cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-          CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
+          
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");
 
           if (ref_frame > 0) {
             int32_t i;
             int32_t ref_num = ref_list[ref_list_idx] - 2;
-
-            cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
+            
             ref_frame--;
 
             for (i = 0; i < ref_num; ++i) {
               const uint32_t symbol = (i == ref_frame) ? 0 : 1;
 
               if (i == 0) {
-                CABAC_BIN(cabac, symbol, "ref_idx_lX");
+                CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX");
               } else {
                 CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
+                bits += 1;
               }
               if (symbol == 0) break;
             }
@@ -1895,7 +1891,7 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
         // ToDo: Bidir vector support
         if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) {
           // It is safe to drop const here because cabac->only_count is set.
-          uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y);
+          uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits);
         }
 
         // Signal which candidate MV to use
@@ -1905,10 +1901,10 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
     }
   }
 
-  *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
+  *bitcost = bits;
 
   // Store bitcost before restoring cabac
-  return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
+  return *bitcost * state->lambda_sqrt;
 }
 
 void uvg_close_rdcost_outfiles(void)
diff --git a/src/rdo.h b/src/rdo.h
index 7a365254..46db8c90 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -77,10 +77,10 @@ uint32_t uvg_get_coded_level(encoder_state_t * state, double* coded_cost, double
 
 uvg_mvd_cost_func uvg_calc_mvd_cost_cabac;
 
-uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       int32_t mvd_hor,
-                                       int32_t mvd_ver);
+double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     int32_t mvd_hor,
+                                     int32_t mvd_ver);
 
 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
@@ -90,8 +90,5 @@ uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state,
 extern const uint32_t uvg_entropy_bits[512];
 #define CTX_ENTROPY_BITS(ctx, val) uvg_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
 
-// Floating point fractional bits, derived from uvg_entropy_bits
-extern const float uvg_f_entropy_bits[512];
-#define CTX_ENTROPY_FBITS(ctx, val) uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
 
 #endif
diff --git a/src/sao.c b/src/sao.c
index e61d638e..e83b8117 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) {
 }
 
 
-static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
+static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {    
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded type_idx_, none = 0
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+  CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type");
 
   return mode_bits;
 }
 
-static float sao_mode_bits_merge(const encoder_state_t * const state,
+static double sao_mode_bits_merge(const encoder_state_t * const state,
                                  int8_t merge_cand) {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   ctx = &(cabac->ctx.sao_merge_flag_model);
 
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag");
   if (merge_cand == 1) return mode_bits;
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag");
   return mode_bits;
 }
 
 
-static float sao_mode_bits_edge(const encoder_state_t * const state,
+static double sao_mode_bits_edge(const encoder_state_t * const state,
                               int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES],
                               sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
-    ctx = &(cabac->ctx.sao_merge_flag_model);   
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    ctx = &(cabac->ctx.sao_merge_flag_model);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded type_idx_, edge = 2 = cMax
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;
 
   // TR coded offsets.
   for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) {
@@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state,
 }
 
 
-static float sao_mode_bits_band(const encoder_state_t * const state,
+static double sao_mode_bits_band(const encoder_state_t * const state,
                               int band_position[2], int offsets[10],
                               sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded sao_type_idx_, band = 1
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;
 
   // TR coded offsets and possible FL coded offset signs.
   for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++)
@@ -552,7 +554,8 @@ static void sao_search_best_mode(const encoder_state_t * const state, const uvg_
   // Choose between SAO and doing nothing, taking into account the
   // rate-distortion cost of coding do nothing.
   {
-    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5);
+    float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left);
+    int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5);
     if (sao_out->ddistortion >= cost_of_nothing) {
       sao_out->type = SAO_TYPE_NONE;
       merge_cost[0] = cost_of_nothing;
diff --git a/src/search.c b/src/search.c
index ac58ef99..e3845569 100644
--- a/src/search.c
+++ b/src/search.c
@@ -37,6 +37,7 @@
 
 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "imagelist.h"
 #include "inter.h"
 #include "intra.h"
@@ -59,14 +60,6 @@
 // Cost threshold for doing intra search in inter frames with --rd=0.
 static const int INTRA_THRESHOLD = 8;
 
-// Modify weight of luma SSD.
-#ifndef LUMA_MULT
-# define LUMA_MULT 0.8
-#endif
-// Modify weight of chroma SSD.
-#ifndef CHROMA_MULT
-# define CHROMA_MULT 1.5
-#endif
 
 static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
@@ -225,16 +218,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
   const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
 
   double ssd = 0.0;
-  ssd += LUMA_MULT * uvg_pixels_calc_ssd(
+  ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
     &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
     LCU_WIDTH, LCU_WIDTH, cu_width
     );
   if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
-    ssd += CHROMA_MULT * uvg_pixels_calc_ssd(
+    ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
       &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
       LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
       );
-    ssd += CHROMA_MULT * uvg_pixels_calc_ssd(
+    ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
       &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
       LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
       );
@@ -251,7 +244,8 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
   int x_scu = SUB_SCU(x);
   int y_scu = SUB_SCU(y);
   y_rec += x_scu + y_scu * LCU_WIDTH;
-  int stride = state->tile->frame->source->stride;
+  const int stride = state->tile->frame->rec->stride;
+  const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA);
 
   for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) {
     for (int x_ = 0; x_ < width; x_++) {
@@ -265,13 +259,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
       s += y_rec[2 * x_ + LCU_WIDTH] * 2;
       s += y_rec[2 * x_ + 1 + LCU_WIDTH];
       s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
-      int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2;
+      int index = x / 2 + x_ + (y / 2 + y_ )* stride2 / 2;
       state->tile->frame->cclm_luma_rec[index] = s >> 3;
     }
     y_rec += LCU_WIDTH * 2;
   }
   if((y + height * 2) % 64 == 0) {
-    int line = y / 64 * stride / 2;
+    int line = y / 64 * stride2 / 2;
     y_rec -= LCU_WIDTH;
     for (int i = 0; i < width; ++i) {
       int s = 2;
@@ -294,11 +288,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
 * prediction unit data needs to be coded.
 */
 double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu)
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu)
 {
   const int width = LCU_WIDTH >> depth;
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
 
   // cur_cu is used for TU parameters.
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
@@ -324,14 +320,36 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
     return sum + tr_tree_bits * state->lambda;
   }
 
+
+  if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) {
+    // Because these need to be coded before the luma cbf they also need to be counted
+    // before the cabac state changes. However, since this branch is only executed when
+    // calculating the last RD cost it is not a problem to include the chroma cbf costs in
+    // luma, because the chroma cost is calculated right after the luma cost.
+    // However, if we have a different tr_depth, the bits cannot be written in the correct
+    // order anyway, so do not touch the chroma cbf here.
+    if (state->encoder_control->chroma_format != UVG_CSP_400) {
+      cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+      cabac->cur_ctx = cr_ctx;
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
+      cr_ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
+    }
+  }
+
   // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
   if (pred_cu->type == CU_INTRA ||
-      tr_depth > 0 ||
+      is_tr_split ||
       cbf_is_set(tr_cu->cbf, depth, COLOR_U) ||
       cbf_is_set(tr_cu->cbf, depth, COLOR_V))
   {
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[0]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y));
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
+    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
   }
 
   // SSD between reconstruction and original
@@ -343,7 +361,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
                                         width);
   }
 
-  {
+
+  if (!skip_residual_coding) {
     int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
     const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
 
@@ -351,23 +370,22 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LUMA_MULT + bits * state->lambda;
+  return (double)ssd * UVG_LUMA_MULT + bits * state->lambda;
 }
 
 
 double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu)
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu)
 {
   const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
   const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
 
   double tr_tree_bits = 0;
-  double joint_cbcr_tr_tree_bits = 0;
   double coeff_bits = 0;
-  double joint_coeff_bits = 0;
 
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
@@ -378,30 +396,28 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
     return 0;
   }
 
-  if (depth < MAX_PU_DEPTH) {
+  // See uvg_cu_rd_cost_luma for the reason behind the second condition.
+  if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) {
     const int tr_depth = depth - pred_cu->depth;
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+    cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+    cabac->cur_ctx = ctx;
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
-    }
-    if(state->encoder_control->cfg.jccr) {
-      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1);
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
     }
     int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
-    ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]);
+    ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]);
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
-    }
-    if(state->encoder_control->cfg.jccr) {
-      ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
-      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
     }
   }
 
 
   if (tr_cu->tr_depth > depth) {
     int offset = LCU_WIDTH >> (depth + 1);
-    int sum = 0;
+    double sum = 0;
 
     sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
     sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
@@ -418,15 +434,10 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
       ctx = &(state->cabac.ctx.joint_cb_cr[cbf_mask]);
       tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 0);      
     }
-    if(pred_cu->joint_cb_cr) {
-      ctx = &(state->cabac.ctx.joint_cb_cr[(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1]);
-      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 1);
-    }
   }
 
   // Chroma SSD
   int ssd = 0;
-  int joint_ssd = 0;
   if (!state->encoder_control->cfg.lossless) {
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
@@ -436,53 +447,266 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
                                     LCU_WIDTH_C,        LCU_WIDTH_C,
                                     width);
     ssd = ssd_u + ssd_v;
+  }
 
-    if(state->encoder_control->cfg.jccr) {
+  if (!skip_residual_coding)
+  {
+    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+    const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
+
+    coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0);
+    coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0);
+  }
+
+
+  double bits = tr_tree_bits + coeff_bits;
+
+  return (double)ssd * UVG_CHROMA_MULT + bits * state->c_lambda;
+}
+
+/**
+ * \brief Calculate the RD cost of a CU, including transform tree flag bits.
+ *
+ * Flag bits are accumulated with CABAC_FBITS_UPDATE on state->search_cabac.
+ * When the transform is split, the result is the sum of the four sub-block
+ * costs plus the flag bits of this level weighted by lambda.
+ *
+ * \return RD cost: weighted luma and chroma SSD plus bits * state->lambda.
+ */
+static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
+                                           const int x_px, const int y_px, const int depth,
+                                           const cu_info_t* const pred_cu,
+                                           lcu_t* const lcu) {
+  const int width = LCU_WIDTH >> depth;
+
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  // tr_cu is used for TU parameters.
+  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+
+  double coeff_bits = 0;
+  double tr_tree_bits = 0;
+
+  // Check that the coordinates are inside the LCU.
+  assert(x_px >= 0 && x_px < LCU_WIDTH);
+  assert(y_px >= 0 && y_px < LCU_WIDTH);
+
+  const uint8_t tr_depth = tr_cu->tr_depth - depth;
+
+  const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U);
+  const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V);
+
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+
+  const int cbf = cbf_is_set_any(pred_cu->cbf, depth);
+  // Only need to signal coded block flag if not skipped or merged
+  // skip = no coded residual, merge = coded residual
+  if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) {
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf");
+  }
+
+  if(state->encoder_control->chroma_format != UVG_CSP_400 && !skip_residual_coding) {
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
+    }
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr");
+    }
+  }
+
+  if (tr_depth > 0) {
+    int offset = LCU_WIDTH >> (depth + 1);
+    double sum = 0;
+
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    return sum + tr_tree_bits * state->lambda;
+  }
+  const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y);
+
+  // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = depth - tr_cu->depth;
+  if ((pred_cu->type == CU_INTRA ||
+    is_tr_split ||
+    cb_flag_u ||
+    cb_flag_v)
+      && !skip_residual_coding)
+  {
+    cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
+  }
+
+  if (cb_flag_y | cb_flag_u | cb_flag_v) {
+    // TODO qp_delta_sign_flag
+
+    if ((cb_flag_u | cb_flag_v) && x_px % 8 == 0 && y_px % 8 == 0 && state->encoder_control->cfg.jccr) {
+      CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag");
+    }
+  }
+
+  // SSD between reconstruction and original
+  unsigned luma_ssd = 0;
+  if (!state->encoder_control->cfg.lossless) {
+    int index = y_px * LCU_WIDTH + x_px;
+    luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
+      LCU_WIDTH, LCU_WIDTH,
+      width);
+  }
+
+  int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+  const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
+
+  coeff_bits += uvg_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode, tr_cu->tr_skip);
+
+  unsigned chroma_ssd = 0;
+  if(state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 != 0 && y_px % 8 != 0))) {
+    const vector2d_t lcu_px = { (x_px & ~7 ) / 2, (y_px & ~7) / 2 };
+    const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1));
+    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+    const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y); // z-order offset into the coefficient arrays
+    const int px_index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; // raster offset into the ref/rec pixel arrays
+    if(pred_cu->joint_cb_cr == 0) {
+      if (!state->encoder_control->cfg.lossless) {
+        unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[px_index], &lcu->rec.u[px_index],
+          LCU_WIDTH_C, LCU_WIDTH_C,
+          chroma_width);
+        unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[px_index], &lcu->rec.v[px_index],
+          LCU_WIDTH_C, LCU_WIDTH_C,
+          chroma_width);
+        chroma_ssd = ssd_u + ssd_v;
+      }
+
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], chroma_width, 2, scan_order, 0);
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], chroma_width, 2, scan_order, 0);
+    } else {
-      int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
+      int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[px_index], &lcu->rec.joint_u[px_index],
         LCU_WIDTH_C, LCU_WIDTH_C,
-        width);
-      int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
+        chroma_width);
+      int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[px_index], &lcu->rec.joint_v[px_index],
         LCU_WIDTH_C, LCU_WIDTH_C,
-        width);
-      joint_ssd = ssd_u_joint + ssd_v_joint;
+        chroma_width);
+      chroma_ssd = ssd_u_joint + ssd_v_joint;
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], chroma_width, 2, scan_order, 0);
     }
   }
 
+  double bits = tr_tree_bits + coeff_bits;
+  return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + bits * state->lambda;
+}
+
+
+void uvg_select_jccr_mode(
+  const encoder_state_t* const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  cu_info_t* pred_cu, // if NULL, the CU at (x_px, y_px) in lcu is used
+  lcu_t* const lcu,
+  double* cost_out) // if non-NULL, the cost of the chosen alternative is added to it
+{
+  const vector2d_t lcu_px = { (SUB_SCU(x_px) & ~7) / 2, (SUB_SCU(y_px) & ~7) / 2 };
+  const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+  if (pred_cu == NULL) pred_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x_px), SUB_SCU(y_px));
+  assert(pred_cu->depth == pred_cu->tr_depth && "jccr does not support transform splitting");
+  if (cost_out == NULL && pred_cu->joint_cb_cr == 0) { // nothing to select and nothing to report
+    return;
+  }
+
+  double tr_tree_bits = 0;
+  double joint_cbcr_tr_tree_bits = 0;
+  double coeff_bits = 0;
+  double joint_coeff_bits = 0;
+
+  assert(lcu_px.x >= 0 && lcu_px.x < LCU_WIDTH_C);
+  assert(lcu_px.y >= 0 && lcu_px.y < LCU_WIDTH_C);
+
+  if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) {
+    // At depth 4 only the block whose x and y are both unaligned to 8 is
+    // processed; the other blocks return here without touching *cost_out.
+    return;
+  }
+
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+  cabac->cur_ctx = ctx;
+  int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+  CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");   
+  ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]);
+  int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+  CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search");
+
+  int cbf_mask = u_is_set * 2 + v_is_set - 1; // -1 when neither chroma cbf is set
+  if((cbf_mask != -1 && pred_cu->type == CU_INTRA) || cbf_mask == 2)
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag");
+
+  if(pred_cu->joint_cb_cr) {
+    const int u_jccr = (pred_cu->joint_cb_cr >> 1) & 1; // bit 1 maps to the U cbf, bit 0 to the V cbf
+    ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+    CABAC_FBITS_UPDATE(cabac, ctx, u_jccr, joint_cbcr_tr_tree_bits, "cbf_cb_search");
+    ctx = &(cabac->ctx.qt_cbf_model_cr[u_jccr]);
+    CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cr_search");
+    cbf_mask = pred_cu->joint_cb_cr - 1;
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag");
+  }
+  unsigned ssd = 0;
+  unsigned joint_ssd = 0;
+  if (!state->encoder_control->cfg.lossless) {
+    const int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
+    const unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
+      LCU_WIDTH_C, LCU_WIDTH_C,
+      width);
+    const unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
+      LCU_WIDTH_C, LCU_WIDTH_C,
+      width);
+    ssd = ssd_u + ssd_v;
+
+    if (pred_cu->joint_cb_cr) {
+      const unsigned ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        width);
+      const unsigned ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        width);
+      joint_ssd = ssd_u_joint + ssd_v_joint;      
+    }    
+  }
+
   {
     int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
     const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
 
     coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0);
     coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0);
-
-    if(state->encoder_control->cfg.jccr) {
-      joint_coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0);
-    }
+    
+    joint_coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0);    
   }
 
 
   double bits = tr_tree_bits + coeff_bits;
   double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits;
 
-  double cost = (double)ssd + bits * state->c_lambda;
-  double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda;
+  double cost = (double)ssd * UVG_CHROMA_MULT + bits * state->c_lambda;
+  double joint_cost = (double)joint_ssd * UVG_CHROMA_MULT + joint_bits * state->c_lambda;
   if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) {
     pred_cu->joint_cb_cr = 0;
-    return cost;    
+    if (cost_out) *cost_out += cost;
+    return;
   }
   cbf_clear(&pred_cu->cbf, depth, COLOR_U);
   cbf_clear(&pred_cu->cbf, depth, COLOR_V);
-  if (pred_cu->joint_cb_cr & 1) {
+  if (pred_cu->joint_cb_cr & 2) {
     cbf_set(&pred_cu->cbf, depth, COLOR_U);
   }
-  if (pred_cu->joint_cb_cr & 2) {
+  if (pred_cu->joint_cb_cr & 1) {
     cbf_set(&pred_cu->cbf, depth, COLOR_V);
   }
   int lcu_width = LCU_WIDTH_C;
   const int index = lcu_px.x + lcu_px.y * lcu_width;
   uvg_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width);
   uvg_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width);
-  return joint_cost;
+  if (cost_out) *cost_out += joint_cost;
 }
 
 
@@ -492,23 +716,9 @@ static double calc_mode_bits(const encoder_state_t *state,
                              const cu_info_t * cur_cu,
                              int x, int y, int depth)
 {
-  int x_local = SUB_SCU(x);
-  int y_local = SUB_SCU(y);
-
   assert(cur_cu->type == CU_INTRA);
 
-  int8_t candidate_modes[INTRA_MPM_COUNT];
-  {
-    const cu_info_t *left_cu  = ((x >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local - SCU_WIDTH, y_local) : NULL);
-    const cu_info_t *above_cu = ((y >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local, y_local - SCU_WIDTH) : NULL);
-    uvg_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);
-  }
-
-  int width = LCU_WIDTH >> depth;
-  int height = width; // TODO: height for non-square blocks
-  int num_mip_modes_half = NUM_MIP_MODES_HALF(width, height);
-  int mip_flag_ctx_id = uvg_get_mip_flag_context(x, y, width, height, lcu, NULL);
-  double mode_bits = uvg_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, num_mip_modes_half, mip_flag_ctx_id);
+  double mode_bits = uvg_luma_mode_bits(state, cur_cu, x, y, depth, lcu);
 
   if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) {
     mode_bits += uvg_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode);
@@ -518,6 +728,7 @@ static double calc_mode_bits(const encoder_state_t *state,
 }
 
 
+// TODO: replace usages of this by the uvg_sort_keys_by_cost function.
 /**
  * \brief Sort modes and costs to ascending order according to costs.
  */
@@ -567,16 +778,25 @@ void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict traf
   }
 }
 
-
-
-static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth)
+/**
+ * \brief Reorder map->keys so that the costs they index are ascending.
+ */
+void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map)
 {
-  vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) };
-  bool condA = x >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x - 1, lcu_cu.y    )->depth > depth;
-  bool condL = y >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x,     lcu_cu.y - 1)->depth > depth;
-  return condA + condL;
+  // Insertion sort: the maps are expected to be small, so O(n^2) is fine.
+  for (uint8_t pos = 1; pos < map->size; ++pos) {
+    const int8_t key = map->keys[pos];
+    const double key_cost = map->cost[key];
+    uint8_t slot = pos;
+    // Shift every key with a strictly higher cost one step to the right.
+    for (; slot > 0 && key_cost < map->cost[map->keys[slot - 1]]; --slot) {
+      map->keys[slot] = map->keys[slot - 1];
+    }
+    map->keys[slot] = key;
+  }
 }
 
+
 /**
  * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode.
  * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH.
@@ -592,10 +812,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   const encoder_control_t* ctrl = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
   int cu_width = LCU_WIDTH >> depth;
-  double cost = MAX_INT;
-  double inter_zero_coeff_cost = MAX_INT;
-  uint32_t inter_bitcost = MAX_INT;
+  double cost = MAX_DOUBLE;
+  double inter_zero_coeff_cost = MAX_DOUBLE;
+  double inter_bitcost = MAX_INT;
   cu_info_t *cur_cu;
+  cabac_data_t pre_search_cabac;
+  memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac));
 
   const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
   const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
@@ -626,7 +848,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
   // Assign correct depth limit
   constraint_t* constr = state->constraint;
- if(constr->ml_intra_depth_ctu) {
+  if(constr->ml_intra_depth_ctu) {
     pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8];
     pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8];
   }
@@ -670,7 +892,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
     if (can_use_inter) {
       double mode_cost;
-      uint32_t mode_bitcost;
+      double mode_bitcost;
       uvg_search_cu_inter(state,
                           x, y,
                           depth,
@@ -693,33 +915,34 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
     int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max;
     bool can_use_intra =
-        WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
+      (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
         // When the split was forced because the CTU is partially outside
         // the frame, we permit intra coding even if pu_depth_intra would
         // otherwise forbid it.
         (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width ||
-        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height;
+        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) &&
+      !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I);
 
+    intra_search_data_t intra_search;
     if (can_use_intra && !skip_intra) {
-      int8_t intra_mode;
-      int8_t intra_trafo;
-      double intra_cost;
-      uint8_t multi_ref_index = 0;
-      bool mip_flag = false;
-      bool mip_transposed = false;
-      uvg_search_cu_intra(state, x, y, depth, lcu,
-                          &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed);
-      if (intra_cost < cost) {
-        cost = intra_cost;
+      intra_search.pred_cu = *cur_cu;
+      intra_search.pred_cu.joint_cb_cr = 4;
+      uvg_search_cu_intra(state, x, y, depth, &intra_search,
+                          lcu);
+#ifdef COMPLETE_PRED_MODE_BITS
+      // Technically counting these bits would be correct, however counting
+      // them universally degrades quality so this block is disabled by default
+      if(state->frame->slicetype != UVG_SLICE_I) {
+        double pred_mode_type_bits = 0;
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag");
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag");
+        intra_cost += pred_mode_type_bits * state->lambda;
+      }
+#endif
+      if (intra_search.cost < cost) {
+        cost = intra_search.cost;
+        *cur_cu = intra_search.pred_cu;
         cur_cu->type = CU_INTRA;
-        cur_cu->part_size = depth > MAX_DEPTH ? SIZE_NxN : SIZE_2Nx2N;
-        cur_cu->intra.mode = intra_mode;
-        cur_cu->intra.multi_ref_idx = multi_ref_index;
-        cur_cu->intra.mip_flag = mip_flag;
-        cur_cu->intra.mip_is_transposed = mip_transposed;
-
-        //If the CU is not split from 64x64 block, the MTS is disabled for that CU.
-        cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0;
       }
     }
 
@@ -727,20 +950,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {
       assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN);
-      cur_cu->intra.mode_chroma = cur_cu->intra.mode;
-      
+
+      intra_search.pred_cu.intra.mode_chroma = -1; // don't reconstruct chroma before search is performed for it
       lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
       uvg_intra_recon_cu(state,
                          x, y,
-                         depth,
-                         cur_cu->intra.mode, -1, // skip chroma
-                         NULL, NULL, cur_cu->intra.multi_ref_idx, 
-                         cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, 
+                         depth, &intra_search,
+                         NULL, 
                          lcu);
 
       downsample_cclm_rec(
         state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
       );
+      cur_cu->joint_cb_cr = 0;
 
       // TODO: This heavily relies to square CUs
       if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400) {
@@ -748,19 +970,47 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // rd2. Possibly because the luma mode search already takes chroma
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
-        cclm_parameters_t cclm_params[2];
+        intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; // skip luma
         if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) {
-          cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params);
+          cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search);
+
+          if (intra_search.pred_cu.joint_cb_cr == 0) intra_search.pred_cu.joint_cb_cr = 4;
+          else cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr;
+
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
-
+        intra_search.pred_cu.intra.mode = -1; // skip luma
         uvg_intra_recon_cu(state,
-                           x & ~7, y & ~7, // TODO: as does this
-                           depth,
-                           -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, cclm_params, 0, 
-                           cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           x, y, // TODO: as does this
+                           depth, &intra_search,
+                           NULL,
                            lcu);
+        if(depth != 0 && state->encoder_control->cfg.jccr && ctrl->cfg.rdo < 3) {
+          uvg_select_jccr_mode(state,
+                               x, y,
+                               depth,
+                               NULL,
+                               lcu,
+                               NULL);
+        }
+        else if(depth != 0 && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr & 3) {
+          assert(cur_cu->joint_cb_cr < 4);
+          cbf_clear(&cur_cu->cbf, depth, COLOR_U);
+          cbf_clear(&cur_cu->cbf, depth, COLOR_V);
+          if (cur_cu->joint_cb_cr & 2) {
+            cbf_set(&cur_cu->cbf, depth, COLOR_U);
+          }
+          if (cur_cu->joint_cb_cr & 1) {
+            cbf_set(&cur_cu->cbf, depth, COLOR_V);
+          }
+          const vector2d_t lcu_px = { (x_local & ~7) / 2, (y_local & ~7) / 2 };
+          int lcu_width = LCU_WIDTH_C;
+          const int index = lcu_px.x + lcu_px.y * lcu_width;
+          const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+          uvg_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width);
+          uvg_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width);
+
+        }
       }
     } else if (cur_cu->type == CU_INTER) {
 
@@ -788,11 +1038,20 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         }
 
         uvg_quantize_lcu_residual(state,
-          true, has_chroma,
-          x, y, depth,
-          NULL,
-          lcu,
-          false);
+                                  true, has_chroma,
+                                  state->encoder_control->cfg.jccr, x, y,
+                                  depth,
+                                  NULL,
+                                  lcu,
+                                  false);
+        if (cur_cu->depth == cur_cu->tr_depth && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr) {
+          uvg_select_jccr_mode(state,
+            x, y,
+            depth,
+            NULL,
+            lcu,
+            NULL);
+        }
 
         int cbf = cbf_is_set_any(cur_cu->cbf, depth);
 
@@ -800,9 +1059,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
           cur_cu->merged = 0;
           cur_cu->skipped = 1;
           // Selecting skip reduces bits needed to code the CU
-          if (inter_bitcost > 1) {
-            inter_bitcost -= 1;
-          }
+          int skip_ctx = uvg_get_skip_context(x, y, lcu, NULL, NULL);
+          inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1);
+          inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0);
+          inter_bitcost += cur_cu->merge_idx;        
         }
       }
       lcu_fill_inter(lcu, x_local, y_local, cu_width);
@@ -811,20 +1071,26 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   }
 
   if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
-    cost = uvg_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-    if (state->encoder_control->chroma_format != UVG_CSP_400) {
-      cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
+    double bits = 0;
+    cabac_data_t* cabac  = &state->search_cabac;
+    cabac->update = 1;
+
+    if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) {
+      bits += uvg_mock_encode_coding_unit(
+        state,
+        cabac,
+        x, y, depth,
+        lcu,
+        cur_cu);
     }
-
-    double mode_bits;
-    if (cur_cu->type == CU_INTRA) {
-      mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
-    } else {
-      mode_bits = inter_bitcost;
+    else {
+      assert(0);
     }
+    
+    cost = bits * state->lambda;
 
-    cost += mode_bits * state->lambda;
-
+    cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+    
     if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) {
       cost = inter_zero_coeff_cost;
 
@@ -846,13 +1112,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       cur_cu->cbf = 0;
       lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu);
     }
-  }
+    cabac->update = 0;
+  } 
 
   bool can_split_cu =
     // If the CU is partially outside the frame, we need to split it even
     // if pu_depth_intra and pu_depth_inter would not permit it.
     cur_cu->type == CU_NOTSET ||
-    depth < pu_depth_intra.max ||
+    (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) ||
     (state->frame->slicetype != UVG_SLICE_I &&
       depth < pu_depth_inter.max);
 
@@ -861,21 +1128,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     int half_cu = cu_width / 2;
     double split_cost = 0.0;
     int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+    cabac_data_t post_seach_cabac;
+    memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+    memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac));
+    state->search_cabac.update = 1;
+
+    double split_bits = 0;
 
     if (depth < MAX_DEPTH) {
       // Add cost of cu_split_flag.
-      uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-      cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-      split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;
+      uvg_write_split_flag(state, &state->search_cabac, 
+        x > 0 ? LCU_GET_CU_AT_PX(lcu,SUB_SCU(x) -1, SUB_SCU(y)): NULL,
+        y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL,
+        1, depth, cu_width, x, y, &split_bits);
     }
 
-    if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) {
-      // Add cost of intra part_size.
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]);
-      cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;  // 2Nx2N
-      split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;  // NxN
-    }
+    state->search_cabac.update = 0;
+    split_cost += split_bits * state->lambda;
 
     // If skip mode was selected for the block, skip further search.
     // Skip mode means there's no coefficients in the block, so splitting
@@ -897,13 +1166,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // searching.
     
     if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height 
+        && state->encoder_control->cfg.combine_intra_cus)
     {
+
       cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);
 
       // If the best CU in depth+1 is intra and the biggest it can be, try it.
       if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) {
+        cabac_data_t temp_cabac;
+        memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac));
+        memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac));
         cost = 0;
+        double bits = 0; // bits for signalling "no split" at this depth
+        uvg_write_split_flag(state, &state->search_cabac,
+          x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL,
+          y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL,
+          0, depth, cu_width, x, y, &bits); // accumulate into bits (not split_bits) so it reaches mode_bits
 
         cur_cu->intra = cu_d1->intra;
         cur_cu->type = CU_INTRA;
@@ -914,29 +1193,25 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
         uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth);
         lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
+        
+        intra_search_data_t proxy;
+        FILL(proxy, 0);
+        proxy.pred_cu = *cur_cu;
 
-        const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
-        const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1;
         uvg_intra_recon_cu(state,
                            x, y,
                            depth,
-                           cur_cu->intra.mode, mode_chroma,
-                           NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           &proxy,
+                           NULL,
                            lcu);
 
-        cost += uvg_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-        if (has_chroma) {
-          cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
-        }
-
-        // Add the cost of coding no-split.
-        uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-        const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-        cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-
-        // Add the cost of coding intra mode only once.
-        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
+        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits;
         cost += mode_bits * state->lambda;
+
+        cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+
+        memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+        memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac));
       }
     }
 
@@ -950,6 +1225,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     } else if (depth > 0) {
       // Copy this CU's mode all the way down for use in adjacent CUs mode
       // search.
+      memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac));
       work_tree_copy_down(x_local, y_local, depth, work_tree);
       downsample_cclm_rec(
         state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
@@ -962,6 +1238,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu);
       }
     }
+    else {
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
+      );      
+    }
   } else if (depth >= 0 && depth < MAX_PU_DEPTH) {
     // Need to copy modes down since the lower level of the work tree is used
     // when searching SMP and AMP blocks.
@@ -1139,6 +1420,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i
  */
 void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf, lcu_coeff_t *coeff)
 {
+  memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t));
+  state->search_cabac.only_count = 1;
   assert(x % LCU_WIDTH == 0);
   assert(y % LCU_WIDTH == 0);
 
diff --git a/src/search.h b/src/search.h
index 85e76d23..9b4d92f7 100644
--- a/src/search.h
+++ b/src/search.h
@@ -44,22 +44,62 @@
 #include "image.h"
 #include "constraint.h"
 
-#define NUM_MIP_MODES_FULL(width, height) ((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12)
-#define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL((width), (height)) >> 1
+#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS)
+
+ // Modify weight of luma SSD.
+#ifndef UVG_LUMA_MULT
+#define UVG_LUMA_MULT 0.8
+#endif
+// Modify weight of chroma SSD.
+#ifndef UVG_CHROMA_MULT
+#define UVG_CHROMA_MULT 1.5
+#endif
+
+ /**
+  *  \brief Data collected during search processes.
+  *
+  *         Collects statistics of the searched coding/prediction
+  *         units. The unit, cost and bits arrays are parallel:
+  *         entry i of each one describes the same unit. Index them
+  *         through the "keys" array, whose entries are meant to be
+  *         sorted by the RD costs of the units.
+  */
+typedef struct unit_stats_map_t {
+
+  cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units
+  double    cost[MAX_UNIT_STATS_MAP_SIZE]; //!< RD cost of the matching entry in unit
+  double    bits[MAX_UNIT_STATS_MAP_SIZE]; //!< bit cost of the matching entry in unit
+  int8_t    keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays
+  int       size;                    //!< number of active elements in the lists
+} unit_stats_map_t;
+
+#define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12))
+#define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1)
 
 void uvg_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
 void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length);
 
+void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map);
+
 void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);
 
 double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu);
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu);
 double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu);
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu);
+void uvg_select_jccr_mode(
+  const encoder_state_t* const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  cu_info_t* const pred_cu,
+  lcu_t* const lcu,
+  double* cost_out);
+
 void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
 
 void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
diff --git a/src/search_inter.c b/src/search_inter.c
index a6feb1f5..836f45e4 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -37,6 +37,7 @@
 
 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "image.h"
 #include "imagelist.h"
 #include "inter.h"
@@ -68,7 +69,7 @@ typedef struct {
   /**
    * \brief Top-left corner of the PU
    */
-  const vector2d_t origin;
+  vector2d_t origin;
   int32_t width;
   int32_t height;
 
@@ -78,19 +79,6 @@ typedef struct {
 
   uvg_mvd_cost_func *mvd_cost_func;
 
-  /**
-   * \brief Best motion vector among the ones tested so far
-   */
-  vector2d_t best_mv;
-  /**
-   * \brief Cost of best_mv
-   */
-  uint32_t best_cost;
-  /**
-   * \brief Bit cost of best_mv
-   */
-  uint32_t best_bitcost;
-
   /**
    * \brief Possible optimized SAD implementation for the width, leave as
    *        NULL for arbitrary-width blocks
@@ -205,20 +193,25 @@ static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int
 /**
  * \brief Calculate cost for an integer motion vector.
  *
- * Updates info->best_mv, info->best_cost and info->best_bitcost to the new
+ * Updates best_mv, best_cost and best_bitcost to the new
  * motion vector if it yields a lower cost than the current one.
  *
  * If the motion vector violates the MV constraints for tiles or WPP, the
  * cost is not set.
  *
- * \return true if info->best_mv was changed, false otherwise
+ * \return true if best_mv was changed, false otherwise
  */
-static bool check_mv_cost(inter_search_info_t *info, int x, int y)
+static bool check_mv_cost(inter_search_info_t *info,
+                          int x,
+                          int y,
+                          double *best_cost,
+                          double* best_bits,
+                          vector2d_t *best_mv)
 {
   if (!intmv_within_tile(info, x, y)) return false;
 
-  uint32_t bitcost = 0;
-  uint32_t cost = uvg_image_calc_sad(
+  double bitcost = 0;
+  double cost = uvg_image_calc_sad(
       info->pic,
       info->ref,
       info->origin.x,
@@ -230,25 +223,25 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y)
       info->optimized_sad
   );
 
-  if (cost >= info->best_cost) return false;
+  if (cost >= *best_cost) return false;
 
   cost += info->mvd_cost_func(
       info->state,
       x, y, INTERNAL_MV_PREC,
       info->mv_cand,
-      info->merge_cand,
-      info->num_merge_cand,
+      NULL,
+      0,
       info->ref_idx,
       &bitcost
   );
 
-  if (cost >= info->best_cost) return false;
+  if (cost >= *best_cost) return false;
 
   // Set to motion vector in internal pixel precision.
-  info->best_mv.x = x * (1 << INTERNAL_MV_PREC);
-  info->best_mv.y = y * (1 << INTERNAL_MV_PREC);
-  info->best_cost = cost;
-  info->best_bitcost = bitcost;
+  best_mv->x = x * (1 << INTERNAL_MV_PREC);
+  best_mv->y = y * (1 << INTERNAL_MV_PREC);
+  *best_cost = cost;
+  *best_bits = bitcost;
 
   return true;
 }
@@ -256,10 +249,10 @@ static bool check_mv_cost(inter_search_info_t *info, int x, int y)
 
 static unsigned get_ep_ex_golomb_bitcost(unsigned symbol)
 {
-  // Calculate 2 * log2(symbol + 2)
+  // Calculate 2 * log2(symbol)
 
   unsigned bins = 0;
-  symbol += 2;
+  symbol += 0;
   if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; }
   if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; }
   if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; }
@@ -299,12 +292,16 @@ static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv)
  * \brief Select starting point for integer motion estimation search.
  *
  * Checks the zero vector, extra_mv and merge candidates and updates
- * info->best_mv to the best one.
+ * best_mv to the best one.
  */
-static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv)
+static void select_starting_point(inter_search_info_t *info,
+                                  vector2d_t extra_mv,
+                                  double *best_cost,
+                                  double* best_bits,
+                                  vector2d_t *best_mv)
 {
   // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
-  check_mv_cost(info, 0, 0);
+  check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv);
 
   // Change to integer precision.
   extra_mv.x >>= INTERNAL_MV_PREC;
@@ -312,7 +309,7 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv
 
   // Check mv_in if it's not one of the merge candidates.
   if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) {
-    check_mv_cost(info, extra_mv.x, extra_mv.y);
+    check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv);
   }
 
   // Go through candidates
@@ -324,49 +321,26 @@ static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv
 
     if (x == 0 && y == 0) continue;
 
-    check_mv_cost(info, x, y);
+    check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
   }
 }
 
 
-static uint32_t get_mvd_coding_cost(const encoder_state_t *state,
-                                    const cabac_data_t* cabac,
-                                    const int32_t mvd_hor,
-                                    const int32_t mvd_ver)
+static double get_mvd_coding_cost(const encoder_state_t* state,
+  const cabac_data_t* cabac,
+  const int32_t mvd_hor,
+  const int32_t mvd_ver)
 {
-  unsigned bitcost = 0;
-
-  const int8_t hor_abs_gr0 = mvd_hor != 0;
-  const int8_t ver_abs_gr0 = mvd_ver != 0;
-  const uint32_t mvd_hor_abs = abs(mvd_hor);
-  const uint32_t mvd_ver_abs = abs(mvd_ver);
-
-  bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], (mvd_hor != 0));
-  bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], (mvd_ver != 0));
-
-  if (hor_abs_gr0) {
-    bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], (mvd_hor_abs > 1));
-  }
-  if (ver_abs_gr0) {
-    bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], (mvd_ver_abs > 1));
-  }
-
-  if (hor_abs_gr0) {
-    if (mvd_hor_abs > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs - 2) << CTX_FRAC_BITS;
-    }
-    bitcost += CTX_FRAC_ONE_BIT;
-  }
-  if (ver_abs_gr0) {
-    if (mvd_ver_abs > 1) {
-      bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs - 2) << CTX_FRAC_BITS;
-    }
-    bitcost += CTX_FRAC_ONE_BIT;
-  }
+  double bitcost = 4 << CTX_FRAC_BITS;
+  const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) };
+  bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS));
+  bitcost += abs_mvd.y == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS));
 
+  bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS;
+  bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS;
 
-  // Round and shift back to integer bits.
+  // Scale back from CTX_FRAC_BITS fixed point; the fraction is kept (no rounding).
-  return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS;
+  return bitcost / (1 << CTX_FRAC_BITS);
 }
 
 
@@ -374,7 +348,7 @@ static int select_mv_cand(const encoder_state_t *state,
                           mv_t mv_cand[2][2],
                           int32_t mv_x,
                           int32_t mv_y,
-                          uint32_t *cost_out)
+                          double*cost_out)
 {
   const bool same_cand =
     (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]);
@@ -384,7 +358,7 @@ static int select_mv_cand(const encoder_state_t *state,
     return 0;
   }
 
-  uint32_t (*mvd_coding_cost)(const encoder_state_t * const state,
+  double (*mvd_coding_cost)(const encoder_state_t * const state,
                               const cabac_data_t*,
                               int32_t, int32_t);
   if (state->encoder_control->cfg.mv_rdo) {
@@ -397,12 +371,12 @@ static int select_mv_cand(const encoder_state_t *state,
 
   uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd);
 
-  uint32_t cand1_cost = mvd_coding_cost(
+  double cand1_cost = mvd_coding_cost(
       state, &state->cabac,
       mvd.x,
       mvd.y);
 
-  uint32_t cand2_cost;
+  double cand2_cost;
   if (same_cand) {
     cand2_cost = cand1_cost;
   } else {
@@ -423,17 +397,17 @@ static int select_mv_cand(const encoder_state_t *state,
 }
 
 
-static uint32_t calc_mvd_cost(const encoder_state_t *state,
-                              int x,
-                              int y,
-                              int mv_shift,
-                              mv_t mv_cand[2][2],
-                              inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                              int16_t num_cand,
-                              int32_t ref_idx,
-                              uint32_t *bitcost)
+static double calc_mvd_cost(const encoder_state_t *state,
+                            int x,
+                            int y,
+                            int mv_shift,
+                            mv_t mv_cand[2][2],
+                            inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                            int16_t num_cand,
+                            int32_t ref_idx,
+                            double* bitcost)
 {
-  uint32_t temp_bitcost = 0;
+  double temp_bitcost = 0;
   uint32_t merge_idx;
   int8_t merged      = 0;
 
@@ -456,23 +430,26 @@ static uint32_t calc_mvd_cost(const encoder_state_t *state,
 
   // Check mvd cost only if mv is not merged
   if (!merged) {
-    uint32_t mvd_cost = 0;
+    double mvd_cost = 0;
     select_mv_cand(state, mv_cand, x, y, &mvd_cost);
     temp_bitcost += mvd_cost;
   }
   *bitcost = temp_bitcost;
-  return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5);
+  return temp_bitcost * state->lambda_sqrt;
 }
 
 
-static bool early_terminate(inter_search_info_t *info)
+static bool early_terminate(inter_search_info_t *info,
+                            double *best_cost,
+                            double* best_bits,
+                            vector2d_t *best_mv)
 {
   static const vector2d_t small_hexbs[7] = {
       { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 },
       { 0, -1 }, { -1, 0 }, { 0, 0 },
   };
 
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   int first_index = 0;
   int last_index = 3;
@@ -482,9 +459,9 @@ static bool early_terminate(inter_search_info_t *info)
     if (info->state->encoder_control->cfg.me_early_termination ==
         UVG_ME_EARLY_TERMINATION_SENSITIVE)
     {
-      threshold = info->best_cost * 0.95;
+      threshold = *best_cost * 0.95;
     } else {
-      threshold = info->best_cost;
+      threshold = *best_cost;
     }
 
     int best_index = 6;
@@ -492,7 +469,7 @@ static bool early_terminate(inter_search_info_t *info)
       int x = mv.x + small_hexbs[i].x;
       int y = mv.y + small_hexbs[i].y;
 
-      if (check_mv_cost(info, x, y)) {
+      if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) {
         best_index = i;
       }
     }
@@ -502,7 +479,7 @@ static bool early_terminate(inter_search_info_t *info)
     mv.y += small_hexbs[best_index].y;
 
     // If best match is not better than threshold, we stop the search.
-    if (info->best_cost >= threshold) {
+    if (*best_cost >= threshold) {
       return true;
     }
 
@@ -517,7 +494,10 @@ void uvg_tz_pattern_search(inter_search_info_t *info,
                            unsigned pattern_type,
                            const int iDist,
                            vector2d_t mv,
-                           int *best_dist)
+                           int *best_dist,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   assert(pattern_type < 4);
 
@@ -619,7 +599,7 @@ void uvg_tz_pattern_search(inter_search_info_t *info,
     int x = mv.x + offset.x;
     int y = mv.y + offset.y;
 
-    if (check_mv_cost(info, x, y)) {
+    if (check_mv_cost(info, x, y, best_cost, best_bits, best_mv)) {
       best_index = i;
     }
   }
@@ -632,20 +612,27 @@ void uvg_tz_pattern_search(inter_search_info_t *info,
 
 void uvg_tz_raster_search(inter_search_info_t *info,
                           int iSearchRange,
-                          int iRaster)
+                          int iRaster,
+                          double *best_cost,
+                          double* best_bits,
+                          vector2d_t *best_mv)
 {
-  const vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  const vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   //compute SAD values for every point in the iRaster downsampled version of the current search area
   for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) {
     for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) {
-      check_mv_cost(info, mv.x + x, mv.y + y);
+      check_mv_cost(info, mv.x + x, mv.y + y, best_cost, best_bits, best_mv);
     }
   }
 }
 
 
-static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
+static void tz_search(inter_search_info_t *info,
+                      vector2d_t extra_mv,
+                      double *best_cost,
+                      double* best_bits,
+                      vector2d_t *best_mv)
 {
   //TZ parameters
   const int iSearchRange = 96;  // search range for each stage
@@ -657,25 +644,13 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
   const bool use_star_refinement = true;   // enable step 4 mode 2 (only one mode will be executed)
 
   int best_dist = 0;
-  info->best_cost = UINT32_MAX;
-
-  // Select starting point from among merge candidates. These should
-  // include both mv_cand vectors and (0, 0).
-  select_starting_point(info, extra_mv);
-
-  // Check if we should stop search
-  if (info->state->encoder_control->cfg.me_early_termination &&
-      early_terminate(info))
-  {
-    return;
-  }
-
-  vector2d_t start = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  
+  vector2d_t start = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   // step 2, grid search
   int rounds_without_improvement = 0;
   for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) {
-    uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist);
+    uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
 
     // Break the loop if the last three rounds didn't produce a better MV.
     if (best_dist != iDist) rounds_without_improvement++;
@@ -688,7 +663,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
     start.y = 0;
     rounds_without_improvement = 0;
     for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) {
-      uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist);
+      uvg_tz_pattern_search(info, step2_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
 
       if (best_dist != iDist) rounds_without_improvement++;
       if (rounds_without_improvement >= 3) break;
@@ -698,7 +673,7 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
   //step 3, raster scan
   if (use_raster_scan && best_dist > iRaster) {
     best_dist = iRaster;
-    uvg_tz_raster_search(info, iSearchRange, iRaster);
+    uvg_tz_raster_search(info, iSearchRange, iRaster, best_cost, best_bits, best_mv);
   }
 
   //step 4
@@ -706,19 +681,19 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
   //raster refinement
   if (use_raster_refinement && best_dist > 0) {
     for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) {
-      start.x = info->best_mv.x >> INTERNAL_MV_PREC;
-      start.y = info->best_mv.y >> INTERNAL_MV_PREC;
-      uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist);
+      start.x = best_mv->x >> INTERNAL_MV_PREC;
+      start.y = best_mv->y >> INTERNAL_MV_PREC;
+      uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
     }
   }
 
   //star refinement (repeat step 2 for the current starting point)
   while (use_star_refinement && best_dist > 0) {
     best_dist = 0;
-    start.x = info->best_mv.x >> INTERNAL_MV_PREC;
-    start.y = info->best_mv.y >> INTERNAL_MV_PREC;
+    start.x = best_mv->x >> INTERNAL_MV_PREC;
+    start.y = best_mv->y >> INTERNAL_MV_PREC;
     for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) {
-      uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist);
+      uvg_tz_pattern_search(info, step4_type, iDist, start, &best_dist, best_cost, best_bits, best_mv);
     }
   }
 }
@@ -740,7 +715,12 @@ static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
  * the predicted motion vector is way off. In the future even more additional
  * points like 0,0 might be used, such as vectors from top or left.
  */
-static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps)
+static void hexagon_search(inter_search_info_t *info,
+                           vector2d_t extra_mv,
+                           uint32_t steps,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   // The start of the hexagonal pattern has been repeated at the end so that
   // the indices between 1-6 can be used as the start of a 3-point list of new
@@ -765,27 +745,14 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
       { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 }
   };
 
-  info->best_cost = UINT32_MAX;
-
-  // Select starting point from among merge candidates. These should
-  // include both mv_cand vectors and (0, 0).
-  select_starting_point(info, extra_mv);
-
-  // Check if we should stop search
-  if (info->state->encoder_control->cfg.me_early_termination &&
-      early_terminate(info))
-  {
-    return;
-  }
-
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   // Current best index, either to merge_cands, large_hexbs or small_hexbs.
   int best_index = 0;
 
   // Search the initial 7 points of the hexagon.
   for (int i = 1; i < 7; ++i) {
-    if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) {
+    if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y, best_cost, best_bits, best_mv)) {
       best_index = i;
     }
   }
@@ -814,7 +781,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
     // Iterate through the next 3 points.
     for (int i = 0; i < 3; ++i) {
       vector2d_t offset = large_hexbs[start + i];
-      if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) {
+      if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y, best_cost, best_bits, best_mv)) {
         best_index = start + i;
       }
     }
@@ -826,7 +793,7 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
 
   // Do the final step of the search with a small pattern.
   for (int i = 1; i < 9; ++i) {
-    check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y);
+    check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y, best_cost, best_bits, best_mv);
   }
 }
 
@@ -846,7 +813,12 @@ static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
 * the predicted motion vector is way off. In the future even more additional
 * points like 0,0 might be used, such as vectors from top or left.
 **/
-static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) 
+static void diamond_search(inter_search_info_t *info,
+                           vector2d_t extra_mv,
+                           uint32_t steps,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   enum diapos {
     DIA_UP = 0,
@@ -864,29 +836,16 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
     {0, -1}, {1, 0}, {0, 1}, {-1, 0},
     {0, 0}
   };
-
-  info->best_cost = UINT32_MAX;
-
-  // Select starting point from among merge candidates. These should
-  // include both mv_cand vectors and (0, 0).
-  select_starting_point(info, extra_mv);
-
-  // Check if we should stop search
-  if (info->state->encoder_control->cfg.me_early_termination &&
-    early_terminate(info))
-  {
-    return;
-  }
   
   // current motion vector
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
   // current best index
   enum diapos best_index = DIA_CENTER;
 
   // initial search of the points of the diamond
   for (int i = 0; i < 5; ++i) {
-    if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) {
+    if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) {
       best_index = i;
     }
   }
@@ -916,7 +875,7 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
       // this is where we came from so it's checked already
       if (i == from_dir) continue;
 
-      if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) {
+      if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y, best_cost, best_bits, best_mv)) {
         best_index = i;
         better_found = 1;
       }
@@ -938,12 +897,15 @@ static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint3
 
 static void search_mv_full(inter_search_info_t *info,
                            int32_t search_range,
-                           vector2d_t extra_mv)
+                           vector2d_t extra_mv,
+                           double *best_cost,
+                           double* best_bits,
+                           vector2d_t *best_mv)
 {
   // Search around the 0-vector.
   for (int y = -search_range; y <= search_range; y++) {
     for (int x = -search_range; x <= search_range; x++) {
-      check_mv_cost(info, x, y);
+      check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
     }
   }
 
@@ -955,7 +917,7 @@ static void search_mv_full(inter_search_info_t *info,
   if (!mv_in_merge(info, extra_mv)) {
     for (int y = -search_range; y <= search_range; y++) {
       for (int x = -search_range; x <= search_range; x++) {
-        check_mv_cost(info, extra_mv.x + x, extra_mv.y + y);
+        check_mv_cost(info, extra_mv.x + x, extra_mv.y + y, best_cost, best_bits, best_mv);
       }
     }
   }
@@ -1002,7 +964,7 @@ static void search_mv_full(inter_search_info_t *info,
         }
         if (already_tested) continue;
 
-        check_mv_cost(info, x, y);
+        check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
       }
     }
   }
@@ -1015,7 +977,10 @@ static void search_mv_full(inter_search_info_t *info,
  * Algoritm first searches 1/2-pel positions around integer mv and after best match is found,
  * refines the search by searching best 1/4-pel postion around best 1/2-pel position.
  */
-static void search_frac(inter_search_info_t *info)
+static void search_frac(inter_search_info_t *info,
+                        double *best_cost,
+                        double *best_bits,
+                        vector2d_t *best_mv)
 {
   // Map indexes to relative coordinates in the following way:
   // 5 3 6
@@ -1028,13 +993,14 @@ static void search_frac(inter_search_info_t *info)
   };
 
   // Set mv to pixel precision
-  vector2d_t mv = { info->best_mv.x >> INTERNAL_MV_PREC, info->best_mv.y >> INTERNAL_MV_PREC };
+  vector2d_t mv = { best_mv->x >> INTERNAL_MV_PREC, best_mv->y >> INTERNAL_MV_PREC };
 
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0;
-  uint32_t bitcosts[4] = { 0 };
+  double cost = MAX_DOUBLE;
+  double bitcost = 0;
+  double bitcosts[4] = { 0 };
   unsigned best_index = 0;
 
+// Keep this as unsigned until SAD / SATD functions are updated
   unsigned costs[4] = { 0 };
 
   ALIGNED(64) uvg_pixel filtered[4][LCU_LUMA_SIZE];
@@ -1100,12 +1066,12 @@ static void search_frac(inter_search_info_t *info)
   costs[0] += info->mvd_cost_func(state,
                                   mv.x, mv.y, INTERNAL_MV_PREC,
                                   info->mv_cand,
-                                  info->merge_cand,
-                                  info->num_merge_cand,
+                                  NULL,
+                                  0,
                                   info->ref_idx,
                                   &bitcosts[0]);
-  best_cost = costs[0];
-  best_bitcost = bitcosts[0];
+  cost = costs[0];
+  bitcost = bitcosts[0];
   
   //Set mv to half-pixel precision
   mv.x *= 2;
@@ -1160,8 +1126,8 @@ static void search_frac(inter_search_info_t *info)
             mv.y + pattern[j]->y,
             mv_shift,
             info->mv_cand,
-            info->merge_cand,
-            info->num_merge_cand,
+            NULL,
+            0,
             info->ref_idx,
             &bitcosts[j]
         );
@@ -1169,9 +1135,9 @@ static void search_frac(inter_search_info_t *info)
     }
 
     for (int j = 0; j < 4; ++j) {
-      if (within_tile[j] && costs[j] < best_cost) {
-        best_cost = costs[j];
-        best_bitcost = bitcosts[j];
+      if (within_tile[j] && costs[j] < cost) {
+        cost = costs[j];
+        bitcost = bitcosts[j];
         best_index = i + j;
       }
     }
@@ -1201,9 +1167,38 @@ static void search_frac(inter_search_info_t *info)
   mv.x *= 1 << (INTERNAL_MV_PREC - 2);
   mv.y *= 1 << (INTERNAL_MV_PREC - 2);
 
-  info->best_mv = mv;
-  info->best_cost = best_cost;
-  info->best_bitcost = best_bitcost;
+  *best_mv = mv;
+  *best_cost = cost;
+  *best_bits = bitcost;
+}
+
+/**
+ * \brief Derive the skip-flag context index from the left and top neighbours.
+ *
+ * Neighbours are looked up in \p lcu when it is non-NULL, otherwise in \p cu_a;
+ * supplying both at once trips the assert.
+ *
+ * \param x             x-coordinate of the block
+ * \param y             y-coordinate of the block
+ * \param lcu           LCU to read neighbours from, or NULL
+ * \param cu_a          CU array used for the lookup when lcu is NULL
+ * \param predmode_ctx  if non-NULL, receives 1 when either neighbour is intra, else 0
+ * \return              number of available neighbours that are skipped (0..2)
+ */
+int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx) {
+  assert(!(lcu && cu_a));
+  const cu_info_t *left = NULL;
+  const cu_info_t *above = NULL;
+  if (lcu != NULL) {
+    const int lx = SUB_SCU(x), ly = SUB_SCU(y);
+    if (x != 0) left  = LCU_GET_CU_AT_PX(lcu, lx - 1, ly);
+    if (y != 0) above = LCU_GET_CU_AT_PX(lcu, lx, ly - 1);
+  } else {
+    if (x > 0) left  = uvg_cu_array_at_const(cu_a, x - 1, y);
+    if (y > 0) above = uvg_cu_array_at_const(cu_a, x, y - 1);
+  }
+  if (predmode_ctx != NULL) *predmode_ctx = (left && left->type == CU_INTRA) || (above && above->type == CU_INTRA);
+  return (left != NULL && left->skipped) + (above != NULL && above->skipped);
 }
 
 /**
@@ -1251,46 +1246,37 @@ static void apply_mv_scaling(int32_t current_poc,
  */
 static void search_pu_inter_ref(inter_search_info_t *info,
   int depth,
-  lcu_t *lcu, cu_info_t *cur_cu,
-  double *inter_cost,
-  uint32_t *inter_bitcost,
-  double *best_LX_cost,
-  cu_info_t *unipred_LX)
+  lcu_t *lcu,
+  cu_info_t *cur_cu,
+  unit_stats_map_t *amvp)
 {
   const uvg_config *cfg = &info->state->encoder_control->cfg;
 
-  // which list, L0 or L1, ref_idx is in and in what index
-  int8_t ref_list = -1;
-  // the index of the ref_idx in L0 or L1 list
-  int8_t LX_idx;
-  // max value of LX_idx plus one
-  const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0],
-    info->state->frame->ref_LX_size[1]);
+  // Reference picture might be in both lists
+  bool ref_list_active[2] = { false, false };
+  // Reference picture indices in L0 and L1 lists
+  int8_t ref_list_idx[2] = { -1, -1 };
 
-  for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++)
-  {
-    // check if ref_idx is in L0
-    if (LX_idx < info->state->frame->ref_LX_size[0] &&
-      info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) {
-      ref_list = 0;
-      break;
-    }
-
-    // check if ref_idx is in L1
-    if (LX_idx < info->state->frame->ref_LX_size[1] &&
-      info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) {
-      ref_list = 1;
-      break;
+  // Check if ref picture is present in the lists
+  for (int ref_list = 0; ref_list < 2; ++ref_list) {
+    for (int i = 0; i < info->state->frame->ref_LX_size[ref_list]; ++i) {
+      if (info->state->frame->ref_LX[ref_list][i] == info->ref_idx) {
+        ref_list_active[ref_list] = true;
+        ref_list_idx[ref_list] = i;
+        break;
+      }
     }
   }
-  // ref_idx has to be found in either L0 or L1
-  assert(LX_idx < LX_IDX_MAX_PLUS_1);
 
-  // store temp values to be stored back later
-  int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
+  // Must find at least one reference picture
+  assert(ref_list_active[0] || ref_list_active[1]);
+
+  // Does not matter which list is used, if in both.
+  int ref_list = ref_list_active[0] ? 0 : 1;
+  int LX_idx = ref_list_idx[ref_list];
 
   // Get MV candidates
-  cur_cu->inter.mv_ref[ref_list] = LX_idx;
+  cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list];
 
   uvg_inter_get_mv_cand(info->state,
     info->origin.x,
@@ -1302,10 +1288,7 @@ static void search_pu_inter_ref(inter_search_info_t *info,
     lcu,
     ref_list);
 
-  // store old values back
-  cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
-
-  vector2d_t mv = { 0, 0 };
+  vector2d_t best_mv = { 0, 0 };
 
   // Take starting point for MV search from previous frame.
   // When temporal motion vector candidates are added, there is probably
@@ -1319,8 +1302,7 @@ static void search_pu_inter_ref(inter_search_info_t *info,
     if (ref_cu->inter.mv_dir & 1) {
       mv_previous.x = ref_cu->inter.mv[0][0];
       mv_previous.y = ref_cu->inter.mv[0][1];
-    }
-    else {
+    } else {
       mv_previous.x = ref_cu->inter.mv[1][0];
       mv_previous.y = ref_cu->inter.mv[1][1];
     }
@@ -1353,16 +1335,16 @@ static void search_pu_inter_ref(inter_search_info_t *info,
         info->state->frame->ref->pocs[neighbor_poc_index],
         info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[
           info->state->frame->ref->ref_LXs[neighbor_poc_index]
-          [col_list]
+            [col_list]
           [ref_cu->inter.mv_ref[col_list]]
         ],
         &mv_previous
-      );
+          );
     }
 
     // Check if the mv is valid after scaling
     if (fracmv_within_tile(info, mv_previous.x, mv_previous.y)) {
-      mv = mv_previous;
+      best_mv = mv_previous;
     }
   }
 
@@ -1375,102 +1357,90 @@ static void search_pu_inter_ref(inter_search_info_t *info,
     default: break;
   }
 
-  info->best_cost = UINT32_MAX;
+  double best_cost = MAX_DOUBLE;
+  double best_bits = MAX_INT;
 
-  switch (cfg->ime_algorithm) {
-    case UVG_IME_TZ:
-      tz_search(info, mv);
-      break;
+  // Select starting point from among merge candidates. These should
+  // include both mv_cand vectors and (0, 0).
+  select_starting_point(info, best_mv, &best_cost, &best_bits, &best_mv);
+  bool skip_me = early_terminate(info, &best_cost, &best_bits, &best_mv);
+      
+  if (!(info->state->encoder_control->cfg.me_early_termination && skip_me)) {
 
-    case UVG_IME_FULL64:
-    case UVG_IME_FULL32:
-    case UVG_IME_FULL16:
-    case UVG_IME_FULL8:
-    case UVG_IME_FULL:
-      search_mv_full(info, search_range, mv);
-      break;
+    switch (cfg->ime_algorithm) {
+      case UVG_IME_TZ:
+        tz_search(info, best_mv, &best_cost, &best_bits, &best_mv);
+        break;
 
-    case UVG_IME_DIA:
-      diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps);
-      break;
+      case UVG_IME_FULL64:
+      case UVG_IME_FULL32:
+      case UVG_IME_FULL16:
+      case UVG_IME_FULL8:
+      case UVG_IME_FULL:
+        search_mv_full(info, search_range, best_mv, &best_cost, &best_bits, &best_mv);
+        break;
 
-    default:
-      hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps);
-      break;
-  }
+      case UVG_IME_DIA:
+        diamond_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps,
+                       &best_cost, &best_bits, &best_mv);
+        break;
 
-  if (cfg->fme_level > 0 && info->best_cost < *inter_cost) {
-    search_frac(info);
-
-  } else if (info->best_cost < UINT32_MAX) {
-    // Recalculate inter cost with SATD.
-    info->best_cost = uvg_image_calc_satd(
-        info->state->tile->frame->source,
-        info->ref,
-        info->origin.x,
-        info->origin.y,
-        info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> INTERNAL_MV_PREC),
-        info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> INTERNAL_MV_PREC),
-        info->width,
-        info->height);
-    info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5);
-  }
-
-  mv = info->best_mv;
-
-  int merged = 0;
-  int merge_idx = 0;
-  // Check every candidate to find a match
-  for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) {
-    if (info->merge_cand[merge_idx].dir != 3 &&
-        info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][0] == mv.x &&
-        info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y &&
-        (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][
-        info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx)
-    {
-      merged = 1;
-      break;
+      default:
+        hexagon_search(info, best_mv, info->state->encoder_control->cfg.me_max_steps,
+                       &best_cost, &best_bits, &best_mv);
+        break;
     }
   }
 
-  // Only check when candidates are different
-  int cu_mv_cand = 0;
-  if (!merged) {
-    cu_mv_cand =
-      select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL);
+  if (cfg->fme_level == 0 && best_cost < MAX_DOUBLE) {
+    // Recalculate inter cost with SATD.
+    best_cost = uvg_image_calc_satd(
+      info->state->tile->frame->source,
+      info->ref,
+      info->origin.x,
+      info->origin.y,
+      info->state->tile->offset_x + info->origin.x + (best_mv.x >> INTERNAL_MV_PREC),
+      info->state->tile->offset_y + info->origin.y + (best_mv.y >> INTERNAL_MV_PREC),
+      info->width,
+      info->height);
+    best_cost += best_bits * info->state->lambda_sqrt;
   }
 
-  if (info->best_cost < *inter_cost) {
-    // Map reference index to L0/L1 pictures
-    cur_cu->inter.mv_dir = ref_list+1;
+  double LX_cost[2] = { best_cost, best_cost };
+  double LX_bits[2] = { best_bits, best_bits };
+
+  // Compute costs and add entries for both lists, if necessary
+  for (; ref_list < 2 && ref_list_active[ref_list]; ++ref_list) {
+
+    LX_idx = ref_list_idx[ref_list];
     uint8_t mv_ref_coded = LX_idx;
+    int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, best_mv.x, best_mv.y, NULL);
+    const int extra_bits = ref_list + mv_ref_coded; // TODO: check if mv_dir bits are missing
+    LX_cost[ref_list] += extra_bits * info->state->lambda_sqrt;
+    LX_bits[ref_list] += extra_bits;
 
-    cur_cu->merged                  = merged;
-    cur_cu->merge_idx               = merge_idx;
-    cur_cu->inter.mv_ref[ref_list]  = LX_idx;
-    cur_cu->inter.mv[ref_list][0]   = (mv_t)mv.x;
-    cur_cu->inter.mv[ref_list][1]   = (mv_t)mv.y;
+    // Update best unipreds for biprediction
+    bool valid_mv = fracmv_within_tile(info, best_mv.x, best_mv.y);
+    if (valid_mv && best_cost < MAX_DOUBLE) {
 
-    CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand);
-
-    *inter_cost = info->best_cost;
-    *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded;
-  }
-
-
-  // Update best unipreds for biprediction
-  if (info->best_cost < best_LX_cost[ref_list]) {
-    bool valid_mv = fracmv_within_tile(info, mv.x, mv.y);
-    if (valid_mv) {
       // Map reference index to L0/L1 pictures
-      unipred_LX[ref_list].inter.mv_dir = ref_list + 1;
-      unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx;
-      unipred_LX[ref_list].inter.mv[ref_list][0] = (mv_t)mv.x;
-      unipred_LX[ref_list].inter.mv[ref_list][1] = (mv_t)mv.y;
+      unit_stats_map_t *cur_map = &amvp[ref_list];
+      int entry = cur_map->size;
+      cu_info_t *unipred_pu = &cur_map->unit[entry];
+      *unipred_pu = *cur_cu;
+      unipred_pu->type = CU_INTER;
+      unipred_pu->merged  = false;
+      unipred_pu->skipped = false;
+      unipred_pu->inter.mv_dir = ref_list + 1;
+      unipred_pu->inter.mv_ref[ref_list] = LX_idx;
+      unipred_pu->inter.mv[ref_list][0] = (mv_t)best_mv.x;
+      unipred_pu->inter.mv[ref_list][1] = (mv_t)best_mv.y;
+      CU_SET_MV_CAND(unipred_pu, ref_list, cu_mv_cand);
 
-      CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand);
-
-      best_LX_cost[ref_list] = info->best_cost;
+      cur_map->cost[entry] = best_cost;
+      cur_map->bits[entry] = best_bits;
+      cur_map->keys[entry] = entry;
+      cur_map->size++;
     }
   }
 }
@@ -1481,9 +1451,8 @@ static void search_pu_inter_ref(inter_search_info_t *info,
  */
 static void search_pu_inter_bipred(inter_search_info_t *info,
                                    int depth,
-                                   lcu_t *lcu, cu_info_t *cur_cu,
-                                   double *inter_cost,
-                                   uint32_t *inter_bitcost)
+                                   lcu_t *lcu,
+                                   unit_stats_map_t *amvp_bipred)
 {
   const image_list_t *const ref = info->state->frame->ref;
   uint8_t (*ref_LX)[16] = info->state->frame->ref_LX;
@@ -1515,11 +1484,26 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
       continue;
     }
 
-    mv_t mv[2][2];
+    cu_info_t *bipred_pu = &amvp_bipred->unit[amvp_bipred->size];
+    *bipred_pu = *LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+
+    bipred_pu->inter.mv_dir = 3;
+
+    bipred_pu->inter.mv_ref[0] = merge_cand[i].ref[0];
+    bipred_pu->inter.mv_ref[1] = merge_cand[j].ref[1];
+
+    int16_t(*mv)[2] = bipred_pu->inter.mv;
     mv[0][0] = merge_cand[i].mv[0][0];
     mv[0][1] = merge_cand[i].mv[0][1];
     mv[1][0] = merge_cand[j].mv[1][0];
     mv[1][1] = merge_cand[j].mv[1][1];
+    
+    bipred_pu->merged  = false;
+    bipred_pu->skipped = false;
+
+    for (int reflist = 0; reflist < 2; reflist++) {
+      uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
+    }
 
     // Don't try merge candidates that don't satisfy mv constraints.
     if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) ||
@@ -1541,10 +1525,10 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
 
     const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
     const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride];
-    uint32_t cost =
+    double cost =
       uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->stride);
 
-    uint32_t bitcost[2] = { 0, 0 };
+    double bitcost[2] = { 0, 0 };
 
     cost += info->mvd_cost_func(info->state,
                                merge_cand[i].mv[0][0],
@@ -1566,51 +1550,25 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
       merge_cand[j].ref[1]
     };
     const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */;
-    cost += info->state->lambda_sqrt * extra_bits + 0.5;
+    cost += info->state->lambda_sqrt * extra_bits;
 
-    if (cost < *inter_cost) {
-      cur_cu->inter.mv_dir = 3;
-
-      cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
-      cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
-
-      cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0];
-      cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1];
-      cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0];
-      cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1];
-      cur_cu->merged = 0;
-
-      // Check every candidate to find a match
-      for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) {
-        if (merge_cand[merge_idx].dir != 3) continue;
-        if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
-            merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
-            merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
-            merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
-            merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
-            merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1])
-        {
-          cur_cu->merged = 1;
-          cur_cu->merge_idx = merge_idx;
-          break;
-        }
-      }
-
-      // Each motion vector has its own candidate
-      for (int reflist = 0; reflist < 2; reflist++) {
-        uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist);
-        int cu_mv_cand = select_mv_cand(
-            info->state,
-            info->mv_cand,
-            cur_cu->inter.mv[reflist][0],
-            cur_cu->inter.mv[reflist][1],
-            NULL);
-        CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand);
-      }
-
-      *inter_cost = cost;
-      *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits;
+    // Each motion vector has its own candidate
+    for (int reflist = 0; reflist < 2; reflist++) {
+      int cu_mv_cand = select_mv_cand(
+        info->state,
+        info->mv_cand,
+        bipred_pu->inter.mv[reflist][0],
+        bipred_pu->inter.mv[reflist][1],
+        NULL);
+      CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand);
     }
+
+    bipred_pu->type = CU_INTER;
+
+    amvp_bipred->cost[amvp_bipred->size] = cost;
+    amvp_bipred->bits[amvp_bipred->size] = bitcost[0] + bitcost[1] + extra_bits;
+    amvp_bipred->keys[amvp_bipred->size] = amvp_bipred->size;
+    amvp_bipred->size++;
   }
 }
 
@@ -1624,14 +1582,14 @@ static void search_pu_inter_bipred(inter_search_info_t *info,
  *
  * \return                Does an identical candidate exist in list
  */
-static bool merge_candidate_in_list(inter_merge_cand_t * all_cands,
-                                    inter_merge_cand_t * cand_to_add,
-                                    int8_t * added_idx_list,
-                                    int list_size)
+static bool merge_candidate_in_list(inter_merge_cand_t *all_cands,
+                                    inter_merge_cand_t *cand_to_add,
+                                    unit_stats_map_t *merge)
 {
   bool found = false;
-  for (int i = 0; i < list_size && !found; ++i) {
-    inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]];
+  for (int i = 0; i < merge->size && !found; ++i) {
+    int key = merge->keys[i];
+    inter_merge_cand_t * list_cand = &all_cands[merge->unit[key].merge_idx];
 
     found = cand_to_add->dir == list_cand->dir &&
         cand_to_add->ref[0] == list_cand->ref[0] &&
@@ -1646,7 +1604,7 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands,
 }
 
 /**
- * \brief Update PU to have best modes at this depth.
+ * \brief Collect PU parameters and costs at this depth.
  *
  * \param state       encoder state
  * \param x_cu        x-coordinate of the containing CU
@@ -1656,28 +1614,26 @@ static bool merge_candidate_in_list(inter_merge_cand_t * all_cands,
  * \param i_pu        index of the PU in the CU
  * \param lcu         containing LCU
  *
- * \param inter_cost    Return inter cost of the best mode
- * \param inter_bitcost Return inter bitcost of the best mode
+ * \param amvp        Return searched AMVP PUs sorted by costs
+ * \param merge       Return searched Merge PUs sorted by costs
  */
 static void search_pu_inter(encoder_state_t * const state,
-                            int x_cu, int y_cu,
-                            int depth,
-                            part_mode_t part_mode,
-                            int i_pu,
-                            lcu_t *lcu,
-                            double *inter_cost,
-                            uint32_t *inter_bitcost)
+  int x_cu, int y_cu,
+  int depth,
+  part_mode_t part_mode,
+  int i_pu,
+  lcu_t *lcu,
+  unit_stats_map_t *amvp,
+  unit_stats_map_t *merge,
+  inter_search_info_t *info)
 {
-  *inter_cost = MAX_INT;
-  *inter_bitcost = MAX_INT;
-
   const uvg_config *cfg = &state->encoder_control->cfg;
   const videoframe_t * const frame = state->tile->frame;
-  const int width_cu  = LCU_WIDTH >> depth;
-  const int x         = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
-  const int y         = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
-  const int width     = PU_GET_W(part_mode, width_cu, i_pu);
-  const int height    = PU_GET_H(part_mode, width_cu, i_pu);
+  const int width_cu = LCU_WIDTH >> depth;
+  const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
+  const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu);
+  const int width = PU_GET_W(part_mode, width_cu, i_pu);
+  const int height = PU_GET_H(part_mode, width_cu, i_pu);
 
   // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and
   // nRx2N partitions.
@@ -1686,129 +1642,162 @@ static void search_pu_inter(encoder_state_t * const state,
   // 2NxnD partitions.
   const bool merge_b1 = i_pu == 0 || width <= height;
 
-  const int x_local   = SUB_SCU(x);
-  const int y_local   = SUB_SCU(y);
-  cu_info_t *cur_cu   = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
+  const int x_local = SUB_SCU(x);
+  const int y_local = SUB_SCU(y);
+  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
+  cur_pu->type = CU_NOTSET;
+  cur_pu->part_size = part_mode;
+  cur_pu->depth = depth;
+  cur_pu->qp = state->qp;
 
-  inter_search_info_t info = {
-    .state          = state,
-    .pic            = frame->source,
-    .origin         = { x, y },
-    .width          = width,
-    .height         = height,
-    .mvd_cost_func  = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost,
-    .optimized_sad  = uvg_get_optimized_sad(width),
-  };
+  // Default to candidate 0
+  CU_SET_MV_CAND(cur_pu, 0, 0);
+  CU_SET_MV_CAND(cur_pu, 1, 0);
+
+  FILL(*info, 0);
+
+  info->state          = state;
+  info->pic            = frame->source;
+  info->origin.x       = x;
+  info->origin.y       = y;
+  info->width          = width;
+  info->height         = height;
+  info->mvd_cost_func  = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost;
+  info->optimized_sad  = uvg_get_optimized_sad(width);
 
   // Search for merge mode candidates
-  info.num_merge_cand = uvg_inter_get_merge_cand(
+  info->num_merge_cand = uvg_inter_get_merge_cand(
       state,
       x, y,
       width, height,
       merge_a1, merge_b1,
-      info.merge_cand,
+      info->merge_cand,
       lcu
   );
 
-  // Default to candidate 0
-  CU_SET_MV_CAND(cur_cu, 0, 0);
-  CU_SET_MV_CAND(cur_cu, 1, 0);
-
   // Merge Analysis starts here
-  int8_t mrg_cands[MRG_MAX_NUM_CANDS];
-  double mrg_costs[MRG_MAX_NUM_CANDS];
+  merge->size = 0;
   for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) {
-    mrg_cands[i] = -1;
-    mrg_costs[i] = MAX_DOUBLE;
+    merge->keys[i] = -1;
+    merge->cost[i] = MAX_DOUBLE;
   }
 
-  int num_rdo_cands = 0;
-
+  const double merge_flag_cost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_merge_flag_ext_model, 1);
+#ifdef COMPLETE_PRED_MODE_BITS
+  // Technically counting these bits would be correct, however counting
+  // them universally degrades quality so this block is disabled by default
+  const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0);
+#else
+  const double no_skip_flag = 0;
+#endif
   // Check motion vector constraints and perform rough search
-  for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) {    
-    inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx];
+  for (int merge_idx = 0; merge_idx < info->num_merge_cand; ++merge_idx) {
 
-    cur_cu->inter.mv_dir = cur_cand->dir;
-    cur_cu->inter.mv_ref[0] = cur_cand->ref[0];
-    cur_cu->inter.mv_ref[1] = cur_cand->ref[1];
-    cur_cu->inter.mv[0][0] = cur_cand->mv[0][0];
-    cur_cu->inter.mv[0][1] = cur_cand->mv[0][1];
-    cur_cu->inter.mv[1][0] = cur_cand->mv[1][0];
-    cur_cu->inter.mv[1][1] = cur_cand->mv[1][1];
+    inter_merge_cand_t *cur_cand = &info->merge_cand[merge_idx];
+    cur_pu->inter.mv_dir = cur_cand->dir;
+    cur_pu->inter.mv_ref[0] = cur_cand->ref[0];
+    cur_pu->inter.mv_ref[1] = cur_cand->ref[1];
+    cur_pu->inter.mv[0][0] = cur_cand->mv[0][0];
+    cur_pu->inter.mv[0][1] = cur_cand->mv[0][1];
+    cur_pu->inter.mv[1][0] = cur_cand->mv[1][0];
+    cur_pu->inter.mv[1][1] = cur_cand->mv[1][1];
 
     // If bipred is not enabled, do not try candidates with mv_dir == 3.
     // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. 
-    if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue;
-    if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue;
+    if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue;
+    if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue;
 
-    bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand,
-      mrg_cands, 
-      num_rdo_cands);
+    bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge);
 
     // Don't try merge candidates that don't satisfy mv constraints.
     // Don't add duplicates to list
-    if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) ||
-        !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) ||
+    bool active_L0 = cur_pu->inter.mv_dir & 1;
+    bool active_L1 = cur_pu->inter.mv_dir & 2;
+    if ((active_L0 && !fracmv_within_tile(info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])) ||
+        (active_L1 && !fracmv_within_tile(info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1])) ||
         is_duplicate)
     {
       continue;
     }
     uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu);
-    mrg_costs[num_rdo_cands] = uvg_satd_any_size(width, height,
-      lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
-      lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
-    
-    // Add cost of coding the merge index
-    mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt;
+    merge->unit[merge->size] = *cur_pu;
+    merge->unit[merge->size].type = CU_INTER;
+    merge->unit[merge->size].merge_idx = merge_idx;
+    merge->unit[merge->size].merged = true;
+    merge->unit[merge->size].skipped = false;
 
-    mrg_cands[num_rdo_cands] = merge_idx;
-    num_rdo_cands++;
+    double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0);
+    if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
+      uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits);
+    }
+    else {
+      merge->cost[merge->size] = uvg_satd_any_size(width, height,
+        lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH,
+        lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH);
+      bits += no_skip_flag;
+      merge->cost[merge->size] += bits * info->state->lambda_sqrt;
+    }
+    // Record the bit cost and the sort key of this candidate
+    merge->bits[merge->size] = bits;
+    merge->keys[merge->size] = merge->size;
+
+
+    merge->size++;
   }
 
-  // Sort candidates by cost
-  uvg_sort_modes(mrg_cands, mrg_costs, num_rdo_cands);
+  assert(merge->size <= MAX_UNIT_STATS_MAP_SIZE);
+  uvg_sort_keys_by_cost(merge);
 
-  // Limit by availability
-  // TODO: Do not limit to just 1
-  num_rdo_cands = MIN(1, num_rdo_cands);
+  // Try early skip decision on just one merge candidate if available
+  int num_rdo_cands = MIN(1, merge->size);
     
   // Early Skip Mode Decision
   bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
-  if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) {
-    for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) {
-
-      // Reconstruct blocks with merge candidate.
-      // Check luma CBF. Then, check chroma CBFs if luma CBF is not set
-      // and chroma exists.
-      // Early terminate if merge candidate with zero CBF is found.
-      int merge_idx = mrg_cands[merge_rdo_idx];
-      inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx];
-
-      cur_cu->inter.mv_dir    = cur_cand->dir;
-      cur_cu->inter.mv_ref[0] = cur_cand->ref[0];
-      cur_cu->inter.mv_ref[1] = cur_cand->ref[1];
-      cur_cu->inter.mv[0][0]  = cur_cand->mv[0][0];
-      cur_cu->inter.mv[0][1]  = cur_cand->mv[0][1];
-      cur_cu->inter.mv[1][0]  = cur_cand->mv[1][0];
-      cur_cu->inter.mv[1][1]  = cur_cand->mv[1][1];
-
-      uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth));
-      uvg_inter_recon_cu(state, lcu, x, y, width, true, false);
-      uvg_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true);
-
-      if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) {
-        continue;
+  if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) {
+    for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) {
+      if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) {
+        merge->size = 1;
+        merge->bits[0] = merge->bits[merge->keys[merge_key]];
+        merge->cost[0] = merge->cost[merge->keys[merge_key]];
+        merge->unit[0] = merge->unit[merge->keys[merge_key]];
+        merge->keys[0] = 0;
       }
-      else if (has_chroma) {
-        uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
-        uvg_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true);
-        if (!cbf_is_set_any(cur_cu->cbf, depth)) {
-          cur_cu->type = CU_INTER;
-          cur_cu->merge_idx = merge_idx;
-          cur_cu->skipped = true;
-          *inter_cost = 0.0;  // TODO: Check this
-          *inter_bitcost = merge_idx; // TODO: Check this
-          return;
+      else if(cfg->rdo < 2) {
+        // Reconstruct blocks with merge candidate.
+        // Check luma CBF. Then, check chroma CBFs if luma CBF is not set
+        // and chroma exists.
+        // Early terminate if merge candidate with zero CBF is found.
+        int merge_idx           = merge->unit[merge->keys[merge_key]].merge_idx;
+        cur_pu->inter.mv_dir    = info->merge_cand[merge_idx].dir;
+        cur_pu->inter.mv_ref[0] = info->merge_cand[merge_idx].ref[0];
+        cur_pu->inter.mv_ref[1] = info->merge_cand[merge_idx].ref[1];
+        cur_pu->inter.mv[0][0]  = info->merge_cand[merge_idx].mv[0][0];
+        cur_pu->inter.mv[0][1]  = info->merge_cand[merge_idx].mv[0][1];
+        cur_pu->inter.mv[1][0]  = info->merge_cand[merge_idx].mv[1][0];
+        cur_pu->inter.mv[1][1]  = info->merge_cand[merge_idx].mv[1][1];
+        uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth));
+        uvg_inter_recon_cu(state, lcu, x, y, width, true, false);
+        uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true);
+
+        if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) {
+          continue;
+        }
+        else if (has_chroma) {
+          uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma);
+          uvg_quantize_lcu_residual(state, false, has_chroma, 
+            false, /*we are only checking for lack of coeffs so no need to check jccr*/
+            x, y, depth, cur_pu, lcu, true);
+          if (!cbf_is_set_any(cur_pu->cbf, depth)) {
+            cur_pu->type = CU_INTER;
+            cur_pu->merge_idx = merge_idx;
+            cur_pu->skipped = true;
+
+            merge->size = 1;
+            merge->cost[0] = 0.0; // TODO: Check this
+            merge->bits[0] = merge_idx; // TODO: Check this
+            merge->unit[0] = *cur_pu;
+            return;
+          }
         }
       }
     }
@@ -1816,16 +1805,139 @@ static void search_pu_inter(encoder_state_t * const state,
 
   // AMVP search starts here
 
-  // Store unipred information of L0 and L1 for biprediction
-  // Best cost will be left at MAX_DOUBLE if no valid CU is found
-  double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE };
-  cu_info_t unipreds[2];
+  amvp[0].size = 0;
+  amvp[1].size = 0;
+  amvp[2].size = 0;
+
+  for (int mv_dir = 1; mv_dir < 4; ++mv_dir) {
+    for (int i = 0; i < state->frame->ref->used_size; ++i) {
+      amvp[mv_dir - 1].cost[i] = MAX_DOUBLE;
+    }
+  }
 
   for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) {
-    info.ref_idx = ref_idx;
-    info.ref = state->frame->ref->images[ref_idx];
+    info->ref_idx = ref_idx;
+    info->ref = state->frame->ref->images[ref_idx];
 
-    search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds);
+    search_pu_inter_ref(info, depth, lcu, cur_pu, amvp);
+  }
+
+  assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE);
+  assert(amvp[1].size <= MAX_UNIT_STATS_MAP_SIZE);
+  uvg_sort_keys_by_cost(&amvp[0]);
+  uvg_sort_keys_by_cost(&amvp[1]);
+
+  int best_keys[2] = { 
+    amvp[0].size > 0 ? amvp[0].keys[0] : 0, 
+    amvp[1].size > 0 ? amvp[1].keys[0] : 0
+  };
+
+  cu_info_t *best_unipred[2] = {
+    &amvp[0].unit[best_keys[0]],
+    &amvp[1].unit[best_keys[1]]
+  };
+
+  // Prevent using the same ref picture with both lists.
+  // TODO: allow searching two MVs from the same reference picture.
+  if (cfg->bipred && amvp[0].size > 0 && amvp[1].size > 0) {
+
+    uint8_t(*ref_LX)[16] = info->state->frame->ref_LX;
+
+    int L0_idx = best_unipred[0]->inter.mv_ref[0];
+    int L1_idx = best_unipred[1]->inter.mv_ref[1];
+    
+    int L0_ref_idx = ref_LX[0][L0_idx];
+    int L1_ref_idx = ref_LX[1][L1_idx];
+
+    if (L0_ref_idx == L1_ref_idx) {
+      // Invalidate the best PU of one list: L1 if L0's 2nd best cost <= L1's, else L0
+      double L0_2nd_cost = amvp[0].size > 1 ? amvp[0].cost[amvp[0].keys[1]] : MAX_DOUBLE;
+      double L1_2nd_cost = amvp[1].size > 1 ? amvp[1].cost[amvp[1].keys[1]] : MAX_DOUBLE;
+      int list = (L0_2nd_cost <= L1_2nd_cost) ? 1 : 0;
+      amvp[list].cost[best_keys[list]] = MAX_DOUBLE;
+      uvg_sort_keys_by_cost(&amvp[list]);
+      amvp[list].size--;
+      best_keys[list]    =  amvp[list].keys[0];
+      best_unipred[list] = &amvp[list].unit[best_keys[list]];
+    }
+  }
+
+  // Fractional-pixel motion estimation.
+  // Refine the best PUs so far from both lists, if available.
+  for (int list = 0; list < 2; ++list) {
+
+    // TODO: make configurable
+    int n_best = MIN(1, amvp[list].size);
+    if (cfg->fme_level > 0) {
+
+      for (int i = 0; i < n_best; ++i) {
+
+        int key = amvp[list].keys[i];
+        cu_info_t *unipred_pu = &amvp[list].unit[key];
+
+        // Find the reference picture
+        const image_list_t *const ref = info->state->frame->ref;
+        uint8_t(*ref_LX)[16] = info->state->frame->ref_LX;
+
+        int LX_idx = unipred_pu->inter.mv_ref[list];
+        info->ref_idx = ref_LX[list][LX_idx];
+        info->ref = ref->images[info->ref_idx];
+
+        uvg_inter_get_mv_cand(info->state,
+          info->origin.x,
+          info->origin.y,
+          info->width,
+          info->height,
+          info->mv_cand,
+          unipred_pu,
+          lcu,
+          list);
+
+        double     frac_cost = MAX_DOUBLE;
+        double   frac_bits = MAX_INT;
+        vector2d_t frac_mv = { unipred_pu->inter.mv[list][0], unipred_pu->inter.mv[list][1] };
+
+        search_frac(info, &frac_cost, &frac_bits, &frac_mv);
+
+        uint8_t mv_ref_coded = LX_idx;
+        int cu_mv_cand = select_mv_cand(info->state, info->mv_cand, frac_mv.x, frac_mv.y, NULL);
+        const int extra_bits = list + mv_ref_coded; // TODO: check if mv_dir bits are missing
+        frac_cost += extra_bits * info->state->lambda_sqrt;
+        frac_bits += extra_bits;
+
+        bool valid_mv = fracmv_within_tile(info, frac_mv.x, frac_mv.y);
+        if (valid_mv) {
+
+          unipred_pu->inter.mv[list][0] = frac_mv.x;
+          unipred_pu->inter.mv[list][1] = frac_mv.y;
+          CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand);
+
+          if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
+            uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits);
+          }
+
+          amvp[list].cost[key] = frac_cost;
+          amvp[list].bits[key] = frac_bits;
+        }
+      }
+
+      // Invalidate PUs with SAD-based costs. (FME not performed).
+      // TODO: Recalculate SAD costs with SATD for further processing.
+      for (int i = n_best; i < amvp[list].size; ++i) {
+        int key = amvp[list].keys[i];
+        amvp[list].cost[key] = MAX_DOUBLE;
+      }
+    }
+
+    // Costs are now SATD-based. Omit PUs with SAD-based costs.
+    // TODO: Recalculate SAD costs with SATD for further processing.
+    uvg_sort_keys_by_cost(&amvp[list]);
+    amvp[list].size = n_best;
+  }
+
+  if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) {
+    if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]);
+    if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]);
   }
 
   // Search bi-pred positions
@@ -1835,25 +1947,39 @@ static void search_pu_inter(encoder_state_t * const state,
 
   if (can_use_bipred) {
 
+    cu_info_t *bipred_pu = &amvp[2].unit[0];
+    *bipred_pu = *cur_pu;
+    double   best_bipred_cost = MAX_DOUBLE;
+
     // Try biprediction from valid acquired unipreds.
-    if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) {
+    if (amvp[0].size > 0 && amvp[1].size > 0) {
 
       // TODO: logic is copy paste from search_pu_inter_bipred.
       // Get rid of duplicate code asap.
-      const image_list_t *const ref = info.state->frame->ref;
-      uint8_t(*ref_LX)[16] = info.state->frame->ref_LX;
+      const image_list_t *const ref = info->state->frame->ref;
+      uint8_t(*ref_LX)[16] = info->state->frame->ref_LX;
 
-      inter_merge_cand_t *merge_cand = info.merge_cand;
+      bipred_pu->inter.mv_dir = 3;
 
-      mv_t mv[2][2];
-      mv[0][0] = unipreds[0].inter.mv[0][0];
-      mv[0][1] = unipreds[0].inter.mv[0][1];
-      mv[1][0] = unipreds[1].inter.mv[1][0];
-      mv[1][1] = unipreds[1].inter.mv[1][1];
+      bipred_pu->inter.mv_ref[0] = best_unipred[0]->inter.mv_ref[0];
+      bipred_pu->inter.mv_ref[1] = best_unipred[1]->inter.mv_ref[1];
 
-      uvg_inter_recon_bipred(info.state,
-        ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]],
-        ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]],
+      int16_t (*mv)[2] = bipred_pu->inter.mv;
+      mv[0][0] = best_unipred[0]->inter.mv[0][0];
+      mv[0][1] = best_unipred[0]->inter.mv[0][1];
+      mv[1][0] = best_unipred[1]->inter.mv[1][0];
+      mv[1][1] = best_unipred[1]->inter.mv[1][1];
+      
+      bipred_pu->merged  = false;
+      bipred_pu->skipped = false;
+
+      for (int reflist = 0; reflist < 2; reflist++) {
+        uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist);
+      }
+
+      uvg_inter_recon_bipred(info->state,
+        ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]],
+        ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]],
         x, y,
         width,
         height,
@@ -1864,104 +1990,77 @@ static void search_pu_inter(encoder_state_t * const state,
 
       const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
       const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)];
-      uint32_t cost =
+
+      best_bipred_cost =
         uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH);
 
-      uint32_t bitcost[2] = { 0, 0 };
+      double bitcost[2] = { 0, 0 };
 
-      cost += info.mvd_cost_func(info.state,
-        unipreds[0].inter.mv[0][0],
-        unipreds[0].inter.mv[0][1],
+      best_bipred_cost += info->mvd_cost_func(info->state,
+        bipred_pu->inter.mv[0][0],
+        bipred_pu->inter.mv[0][1],
         0,
-        info.mv_cand,
+        info->mv_cand,
         NULL, 0, 0,
         &bitcost[0]);
-      cost += info.mvd_cost_func(info.state,
-        unipreds[1].inter.mv[1][0],
-        unipreds[1].inter.mv[1][1],
+      best_bipred_cost += info->mvd_cost_func(info->state,
+        bipred_pu->inter.mv[1][0],
+        bipred_pu->inter.mv[1][1],
         0,
-        info.mv_cand,
+        info->mv_cand,
         NULL, 0, 0,
         &bitcost[1]);
 
       const uint8_t mv_ref_coded[2] = {
-        unipreds[0].inter.mv_ref[0],
-        unipreds[1].inter.mv_ref[1]
+        bipred_pu->inter.mv_ref[0],
+        bipred_pu->inter.mv_ref[1]
       };
       const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */;
-      cost += info.state->lambda_sqrt * extra_bits + 0.5;
+      best_bipred_cost += info->state->lambda_sqrt * extra_bits;
 
-      if (cost < *inter_cost) {
-        cur_cu->inter.mv_dir = 3;
-
-        cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0];
-        cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1];
-
-        cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0];
-        cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1];
-        cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0];
-        cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1];
-        cur_cu->merged = 0;
-
-        // Check every candidate to find a match
-        for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) {
-          if (merge_cand[merge_idx].dir != 3) continue;
-          if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
-            merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
-            merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
-            merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
-            merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
-            merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1])
-          {
-            cur_cu->merged = 1;
-            cur_cu->merge_idx = merge_idx;
-            break;
-          }
-        }
+      if (best_bipred_cost < MAX_DOUBLE) {
 
         // Each motion vector has its own candidate
         for (int reflist = 0; reflist < 2; reflist++) {
-          uvg_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist);
           int cu_mv_cand = select_mv_cand(
-            info.state,
-            info.mv_cand,
-            cur_cu->inter.mv[reflist][0],
-            cur_cu->inter.mv[reflist][1],
+            info->state,
+            info->mv_cand,
+            bipred_pu->inter.mv[reflist][0],
+            bipred_pu->inter.mv[reflist][1],
             NULL);
-          CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand);
+          CU_SET_MV_CAND(bipred_pu, reflist, cu_mv_cand);
         }
 
-        *inter_cost = cost;
-        *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits;
+        amvp[2].cost[amvp[2].size] = best_bipred_cost;
+        amvp[2].bits[amvp[2].size] = bitcost[0] + bitcost[1] + extra_bits;
+        amvp[2].keys[amvp[2].size] = amvp[2].size;
+        amvp[2].size++;
       }
     }
 
     // TODO: this probably should have a separate command line option
-    if (cfg->rdo >= 3) {
-      search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost);
+    if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]);
+    
+    assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE);
+    uvg_sort_keys_by_cost(&amvp[2]);
+    if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) {
+      uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]);
     }
   }
+  if(cfg->rdo < 2) {
+    int predmode_ctx;
+    const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx);
+    const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0);
 
-  // Compare best merge cost to amvp cost
-  if (mrg_costs[0] < *inter_cost) {
-    *inter_cost = mrg_costs[0];
-    *inter_bitcost = 0; // TODO: Check this
-    int merge_idx = mrg_cands[0];
-    cur_cu->type = CU_INTER;
-    cur_cu->merge_idx = merge_idx;
-    cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir;
-    cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0];
-    cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1];
-    cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0];
-    cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1];
-    cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0];
-    cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1];
-    cur_cu->merged = true;
-    cur_cu->skipped = false;
-  }
-
-  if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) {
-    assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]));
+    const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0);
+    const double total_bits = no_skip_flag + pred_mode_bits;
+    for(int i = 0; i < 3; i++) {
+      if(amvp[i].size > 0) {
+        const uint8_t best_key = amvp[i].keys[0];
+        amvp[i].bits[best_key] += total_bits;
+        amvp[i].cost[best_key] += (total_bits)* state->lambda_sqrt;
+      }
+    }
   }
 }
 
@@ -1985,32 +2084,99 @@ static void search_pu_inter(encoder_state_t * const state,
 * \param inter_bitcost Return inter bitcost
 */
 void uvg_cu_cost_inter_rd2(encoder_state_t * const state,
-  int x, int y, int depth,
-  lcu_t *lcu,
-  double   *inter_cost,
-  uint32_t *inter_bitcost){
-
-  cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+                           int x, int y, int depth,
+                           cu_info_t* cur_cu,
+                           lcu_t *lcu,
+                           double   *inter_cost,
+                           double* inter_bitcost){
+  // Writes the cost and bit count of cur_cu to *inter_cost / *inter_bitcost; may clear cur_cu->cbf and set cur_cu->skipped.
   int tr_depth = MAX(1, depth);
   if (cur_cu->part_size != SIZE_2Nx2N) {
     tr_depth = depth + 1;
   }
   uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth);
 
+  const int x_px = SUB_SCU(x);
+  const int y_px = SUB_SCU(y);
+  const int width = LCU_WIDTH >> depth;
+  cabac_data_t cabac_copy;
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));  // private copy of the search CABAC state
+  cabac_copy.update = 1;
+  // Copy the candidate into the LCU at (x_px, y_px); presumably so uvg_inter_recon_cu, which is given only lcu, uses it - TODO confirm.
+  cu_info_t* cur_pu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  *cur_pu = *cur_cu;
+
   const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
   uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma);
-  uvg_quantize_lcu_residual(state, true, reconstruct_chroma,
-    x, y, depth,
-    NULL,
-    lcu,
-    false);
 
-  *inter_cost = uvg_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu);
+  int index = y_px * LCU_WIDTH + x_px;
+  double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
+                                   LCU_WIDTH, LCU_WIDTH,
+                                   width) * UVG_LUMA_MULT;  // luma SSD between lcu->ref and lcu->rec, taken before quantization
   if (reconstruct_chroma) {
-    *inter_cost += uvg_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu);
+    int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;  // NOTE(review): shadows the luma 'index' declared above
+    double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
+                                       LCU_WIDTH_C, LCU_WIDTH_C,
+                                       width / 2);
+    double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
+                                       LCU_WIDTH_C, LCU_WIDTH_C,
+                                       width / 2);
+    ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
   }
+  double no_cbf_bits;  // bit count paired with 'ssd' to form no_cbf_cost below
+  double bits = 0;     // bit count added to the cost when coefficients remain after quantization
+  const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL);
+  if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+    no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost;  // NOTE(review): reads state->cabac, not search_cabac/cabac_copy - confirm intended
+    bits += uvg_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu);
+  }
+  else {
+    no_cbf_bits = uvg_mock_encode_coding_unit(state, &cabac_copy, x, y, depth, lcu, cur_cu);
+    bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac_copy.ctx.cu_qt_root_cbf_model, 1);  // subtract the root-cbf=0 bits, add the root-cbf=1 bits
+  }
+  double no_cbf_cost = ssd + no_cbf_bits * state->lambda;  // SSD measured before quantization + no_cbf_bits * lambda
 
-  *inter_cost += *inter_bitcost * state->lambda;
+  uvg_quantize_lcu_residual(state,
+                            true, reconstruct_chroma,
+                            reconstruct_chroma && state->encoder_control->cfg.jccr, x, y,
+                            depth,
+                            cur_cu,
+                            lcu,
+                            false);
+
+  int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+  
+  if(cbf) {
+    *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu);
+    if (reconstruct_chroma) {
+      if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) {
+        *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu);
+      }
+      else {
+        uvg_select_jccr_mode(state, x_px, y_px, depth, cur_cu, lcu, inter_cost);        
+      }
+    }
+  }
+  else {
+    // No coefficients are set after quantization: reuse the no-cbf cost computed above
+    *inter_cost = no_cbf_cost;
+    cur_cu->cbf = 0;  // NOTE(review): 'skipped' is not set on this path, unlike the no_cbf_cost branch below - confirm intended
+    *inter_bitcost = no_cbf_bits;
+    return;
+  }
+  
+  *inter_cost += (bits)* state->lambda;
+  *inter_bitcost = bits;
+
+  if(no_cbf_cost < *inter_cost) {  // the no-cbf cost is lower: clear cbf and report the no-cbf cost/bits instead
+    cur_cu->cbf = 0;
+    if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) {
+      cur_cu->skipped = 1;
+    }
+    *inter_cost = no_cbf_cost;
+    *inter_bitcost = no_cbf_bits;
+    
+  }
 }
 
 
@@ -2032,21 +2198,79 @@ void uvg_search_cu_inter(encoder_state_t * const state,
                          int x, int y, int depth,
                          lcu_t *lcu,
                          double   *inter_cost,
-                         uint32_t *inter_bitcost)
+                         double* inter_bitcost)
 {
+  *inter_cost = MAX_DOUBLE;
+  *inter_bitcost = MAX_INT;
+
+  // Store information of L0, L1, and bipredictions.
+  // Best cost will be left at MAX_DOUBLE if no valid CU is found.
+  // These will be initialized by the following function.
+  unit_stats_map_t amvp[3];
+  unit_stats_map_t merge;
+  inter_search_info_t info;
+
   search_pu_inter(state,
                   x, y, depth,
                   SIZE_2Nx2N, 0,
                   lcu,
-                  inter_cost,
-                  inter_bitcost);
+                  amvp,
+                  &merge,
+                  &info);
 
-  // Calculate more accurate cost when needed
-  if (state->encoder_control->cfg.rdo >= 2) {
-    uvg_cu_cost_inter_rd2(state,
-      x, y, depth,
-      lcu,
-      inter_cost,
-      inter_bitcost);
+  // Early Skip CU decision
+  if (merge.size == 1 && merge.unit[0].skipped) {
+    *inter_cost    = merge.cost[0];
+    *inter_bitcost = merge.bits[0];
+    return;
+  }
+
+  cu_info_t *best_inter_pu = NULL;
+
+  // Find best AMVP PU
+  for (int mv_dir = 1; mv_dir < 4; ++mv_dir) {
+
+    int best_key = amvp[mv_dir - 1].keys[0];
+
+    if (amvp[mv_dir - 1].size > 0 &&
+        amvp[mv_dir - 1].cost[best_key] < *inter_cost) {
+
+      best_inter_pu  = &amvp[mv_dir - 1].unit[best_key];
+      *inter_cost    =  amvp[mv_dir - 1].cost[best_key];
+      *inter_bitcost =  amvp[mv_dir - 1].bits[best_key];
+    }
+  }
+
+  // Compare best AMVP against best Merge mode
+  int best_merge_key = merge.keys[0];
+
+  if (merge.size > 0 && merge.cost[best_merge_key] < *inter_cost) {
+
+    best_inter_pu  = &merge.unit[best_merge_key];
+    *inter_cost    =  merge.cost[best_merge_key];
+    *inter_bitcost =  0; // TODO: Check this
+  }
+
+  if (*inter_cost == MAX_DOUBLE) {
+    // Could not find any motion vector.
+    *inter_cost = MAX_DOUBLE;
+    *inter_bitcost = MAX_INT;
+    return;
+  }
+
+  const int x_local = SUB_SCU(x);
+  const int y_local = SUB_SCU(y);
+  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
+  *cur_pu = *best_inter_pu;
+
+  uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth),
+    true, state->encoder_control->chroma_format != UVG_CSP_400);   
+
+  if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) {
+    assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1]));
+  }
+
+  if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 2) {
+    assert(fracmv_within_tile(&info, cur_pu->inter.mv[1][0], cur_pu->inter.mv[1][1]));
   }
 }
diff --git a/src/search_inter.h b/src/search_inter.h
index d1e1ee71..d76dd927 100644
--- a/src/search_inter.h
+++ b/src/search_inter.h
@@ -64,20 +64,34 @@ enum hpel_position {
   HPEL_POS_DIA = 2
 };
 
-typedef uint32_t uvg_mvd_cost_func(const encoder_state_t *state,
+typedef double uvg_mvd_cost_func(const encoder_state_t *state,
                                   int x, int y,
                                   int mv_shift,
                                   mv_t mv_cand[2][2],
                                   inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                                   int16_t num_cand,
                                   int32_t ref_idx,
-                                  uint32_t *bitcost);
+                                  double *bitcost);
 
 void uvg_search_cu_inter(encoder_state_t * const state,
                          int x, int y, int depth,
                          lcu_t *lcu,
                          double *inter_cost,
-                         uint32_t *inter_bitcost);
+                         double* inter_bitcost);
 
 
+
+unsigned uvg_inter_satd_cost(const encoder_state_t* state,
+                             const lcu_t *lcu,
+                             int x,
+                             int y);
+void uvg_cu_cost_inter_rd2(encoder_state_t* const state,
+  int x, int y, int depth,
+  cu_info_t* cur_cu,
+  lcu_t* lcu,
+  double* inter_cost,
+  double* inter_bitcost);
+
+int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx);
+
 #endif // SEARCH_INTER_H_
diff --git a/src/search_intra.c b/src/search_intra.c
index e89720fb..9dc24fba 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -37,6 +37,7 @@
 #include "cabac.h"
 #include "encoder.h"
 #include "encoderstate.h"
+#include "encode_coding_tree.h"
 #include "image.h"
 #include "intra.h"
 #include "uvg266.h"
@@ -97,13 +98,13 @@ static double get_cost(encoder_state_t * const state,
 
     // Add the offset bit costs of signaling 'luma and chroma use trskip',
     // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
-    const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
+    const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma;
     double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
 
     
     // ToDo: Check cost
     if (state->encoder_control->chroma_format != UVG_CSP_400) {
-      ctx = &state->cabac.ctx.transform_skip_model_chroma;
+      ctx = &state->search_cabac.ctx.transform_skip_model_chroma;
       trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
     }
     
@@ -253,13 +254,15 @@ static void derive_mts_constraints(cu_info_t *const pred_cu,
 * \param cost_treshold  RD cost at which search can be stopped.
 * \param mts_mode       Selected MTS mode for current intra mode.
 */
-static double search_intra_trdepth(encoder_state_t * const state,
-                                   int x_px, int y_px, int depth, int max_depth,
-                                   int intra_mode, int cost_treshold,
-                                   cu_info_t *const pred_cu,
-                                   lcu_t *const lcu,
-                                   cclm_parameters_t *cclm_params,
-                                   const int mts_mode)
+static double search_intra_trdepth(
+  encoder_state_t * const state,
+  int x_px,
+  int y_px,
+  int depth,
+  int max_depth,
+  int cost_treshold,
+  intra_search_data_t *const search_data,
+  lcu_t *const lcu)
 {
   assert(depth >= 0 && depth <= MAX_PU_DEPTH);
 
@@ -268,9 +271,10 @@ static double search_intra_trdepth(encoder_state_t * const state,
 
   const int offset = width / 2;
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
-  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
   const bool reconstruct_chroma = (depth != 4 || (depth == 4 && (x_px & 4 && y_px & 4))) && state->encoder_control->chroma_format != UVG_CSP_400;
+  cu_info_t* pred_cu = &search_data->pred_cu;
+  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
   struct {
     uvg_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH];
@@ -295,16 +299,16 @@ static double search_intra_trdepth(encoder_state_t * const state,
       cbf_clear(&pred_cu->cbf, depth, COLOR_V);
     }
 
-    const int8_t chroma_mode = reconstruct_chroma ? intra_mode : -1;
+    const int8_t chroma_mode = reconstruct_chroma ? pred_cu->intra.mode : -1;
     double best_rd_cost = MAX_INT;
     int best_tr_idx = 0;
 
     int trafo;
     int num_transforms = 1;
-    if (mts_mode != -1)
+    if (pred_cu->tr_idx != MTS_TR_NUM)
     {
-      trafo = mts_mode;
-      num_transforms = mts_mode + 1;
+      trafo = pred_cu->tr_idx;
+      num_transforms = pred_cu->tr_idx + 1;
     }
     else
     {
@@ -315,6 +319,8 @@ static double search_intra_trdepth(encoder_state_t * const state,
     if(state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size) /*&& height == 4*/) {
       num_transforms = MAX(num_transforms, 2);
     }
+    pred_cu->intra.mode_chroma = -1;
+    pred_cu->joint_cb_cr = 4;
     for (; trafo < num_transforms; trafo++) {
       pred_cu->tr_idx = trafo;
       if (mts_enabled)
@@ -330,12 +336,10 @@ static double search_intra_trdepth(encoder_state_t * const state,
       }
      
       uvg_intra_recon_cu(state,
-        x_px, y_px,
-        depth,
-        intra_mode, -1,
-        pred_cu, cclm_params, pred_cu->intra.multi_ref_idx, 
-        pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed,
-        lcu);
+                         x_px, y_px,
+                         depth, search_data,
+                         pred_cu,
+                         lcu);
 
       // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1
       if (pred_cu->tr_idx > 1)
@@ -343,7 +347,6 @@ static double search_intra_trdepth(encoder_state_t * const state,
         derive_mts_constraints(pred_cu, lcu, depth, lcu_px);
         if (pred_cu->violates_mts_coeff_constraint || !pred_cu->mts_last_scan_pos)
         {
-          assert(mts_mode == -1); //mts mode should not be decided and then not allowed to be used. (might be some exception here)
           continue;
         }
       }
@@ -359,14 +362,17 @@ static double search_intra_trdepth(encoder_state_t * const state,
       }
     }
     if(reconstruct_chroma) {
+      int8_t luma_mode = pred_cu->intra.mode;
+      pred_cu->intra.mode = -1;
+      pred_cu->intra.mode_chroma = chroma_mode;
+      pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently
       uvg_intra_recon_cu(state,
-        x_px, y_px,
-        depth,
-        -1, chroma_mode,
-        pred_cu, cclm_params, 0, 
-        pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed,
-        lcu);
+                         x_px, y_px,
+                         depth, search_data,
+                         pred_cu, 
+                         lcu);
       best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+      pred_cu->intra.mode = luma_mode;
     }
     pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
     pred_cu->tr_idx = best_tr_idx;
@@ -394,17 +400,17 @@ static double search_intra_trdepth(encoder_state_t * const state,
   //     max_depth.
   // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->lambda;
+    split_cost = 0;
 
-    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
+    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu);
     if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
+      split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, nosplit_cost, search_data, lcu);
     }
     if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
+      split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu);
     }
     if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
+      split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, nosplit_cost, search_data, lcu);
     }
 
     double cbf_bits = 0.0;
@@ -417,14 +423,15 @@ static double search_intra_trdepth(encoder_state_t * const state,
     // so this will code cbf as 0 and not code the cbf at all for descendants.
     if (state->encoder_control->chroma_format != UVG_CSP_400) {
       const uint8_t tr_depth = depth - pred_cu->depth;
+      cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
 
-      const cabac_ctx_t* ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+      cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+        CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb");
       }
       ctx = &(state->cabac.ctx.qt_cbf_model_cr[cbf_is_set(pred_cu->cbf, depth, COLOR_U)]);
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+        CABAC_FBITS_UPDATE(cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr");
       }
     }
 
@@ -452,29 +459,44 @@ static double search_intra_trdepth(encoder_state_t * const state,
     return nosplit_cost;
   }
 }
+static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length)
+{
+  // Stable insertion sort of the candidate list by ascending cost.
+  // Entries are moved as whole structs, so every cost stays attached to
+  // the prediction data it was computed for. Entries with equal cost keep
+  // their relative order because the comparison below is strict.
+  for (uint8_t next = 1; next < length; ++next) {
+    const intra_search_data_t pending = modes[next];
+    uint8_t slot = next;
+    for (; slot > 0 && pending.cost < modes[slot - 1].cost; --slot) {
+      modes[slot] = modes[slot - 1];
+    }
+    modes[slot] = pending;
+    // modes[0..next] is now in ascending cost order.
+  }
+}
 
-
-static void search_intra_chroma_rough(encoder_state_t * const state,
-                                      int x_px, int y_px, int depth,
-                                      const uvg_pixel *orig_u, const uvg_pixel *orig_v, int16_t origstride,
-                                      uvg_intra_references *refs_u, uvg_intra_references *refs_v,
-                                      int8_t luma_mode,
-                                      int8_t modes[8], double costs[8], lcu_t* lcu)
+static void search_intra_chroma_rough( // Rough chroma cost per candidate: SATD of U and V plus lambda_sqrt-weighted mode bits
+  encoder_state_t * const state,
+  int x_px,
+  int y_px,
+  int depth,
+  const uvg_pixel *orig_u,
+  const uvg_pixel *orig_v,
+  int16_t origstride,
+  uvg_intra_references *refs_u,
+  uvg_intra_references *refs_v,
+  intra_search_data_t* chroma_data, // in/out: cost is accumulated into each entry (caller is assumed to zero it), bits is written
+  lcu_t* lcu)
 {
   assert(!(x_px & 4 || y_px & 4));
 
   const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
-  const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - (depth + 1), 2);
-
-  for (int i = 0; i < 8; ++i) {
-    costs[i] = 0;
-  }
 
   cost_pixel_nxn_func *const satd_func = uvg_pixels_get_satd_func(width);
   //cost_pixel_nxn_func *const sad_func = uvg_pixels_get_sad_func(width);
-
-  cclm_parameters_t cclm_params;
-  
+  cu_loc_t loc = { x_px, y_px, width, width, width, width }; // square chroma block passed to uvg_intra_predict
+    
   uvg_pixel _pred[32 * 32 + SIMD_ALIGNMENT];
   uvg_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT);
 
@@ -482,34 +504,27 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   uvg_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT);
 
   uvg_pixels_blit(orig_u, orig_block, width, width, origstride, width);
-  for (int i = 0; i < 5; ++i) {
-    if (modes[i] == -1) continue;
-    uvg_intra_predict(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false, 0);
+  int modes_count = (state->encoder_control->cfg.cclm ? 8 : 5); // 8 candidates when CCLM is enabled, otherwise 5
+  for (int i = 0; i < modes_count; ++i) {
+    if (chroma_data[i].pred_cu.intra.mode_chroma == -1) continue;
+    uvg_intra_predict(state, refs_u, &loc, COLOR_U, pred, &chroma_data[i], lcu);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
-    costs[i] += satd_func(pred, orig_block);
-  }
-  for (int i = 5; i < 8; i++) {
-    assert(state->encoder_control->cfg.cclm);
-    uvg_predict_cclm(
-      state,
-      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u,  pred, &cclm_params);
+    chroma_data[i].cost += satd_func(pred, orig_block); // U plane distortion
   }
 
   uvg_pixels_blit(orig_v, orig_block, width, width, origstride, width);
-  for (int i = 0; i < 5; ++i) {
-    if (modes[i] == -1) continue;
-    uvg_intra_predict(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false, 0);
+  for (int i = 0; i < modes_count; ++i) {
+    if (chroma_data[i].pred_cu.intra.mode_chroma == -1) continue;
+    uvg_intra_predict(state, refs_v, &loc, COLOR_V, pred, &chroma_data[i], lcu);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
-    costs[i] += satd_func(pred, orig_block);
-  }
-  for (int i = 5; i < 8; i++) {
-    assert(state->encoder_control->cfg.cclm);
-    uvg_predict_cclm(
-      state,
-      COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params);
+    chroma_data[i].cost += satd_func(pred, orig_block); // V plane distortion
   }
 
-  uvg_sort_modes(modes, costs, 5);
+  for (int i = 0; i < modes_count; ++i) {
+    const double bits = uvg_chroma_mode_bits(state, chroma_data[i].pred_cu.intra.mode_chroma, chroma_data[i].pred_cu.intra.mode);
+    chroma_data[i].bits = bits;
+    chroma_data[i].cost += bits * state->lambda_sqrt; // add to the SATD gathered above; plain assignment would discard it
+  }
 }
 
 
@@ -543,11 +558,16 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
  *
  * \return  Number of prediction modes in param modes.
  */
-static int8_t search_intra_rough(encoder_state_t * const state, 
-                                 uvg_pixel *orig, int32_t origstride,
-                                 uvg_intra_references *refs,
-                                 int log2_width, int8_t *intra_preds,
-                                 int8_t modes[67], double costs[67])
+static int16_t search_intra_rough( // Rough luma mode search; returns the number of entries written to modes_out
+  encoder_state_t * const state,
+  uvg_pixel *orig,
+  int32_t origstride,
+  uvg_intra_references *refs,
+  int log2_width,
+  int8_t *intra_preds, // MPM list; INTRA_MPM_COUNT entries are read
+  intra_search_data_t* modes_out, // out: cost and pred_cu of every selected mode
+  cu_info_t* const pred_cu, // template copied into each output entry
+  uint8_t mip_ctx) // context index for the mip_flag bit estimate
 {
   #define PARALLEL_BLKS 2 // TODO: use 4 for AVX-512 in the future?
   assert(log2_width >= 2 && log2_width <= 5);
@@ -556,9 +576,11 @@ static int8_t search_intra_rough(encoder_state_t * const state,
   cost_pixel_nxn_func *sad_func = uvg_pixels_get_sad_func(width);
   cost_pixel_nxn_multi_func *satd_dual_func = uvg_pixels_get_satd_dual_func(width);
   cost_pixel_nxn_multi_func *sad_dual_func = uvg_pixels_get_sad_dual_func(width);
+  int8_t modes[UVG_NUM_INTRA_MODES]; // local scratch lists; results are copied to modes_out at the end
+  double costs[UVG_NUM_INTRA_MODES];
 
-  const uvg_config *cfg = &state->encoder_control->cfg;
-  const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm);
+  // const uvg_config *cfg = &state->encoder_control->cfg;
+  // const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm);
 
   // Temporary block arrays
   uvg_pixel _preds[PARALLEL_BLKS * 32 * 32 + SIMD_ALIGNMENT];
@@ -587,12 +609,18 @@ static int8_t search_intra_rough(encoder_state_t * const state,
 
   // Calculate SAD for evenly spaced modes to select the starting point for 
   // the recursive search.
+  cu_loc_t loc = { 0, 0, width, width, width, width };
+  intra_search_data_t search_proxy; // scratch candidate handed to uvg_intra_predict
+  FILL(search_proxy, 0);
+  search_proxy.pred_cu = *pred_cu; // intra.mode is overwritten before each prediction below
+
   for (int mode = 2; mode <= 66; mode += PARALLEL_BLKS * offset) {
     
     double costs_out[PARALLEL_BLKS] = { 0 };
     for (int i = 0; i < PARALLEL_BLKS; ++i) {
       if (mode + i * offset <= 66) {
-        uvg_intra_predict(state, refs, log2_width, mode + i * offset, COLOR_Y, preds[i], filter_boundary, 0);
+        search_proxy.pred_cu.intra.mode = mode + i*offset;
+        uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL);
       }
     }
     
@@ -631,7 +659,8 @@ static int8_t search_intra_rough(encoder_state_t * const state,
       if (mode_in_range) {
         for (int i = 0; i < PARALLEL_BLKS; ++i) {
           if (test_modes[i] >= 2 && test_modes[i] <= 66) {
-            uvg_intra_predict(state, refs, log2_width, test_modes[i], COLOR_Y, preds[i], filter_boundary, 0);
+            search_proxy.pred_cu.intra.mode = test_modes[i];
+            uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[i], &search_proxy, NULL);
           }
         }
 
@@ -653,10 +682,10 @@ static int8_t search_intra_rough(encoder_state_t * const state,
     }
   }
 
-  int8_t add_modes[5] = {intra_preds[0], intra_preds[1], intra_preds[2], 0, 1};
+  int8_t add_modes[INTRA_MPM_COUNT + 2] = {intra_preds[0], intra_preds[1], intra_preds[2], intra_preds[3], intra_preds[4], intra_preds[5], 0, 1};
 
   // Add DC, planar and missing predicted modes.
-  for (int8_t pred_i = 0; pred_i < 5; ++pred_i) {
+  for (int8_t pred_i = 0; pred_i < (INTRA_MPM_COUNT + 2); ++pred_i) {
     bool has_mode = false;
     int8_t mode = add_modes[pred_i];
 
@@ -668,7 +697,8 @@ static int8_t search_intra_rough(encoder_state_t * const state,
     }
 
     if (!has_mode) {
-      uvg_intra_predict(state, refs, log2_width, mode, COLOR_Y, preds[0], filter_boundary, 0);
+      search_proxy.pred_cu.intra.mode = mode;
+      uvg_intra_predict(state, refs, &loc, COLOR_Y, preds[0], &search_proxy, NULL);
       costs[modes_selected] = get_cost(state, preds[0], orig_block, satd_func, sad_func, width);
       modes[modes_selected] = mode;
       ++modes_selected;
@@ -677,16 +707,106 @@ static int8_t search_intra_rough(encoder_state_t * const state,
 
   // Add prediction mode coding cost as the last thing. We don't want this
   // affecting the halving search.
-  int lambda_cost = (int)(state->lambda_sqrt + 0.5);
+  const double not_mrl = state->encoder_control->cfg.mrl ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 0) : 0;
+  const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0;
+  const double mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 1);
+  const double not_mpm_mode_bit = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.intra_luma_mpm_flag_model), 0);
+  const double planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 1);
+  const double not_planar_mode_flag = CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.luma_planar_model[1]), 0);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * uvg_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0);
+    int i = 0; // index of the mode in intra_preds, or INTRA_MPM_COUNT when it is not an MPM
+    int smaller_than_pred = 0; // number of MPMs below the mode; only complete when the mode is not an MPM
+    double bits;
+    for (; i < INTRA_MPM_COUNT; i++) {
+      if (intra_preds[i] == modes[mode_i]) {
+        break;
+      }
+      if (modes[mode_i] > intra_preds[i]) {
+        smaller_than_pred += 1;
+      }
+    }
+    if (i == 0) {
+      bits = planar_mode_flag + mpm_mode_bit;
+    }
+    else if (i < INTRA_MPM_COUNT) {
+      bits = not_planar_mode_flag + mpm_mode_bit + MIN(i, 4); // MPM index cost is capped at 4 bins (was MAX, which charged at least 4 for every index)
+    }
+    else {
+      bits = not_mpm_mode_bit + 5 + (modes[mode_i] - smaller_than_pred > 3); // NOTE(review): 5 or 6 bits for the remainder; confirm the threshold
+    }
+    bits += not_mrl + not_mip;
+    costs[mode_i] += state->lambda_sqrt * bits;
+    modes_out[mode_i].cost = costs[mode_i];
+    modes_out[mode_i].pred_cu = *pred_cu;
+    modes_out[mode_i].pred_cu.intra.mode = modes[mode_i];
+    modes_out[mode_i].pred_cu.intra.mode_chroma = modes[mode_i];
   }
 
   #undef PARALLEL_BLKS
-
   return modes_selected;
 }
 
+
+static void get_rough_cost_for_2n_modes( // Rough cost (get_cost_dual result + lambda_sqrt * estimated bits) for MRL and MIP candidates, two per pass
+  encoder_state_t* const state,
+  uvg_intra_references* refs, // indexed by each candidate's intra.multi_ref_idx
+  const cu_loc_t* const cu_loc, // only width is read here; the struct is also passed on to uvg_intra_predict
+  uvg_pixel *orig,
+  int orig_stride,
+  intra_search_data_t *search_data, // in/out: the cost of each of the num_modes entries is overwritten
+  int num_modes, // must be even (asserted below)
+  uint8_t mip_ctx) // context index used for the mip_flag bit estimates
+{
+#define PARALLEL_BLKS 2
+  assert(num_modes % 2 == 0 && "passing odd number of modes to get_rough_cost_for_2n_modes");
+  const int width = cu_loc->width;
+  cost_pixel_nxn_multi_func* satd_dual_func = uvg_pixels_get_satd_dual_func(width);
+  cost_pixel_nxn_multi_func* sad_dual_func = uvg_pixels_get_sad_dual_func(width);
+
+  uvg_pixel _preds[PARALLEL_BLKS * MIN(LCU_WIDTH, 64)* MIN(LCU_WIDTH, 64)+ SIMD_ALIGNMENT];
+  pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT);
+
+  uvg_pixel _orig_block[MIN(LCU_WIDTH, 64) * MIN(LCU_WIDTH, 64) + SIMD_ALIGNMENT];
+  uvg_pixel* orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT);
+
+  uvg_pixels_blit(orig, orig_block, width, width, orig_stride, width);
+  
+  const double mrl = state->encoder_control->cfg.mrl ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[0]), 1) : 0;
+  const double not_mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 0) : 0;
+  const double mip = state->encoder_control->cfg.mip ? CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.mip_flag[mip_ctx]), 1) : 0;
+  double costs_out[PARALLEL_BLKS] = { 0 };
+  double bits[PARALLEL_BLKS] = { 0 }; // NOTE(review): not reset per pair; with NDEBUG the unsupported branch below would reuse the previous pair's bits
+  for(int mode = 0; mode < num_modes; mode += PARALLEL_BLKS) {
+    for (int i = 0; i < PARALLEL_BLKS; ++i) {
+      uvg_intra_predict(state, &refs[search_data[mode + i].pred_cu.intra.multi_ref_idx], cu_loc, COLOR_Y, preds[i], &search_data[mode + i], NULL);
+    }
+    get_cost_dual(state, preds, orig_block, satd_dual_func, sad_dual_func, width, costs_out);
+
+    for(int i = 0; i < PARALLEL_BLKS; ++i) {
+      uint8_t multi_ref_idx = search_data[mode + i].pred_cu.intra.multi_ref_idx;
+      if(multi_ref_idx) {
+        bits[i] = mrl + not_mip;
+        bits[i] += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.multi_ref_line[1]), multi_ref_idx != 1);
+        bits[i] += MIN((mode + i + 1) % 6, 4); // NOTE(review): presumably the MPM index cost derived from list position - confirm against the caller's ordering
+      }
+      else if(search_data[mode + i].pred_cu.intra.mip_flag) {
+        bits[i] = mip + 1;
+        bits[i] += num_modes == 32 ? 4 : (num_modes == 16 ? 3 : (((mode + i) % 6) < 2 ? 2 : 3)); // NOTE(review): bit count is keyed on the candidate count - confirm it matches the MIP mode coding
+      }
+      else {
+        assert(0 && "get_rough_cost_for_2n_modes supports only mrl and mip mode cost calculation");
+      }
+    }
+    search_data[mode].cost = costs_out[0]; // any previous cost is replaced by the cost from get_cost_dual
+    search_data[mode + 1].cost = costs_out[1];
+
+    search_data[mode].cost += bits[0] * state->lambda_sqrt; // add the estimated signalling bits weighted by lambda_sqrt
+    search_data[mode + 1].cost += bits[1] * state->lambda_sqrt;
+  }
+#undef PARALLEL_BLKS
+}
+
+
 /**
  * \brief  Find best intra mode out of the ones listed in parameter modes.
  *
@@ -713,224 +833,57 @@ static int8_t search_intra_rough(encoder_state_t * const state,
  * \param[out] lcu  If transform split searching is used, the transform split
  *     information for the best mode is saved in lcu.cu structure.
  */
-static int8_t search_intra_rdo(encoder_state_t * const state, 
-                             int x_px, int y_px, int depth,
-                             uvg_pixel *orig, int32_t origstride,
-                             int8_t *intra_preds,
-                             int modes_to_check,
-                             int8_t modes[67], int8_t trafo[67], double costs[67],
-                             int num_mip_modes_full,
-                             int8_t mip_modes[32], int8_t mip_trafo[32], double mip_costs[32],
-                             lcu_t *lcu,
-                             uint8_t multi_ref_idx)
+static int8_t search_intra_rdo( // RD cost for each candidate in search_data; the cheapest one is copied to search_data[0]
+  encoder_state_t * const state,
+  int x_px,
+  int y_px,
+  int depth,
+  int modes_to_check, // number of entries of search_data to evaluate
+  intra_search_data_t *search_data, // in/out: bits, cost and pred_cu.tr_idx of the evaluated entries are rewritten
+  lcu_t *lcu)
 {
   const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra);
-  const int width = LCU_WIDTH >> depth;
-  const int height = width; // TODO: proper height for non-square blocks
-
-  uvg_pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1];
-
-  uvg_pixels_blit(orig, orig_block, width, height, origstride, width);
-
-  // Check that the predicted modes are in the RDO mode list
-  if (modes_to_check < 67) {
-    int pred_mode = 0;
-    // Skip planar if searching modes for MRL
-    if (multi_ref_idx != 0) {
-      pred_mode = 1;
-    }
-    for (; pred_mode < 6; pred_mode++) {
-      int mode_found = 0;
-      for (int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode++) {
-        if (intra_preds[pred_mode] == modes[rdo_mode]) {
-          mode_found = 1;
-          break;
-        }
-      }
-      // Add this prediction mode to RDO checking
-      if (!mode_found) {
-        modes[modes_to_check] = intra_preds[pred_mode];
-        modes_to_check++;
-      }
-    }
-  }
-
-  // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
-  // MIP_TODO: loop through normal intra modes first
   
-  for (int mip = 0; mip <= 1; mip++) {
-    const int transp_off = mip ? num_mip_modes_full >> 1 : 0;
-    uint8_t ctx_id = mip ? uvg_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL) : 0;
-    uint8_t multi_ref_index = mip ? 0 : multi_ref_idx;
-    int *num_modes = mip ? &num_mip_modes_full : &modes_to_check;
+  for (int mode = 0; mode < modes_to_check; mode++) {
+    double rdo_bitcost = uvg_luma_mode_bits(state, &search_data[mode].pred_cu, x_px, y_px, depth, lcu); // bits for signalling this candidate
+    search_data[mode].pred_cu.tr_idx = MTS_TR_NUM; // MTS_TR_NUM = no transform index fixed in advance (see the tr_idx check in search_intra_trdepth)
+    search_data[mode].bits = rdo_bitcost;
+    search_data[mode].cost = rdo_bitcost * state->lambda; // any earlier cost in this entry is replaced
 
-    for (uint8_t i = 0; i < *num_modes; i++) {
-      int8_t mode = mip ? mip_modes[i] : modes[i];
-      double *mode_cost_p = mip ? &mip_costs[i] : &costs[i];
-      int8_t *mode_trafo_p = mip ? &mip_trafo[i] : &trafo[i];
-      int rdo_bitcost = uvg_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id);
-
-      *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5);
-
-      // Mip related stuff
-      // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
-      // Half of the modes [16, 31] are indicated with the separate transpose flag.
-      // Number of possible modes is less for larger blocks.
-      const bool is_transposed = mip ? (mode >= transp_off ? true : false) : 0;
-      int8_t pred_mode = (is_transposed ? mode - transp_off : mode);
-
-      // Perform transform split search and save mode RD cost for the best one.
-      cu_info_t pred_cu;
-      pred_cu.depth = depth;
-      pred_cu.type = CU_INTRA;
-      pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks
-      pred_cu.intra.mode = pred_mode;
-      pred_cu.intra.mode_chroma = pred_mode;
-      pred_cu.intra.multi_ref_idx = multi_ref_index;
-      pred_cu.intra.mip_is_transposed = is_transposed;
-      pred_cu.intra.mip_flag = mip ? true : false;
-      pred_cu.joint_cb_cr = 0;
-      FILL(pred_cu.cbf, 0);
-
-      // Reset transform split data in lcu.cu for this area.
-      uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
-
-      double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_mode, MAX_INT, &pred_cu, lcu, NULL, -1);
-      *mode_cost_p += mode_cost;
-      *mode_trafo_p = pred_cu.tr_idx;
-
-      // Early termination if no coefficients has to be coded
-      if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
-        *num_modes = i + 1;
-        break;
-      }
+    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, MAX_INT, &search_data[mode], lcu);
+    search_data[mode].cost += mode_cost; // total = lambda * mode bits + cost returned by the transform depth search
+    if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(search_data[mode].pred_cu.cbf, depth)) { // early termination when no cbf is set for this candidate
+      modes_to_check = mode + 1; // only entries [0, mode] were evaluated
+      break;
     }
   }
 
   // Update order according to new costs
-  uvg_sort_modes_intra_luma(modes, trafo, costs, modes_to_check);
-  bool use_mip = false;
-  if (num_mip_modes_full) {
-    uvg_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes_full);
-    if (costs[0] > mip_costs[0]) {
-      use_mip = true;
+  double best_cost = MAX_INT;
+  int best_mode = 0; // stays 0 when no entry costs less than MAX_INT
+  for (int mode = 0; mode < modes_to_check; mode++) {
+    if(search_data[mode].cost < best_cost) { // strict comparison: the first of equally cheap entries wins
+      best_cost = search_data[mode].cost;
+      best_mode = mode;
     }
   }
-  
+  search_data[0] = search_data[best_mode]; // entry 0 is overwritten with the winner; the list itself is not sorted
 
-  // The best transform split hierarchy is not saved anywhere, so to get the
-  // transform split hierarchy the search has to be performed again with the
-  // best mode.
-  if (tr_depth != depth) {
-    cu_info_t pred_cu;
-    pred_cu.depth = depth;
-    pred_cu.type = CU_INTRA;
-    pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
-    if (use_mip) {
-      int transp_off = num_mip_modes_full >> 1;
-      bool is_transposed = (mip_modes[0] >= transp_off ? true : false);
-      int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]);
-      pred_cu.intra.mode = pred_mode;
-      pred_cu.intra.mode_chroma = pred_mode;
-      pred_cu.intra.multi_ref_idx = 0;
-      pred_cu.intra.mip_flag = true;
-      pred_cu.intra.mip_is_transposed = is_transposed;
-    }
-    else {
-      pred_cu.intra.mode = modes[0];
-      pred_cu.intra.mode_chroma = modes[0];
-      pred_cu.intra.multi_ref_idx = multi_ref_idx;
-      pred_cu.intra.mip_flag = false;
-      pred_cu.intra.mip_is_transposed = false;
-    }
-    FILL(pred_cu.cbf, 0);
-    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
-  }
-
-  // TODO: modes to check does not consider mip modes. Maybe replace with array when mip search is optimized?
   return modes_to_check;
 }
 
 
-double uvg_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id)
+double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu) // Returns the bit count that uvg_encode_intra_luma_coding_unit reports through mode_bits for cur_cu
 {
-  double mode_bits = 0.0;
-
-  bool enable_mip = state->encoder_control->cfg.mip;
-  bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false;
-
-  // Mip flag cost must be calculated even if mip is not used in this block
-  if (enable_mip) {
-    // Make a copy of state->cabac for bit cost estimation.
-    cabac_data_t state_cabac_copy;
-    cabac_data_t* cabac;
-    memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
-    // Clear data and set mode to count only
-    state_cabac_copy.only_count = 1;
-    state_cabac_copy.num_buffered_bytes = 0;
-    state_cabac_copy.bits_left = 23;
-
-    cabac = &state_cabac_copy;
-
-    // Do cabac writes as normal
-    const int transp_off = num_mip_modes_half;
-    const bool is_transposed = luma_mode >= transp_off ? true : false;
-    int8_t mip_mode = is_transposed ? luma_mode - transp_off : luma_mode;
-
-    // Write MIP flag
-    cabac->cur_ctx = &(cabac->ctx.mip_flag[mip_flag_ctx_id]);
-    CABAC_BIN(cabac, mip_flag, "mip_flag");
-    
-    if (mip_flag) {
-      // Write MIP transpose flag & mode
-      CABAC_BIN_EP(cabac, is_transposed, "mip_transposed");
-      uvg_cabac_encode_trunc_bin(cabac, mip_mode, transp_off);
-    }
-    
-    // Write is done. Get bit cost out of cabac
-    mode_bits += (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
-  }
-
-  if (!mip_flag) {
-    int8_t mode_in_preds = -1;
-    for (int i = 0; i < INTRA_MPM_COUNT; ++i) {
-      if (luma_mode == intra_preds[i]) {
-        mode_in_preds = i;
-        break;
-      }
-    }
-
-    bool enable_mrl = state->encoder_control->cfg.mrl;
-    uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0;
-
-    const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model);
-
-    if (multi_ref_index == 0) {
-      mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1);
-    }
-
-    // Add MRL bits.
-    if (enable_mrl && MAX_REF_LINE_IDX > 1) {
-      ctx = &(state->cabac.ctx.multi_ref_line[0]);
-      mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0);
-
-      if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) {
-        ctx = &(state->cabac.ctx.multi_ref_line[1]);
-        mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1);
-      }
-    }
-
-    if (mode_in_preds != -1 || multi_ref_index != 0) {
-      ctx = &(state->cabac.ctx.luma_planar_model[0]);
-      if (multi_ref_index == 0) {
-        mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0);
-      }
-      mode_bits += MIN(4.0, mode_in_preds);
-    }
-    else {
-      mode_bits += 6.0;
-    }
-  }
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac; // const is cast away, but this pointer is only read from (memcpy source below)
+  double mode_bits = 0;
+  cabac_data_t cabac_copy; // the encoder call below receives this copy rather than state->search_cabac
+  memcpy(&cabac_copy, cabac, sizeof cabac_copy);
+  uvg_encode_intra_luma_coding_unit(
+    state,
+    &cabac_copy, cur_cu,
+    x, y, depth, lcu, &mode_bits
+  );
 
   return mode_bits;
 }
@@ -938,7 +891,8 @@ double uvg_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
 
 double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode)
 {
-  const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model);
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac; // const is cast away: the update branch at the end writes through this pointer
+  const cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model); // bit estimates now come from the search CABAC contexts
   double mode_bits;
   if (chroma_mode == luma_mode) {
     mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
@@ -958,15 +912,26 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
     mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67);
   }
 
+  if(cabac->update) { // only when the search CABAC has its update flag set
+    if(chroma_mode != luma_mode) {
+      // Again it does not matter what we actually write here
+      CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode");      
+    }
+  }
+
   return mode_bits;
 }
 
 
-int8_t uvg_search_intra_chroma_rdo(encoder_state_t * const state,
-                                  int x_px, int y_px, int depth,
-                                  int8_t intra_mode,
-                                  int8_t modes[8], int8_t num_modes,
-                                  lcu_t *const lcu, cclm_parameters_t *best_cclm)
+int8_t uvg_search_intra_chroma_rdo( // RD cost for each chroma candidate; sorts chroma_data by cost and returns the cheapest chroma mode
+  encoder_state_t * const state,
+  int x_px,
+  int y_px,
+  int depth,
+  int8_t num_modes, // number of entries of chroma_data to evaluate
+  lcu_t *const lcu,
+  intra_search_data_t* chroma_data, // in/out: cost is rewritten and the entries are reordered
+  int8_t luma_mode) // luma mode used for the chroma mode bit estimate
 {
   const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4);
 
@@ -980,84 +945,32 @@ int8_t uvg_search_intra_chroma_rdo(encoder_state_t * const state,
 
 
   if (reconstruct_chroma) {
-
-    int c_width = MAX(32 >> (depth), 4);
-
     uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0);
     uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0);
-
-    cclm_parameters_t cclm_params[2] = { 0 };
-
+    
     const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
     cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
-
-    struct {
-      double cost;
-      int8_t mode;
-      cclm_parameters_t cclm[2];
-    } chroma, best_chroma;
-
-    // chroma.cclm = cclm_params;
-
-    best_chroma.mode = 0;
-    best_chroma.cost = MAX_INT;
-
-    for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
-      chroma.mode = modes[chroma_mode_i];
-      if (chroma.mode == -1) continue;
-      if(chroma.mode < 67 || depth == 0) {
-        uvg_intra_recon_cu(state,
-          x_px, y_px,
-          depth,
-          -1, chroma.mode, // skip luma
-          NULL, NULL, 0, false, false, lcu);
+    
+    for (int8_t i = 0; i < num_modes; ++i) {
+      const uint8_t mode = chroma_data[i].pred_cu.intra.mode_chroma; // NOTE(review): stored unsigned but passed to an int8_t parameter below; a -1 entry is no longer skipped - confirm callers never pass one
+      uvg_intra_recon_cu(state,
+                         x_px, y_px,
+                         depth, &chroma_data[i],
+        &chroma_data[i].pred_cu,
+                         lcu);      
+      
+      if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) { // plain chroma RD cost when tr_cu's depth differs from its tr_depth or JCCR is disabled
+        chroma_data[i].cost = uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu);
+      } else {
+        uvg_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu, &chroma_data[i].cost); // cost is written through the last argument
       }
-      else {
 
-        uvg_predict_cclm(
-          state, COLOR_U,
-          c_width, c_width,
-          x_px & ~7, y_px & ~7,
-          state->tile->frame->source->stride,
-          chroma.mode, 
-          lcu,
-          &refs[0], NULL,
-          &cclm_params[0]);
-
-        chroma.cclm[0] = cclm_params[0];
-
-        uvg_predict_cclm(
-          state, COLOR_V,
-          c_width, c_width,
-          x_px & ~7, y_px & ~7,
-          state->tile->frame->source->stride, 
-          chroma.mode, 
-          lcu, 
-          &refs[1], NULL,
-          &cclm_params[1]);
-
-        chroma.cclm[1] = cclm_params[1];
-
-        uvg_intra_recon_cu(
-          state,
-          x_px, y_px,
-          depth,
-          -1, chroma.mode, // skip luma
-          NULL, cclm_params, 0, false, false, lcu);
-      }
-      chroma.cost = uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
-
-      double mode_bits = uvg_chroma_mode_bits(state, chroma.mode, intra_mode);
-      chroma.cost += mode_bits * state->lambda;
-
-      if (chroma.cost < best_chroma.cost) {
-        best_chroma = chroma;
-      }
+      double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode);
+      chroma_data[i].cost += mode_bits * state->lambda; // add the mode signalling cost weighted by lambda
     }
-    best_cclm[0] = best_chroma.cclm[0];
-    best_cclm[1] = best_chroma.cclm[1];
+    sort_modes(chroma_data, num_modes); // ascending cost, so the cheapest candidate ends up first
 
-    return best_chroma.mode;
+    return chroma_data[0].pred_cu.intra.mode_chroma;
   }
 
   return 100;
@@ -1066,18 +979,25 @@ int8_t uvg_search_intra_chroma_rdo(encoder_state_t * const state,
 
 int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
                               const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu, cclm_parameters_t *best_cclm)
+                              const int depth, lcu_t *lcu, intra_search_data_t *search_data)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
 
   cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   int8_t intra_mode = cur_pu->intra.mode;
-
-  double costs[8];
+  
   int8_t modes[8] = { 0, 50, 18, 1, -1, 81, 82, 83 };
+  uint8_t total_modes = (state->encoder_control->cfg.cclm ? 8 : 5);
   if (intra_mode != 0 && intra_mode != 50 && intra_mode != 18 && intra_mode != 1) {
     modes[4] = intra_mode;
   }
+  else {
+    total_modes -= 1;
+    modes[4] = modes[5];
+    modes[5] = modes[6];
+    modes[6] = modes[7];
+  }
+
 
   // The number of modes to select for slower chroma search. Luma mode
   // is always one of the modes, so 2 means the final decision is made
@@ -1087,13 +1007,21 @@ int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
   int num_modes = modes_in_depth[depth];
 
   if (state->encoder_control->cfg.rdo >= 3) {
-    num_modes = state->encoder_control->cfg.cclm ? 8 : 5;
+    num_modes = total_modes;
   }
 
+  intra_search_data_t chroma_data[8];
+  FILL(chroma_data, 0);
+  for (int i = 0; i < num_modes; i++) {
+    chroma_data[i].pred_cu = *cur_pu;
+    chroma_data[i].pred_cu.intra.mode_chroma = modes[i];
+    chroma_data[i].pred_cu.intra.mode = -1;
+  }
   // Don't do rough mode search if all modes are selected.
   // FIXME: It might make more sense to only disable rough search if
   // num_modes is 0.is 0.
-  if (num_modes != 1 && num_modes != 5 && num_modes != 4 && num_modes != 8) {
+
+  if (total_modes != num_modes) {
     const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
     const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
     const vector2d_t luma_px = { x_px, y_px };
@@ -1109,16 +1037,18 @@ int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
     uvg_pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
 
     search_intra_chroma_rough(state, x_px, y_px, depth,
-                              ref_u, ref_v, LCU_WIDTH_C,
+                              ref_u, ref_v,
+                              LCU_WIDTH_C,
                               &refs_u, &refs_v,
-                              intra_mode, modes, costs, lcu);
+      chroma_data, lcu);
+    sort_modes(chroma_data, total_modes);
   }
 
   int8_t intra_mode_chroma = intra_mode;
   if (num_modes > 1) {
-    intra_mode_chroma = uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu, best_cclm);
+    intra_mode_chroma = uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode);
   }
-
+  *search_data = chroma_data[0];
   return intra_mode_chroma;
 }
 
@@ -1127,25 +1057,29 @@ int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
  * Update lcu to have best modes at this depth.
  * \return Cost of best mode.
  */
-void uvg_search_cu_intra(encoder_state_t * const state,
-                         const int x_px, const int y_px,
-                         const int depth, lcu_t *lcu,
-                         int8_t *mode_out, 
-                         int8_t *trafo_out, 
-                         double *cost_out,
-                         uint8_t *multi_ref_idx_out,
-                         bool *mip_flag_out,
-                         bool * mip_transposed_out)
+void uvg_search_cu_intra(
+  encoder_state_t * const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  intra_search_data_t* mode_out,
+  lcu_t *lcu)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
   const int8_t cu_width = LCU_WIDTH >> depth;
+  const cu_loc_t cu_loc = { x_px, y_px, cu_width, cu_width,
+    MAX(cu_width >> 1, TR_MIN_WIDTH), MAX(cu_width >> 1, TR_MIN_WIDTH) };
   const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth;
+  const vector2d_t luma_px = { x_px, y_px };
+  const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
 
   cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
-  uvg_intra_references refs;
+  uvg_intra_references refs[MAX_REF_LINE_IDX];
 
   int8_t candidate_modes[INTRA_MPM_COUNT];
+  // Normal intra modes + mrl modes + mip modes
+  intra_search_data_t search_data[UVG_NUM_INTRA_MODES +(MAX_REF_LINE_IDX - 1) * (INTRA_MPM_COUNT - 1) + 32];
 
   cu_info_t *left_cu = 0;
   cu_info_t *above_cu = 0;
@@ -1161,75 +1095,115 @@ void uvg_search_cu_intra(encoder_state_t * const state,
   uvg_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu);
 
   if (depth > 0) {
-    const vector2d_t luma_px = { x_px, y_px };
-    const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
-
-    // These references will only be used with rough search. No need for MRL stuff here.
-    uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs, state->encoder_control->cfg.wpp, NULL, 0);
+    uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, refs, state->encoder_control->cfg.wpp, NULL, 0);
   }
 
-  int8_t modes[MAX_REF_LINE_IDX][67];
-  int8_t trafo[MAX_REF_LINE_IDX][67] = { 0 };
-  double costs[MAX_REF_LINE_IDX][67];
-
-  bool enable_mip = state->encoder_control->cfg.mip;
-  // The maximum number of mip modes is 32. Max modes can be less depending on block size.
-  // Half of the possible modes are transposed, which is indicated by a separate transpose flag
-  int8_t mip_modes[32]; 
-  int8_t mip_trafo[32];
-  double mip_costs[32];
-
   // The maximum number of possible MIP modes depend on block size & shape
   int width = LCU_WIDTH >> depth;
   int height = width; // TODO: proper height for non-square blocks.
-  int num_mip_modes = 0;
 
-  if (enable_mip) {
-    for (int i = 0; i < 32; ++i) {
-      mip_modes[i] = i;
-      mip_costs[i] = MAX_INT;
-    }
-    // MIP is not allowed for 64 x 4 or 4 x 64 blocks
-    if (!((width == 64 && height == 4) || (width == 4 && height == 64))) {
-      num_mip_modes = NUM_MIP_MODES_FULL(width, height);
-    }
-  }
+  // This is needed for bit cost calculation and requires too many parameters to be
+  // calculated inside the rough search functions
+  uint8_t mip_ctx = uvg_get_mip_flag_context(x_px, y_px, cu_width, cu_width, lcu, NULL);
 
   // Find best intra mode for 2Nx2N.
   uvg_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
 
-  int8_t number_of_modes[MAX_REF_LINE_IDX] = { 0 };
+  // Need to set some data for all cus
+  cu_info_t temp_pred_cu;
+  temp_pred_cu = *cur_cu;
+  temp_pred_cu.type = CU_INTRA;
+  FILL(temp_pred_cu.intra, 0);
+
+  int16_t number_of_modes;
   bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4);
   if (!skip_rough_search) {
-    number_of_modes[0] = search_intra_rough(state,
-                                         ref_pixels, LCU_WIDTH,
-                                         &refs,
+    number_of_modes = search_intra_rough(state,
+                                         ref_pixels,
+                                         LCU_WIDTH,
+                                         refs,
                                          log2_width, candidate_modes,
-                                         modes[0], costs[0]);
-    // Copy rough results for other reference lines
-    for (int line = 1; line < MAX_REF_LINE_IDX; ++line) {
-      number_of_modes[line] = number_of_modes[0];
-      for (int i = 0; i < number_of_modes[line]; ++i) {
-        modes[line][i] = modes[0][i];
-        costs[line][i] = costs[0][i];
-      }
-    }
+                                         search_data, &temp_pred_cu,
+                                         mip_ctx);
+
   } else {
-    for(int line = 0; line < MAX_REF_LINE_IDX; ++line) {
-      number_of_modes[line] = 67;
-      for (int i = 0; i < number_of_modes[line]; ++i) {
-        modes[line][i] = i;
-        costs[line][i] = MAX_INT;
-      }
+    for (int8_t i = 0; i < UVG_NUM_INTRA_MODES; i++) {
+      search_data[i].pred_cu = temp_pred_cu;
+      search_data[i].pred_cu.intra.mode = i;
+      search_data[i].pred_cu.intra.mode_chroma = i;
+      search_data[i].cost = MAX_INT;
     }
+    number_of_modes = UVG_NUM_INTRA_MODES;
   }
 
-  uint8_t lines = 1;
-  // Find modes with multiple reference lines if in use. Do not use if CU in first row.
-  if (state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0) {
-    lines = MAX_REF_LINE_IDX;
+  int num_mip_modes = 0;
+  if (state->encoder_control->cfg.mip) {
+    // MIP is not allowed for 64 x 4 or 4 x 64 blocks
+    if (!((width == 64 && height == 4) || (width == 4 && height == 64))) {
+      num_mip_modes = NUM_MIP_MODES_FULL(width, height);
+
+      for (int transpose = 0; transpose < 2; transpose++) {
+        const int half_mip_modes = NUM_MIP_MODES_HALF(width, height);
+        for (int i = 0; i < half_mip_modes; ++i) {
+          const int index = i + number_of_modes + transpose * half_mip_modes;
+          search_data[index].pred_cu = temp_pred_cu;
+          search_data[index].pred_cu.intra.mip_flag = 1;
+          search_data[index].pred_cu.intra.mode = i;
+          search_data[index].pred_cu.intra.mip_is_transposed = transpose;
+          search_data[index].pred_cu.intra.mode_chroma = i;
+          search_data[index].cost = MAX_INT;
+        }
+      }
+      if(!skip_rough_search) {
+        get_rough_cost_for_2n_modes(state, refs, &cu_loc,
+                                    ref_pixels,
+                                    LCU_WIDTH, search_data + number_of_modes, num_mip_modes,
+                                    mip_ctx);
+      }
+    }
+    number_of_modes += num_mip_modes;
   }
 
+  int num_mrl_modes = 0;
+  // Find modes with multiple reference lines if in use. Do not use if CU in first row.
+  uint8_t lines = state->encoder_control->cfg.mrl && (y_px % LCU_WIDTH) != 0 ? MAX_REF_LINE_IDX : 1;
+
+  for(int line = 1; line < lines; ++line) {
+    uvg_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 };
+
+    if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0) {
+      videoframe_t* const frame = state->tile->frame;
+
+      // Copy extra ref lines, including ref line 1 and top left corner.
+      for (int i = 0; i < MAX_REF_LINE_IDX; ++i) {
+        int height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX;
+        height = MIN(height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX)); // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist.
+        height = MIN(height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX);
+        uvg_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)],
+          &extra_refs[i * 128],
+          1, height,
+          frame->rec->stride, 1);
+      }
+    }
+    uvg_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs[line], state->encoder_control->cfg.wpp, extra_refs, line);
+    for(int i = 1; i < INTRA_MPM_COUNT; i++) {
+      num_mrl_modes++;
+      const int index = (i - 1) + (INTRA_MPM_COUNT -1)*(line-1) + number_of_modes;
+      search_data[index].pred_cu = temp_pred_cu;
+      search_data[index].pred_cu.intra.mode = candidate_modes[i];
+      search_data[index].pred_cu.intra.multi_ref_idx = line;
+      search_data[index].pred_cu.intra.mode_chroma = candidate_modes[i];
+      search_data[index].cost = MAX_INT;
+    }
+  }
+  if (!skip_rough_search && lines != 1) {
+    get_rough_cost_for_2n_modes(state, refs, &cu_loc,
+                                ref_pixels,
+                                LCU_WIDTH, search_data + number_of_modes, num_mrl_modes,
+                                mip_ctx);
+  }
+  number_of_modes += num_mrl_modes;
+
   // Set transform depth to current depth, meaning no transform splits.
   uvg_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
   // Refine results with slower search or get some results if rough search was skipped.
@@ -1237,79 +1211,56 @@ void uvg_search_cu_intra(encoder_state_t * const state,
   if (rdo_level >= 2 || skip_rough_search) {
     int number_of_modes_to_search;
     if (rdo_level == 4) {
-      number_of_modes_to_search = 67;
+      number_of_modes_to_search = number_of_modes;
     } else if (rdo_level == 2 || rdo_level == 3) {
       number_of_modes_to_search = (cu_width == 4) ? 3 : 2;
     } else {
       // Check only the predicted modes.
       number_of_modes_to_search = 0;
     }
-    
-    for(int8_t line = 0; line < lines; ++line) {
-      // For extra reference lines, only check predicted modes & no MIP search.
-      if (line != 0) {
-        number_of_modes_to_search = 0;
-        num_mip_modes = 0;
+    if(!skip_rough_search) {
+      sort_modes(search_data, number_of_modes);
+    }
+
+    for(int pred_mode = 0; pred_mode < INTRA_MPM_COUNT; ++pred_mode) {
+      bool mode_found = false;
+      for(int i = 0; i < number_of_modes_to_search; i++) {
+        if(search_data[i].pred_cu.intra.mode == candidate_modes[pred_mode]) {
+          mode_found = true;
+          break;
+        }
+      }
+      if(!mode_found) {
+        search_data[number_of_modes_to_search].pred_cu = temp_pred_cu;
+        search_data[number_of_modes_to_search].pred_cu.intra.mode = candidate_modes[pred_mode];
+        search_data[number_of_modes_to_search].pred_cu.intra.mode_chroma = candidate_modes[pred_mode];
+        number_of_modes_to_search++;
       }
-      int num_modes_to_check = MIN(number_of_modes[line], number_of_modes_to_search);
-      uvg_sort_modes(modes[line], costs[line], number_of_modes[line]);
-      // TODO: if rough search is implemented for MIP, sort mip_modes here.
-      number_of_modes[line] = search_intra_rdo(state,
-                            x_px, y_px, depth,
-                            ref_pixels, LCU_WIDTH,
-                            candidate_modes,
-                            num_modes_to_check,
-                            modes[line], trafo[line], costs[line],
-                            num_mip_modes,
-                            mip_modes, mip_trafo, mip_costs,
-                            lcu, line);
     }
+
+    // TODO: if rough search is implemented for MIP, sort mip_modes here.
+    search_intra_rdo(
+      state,
+      x_px,
+      y_px,
+      depth,
+      number_of_modes_to_search,
+      search_data,
+      lcu);
+    // Clear the MTS tracking flags of search_data[0], which is returned via mode_out.
+    search_data[0].pred_cu.violates_mts_coeff_constraint = false;
+    search_data[0].pred_cu.mts_last_scan_pos = false;    
   }
-  
-  uint8_t best_line = 0;
-  double best_line_mode_cost = costs[0][0];
-  uint8_t best_mip_mode_idx = 0;
-  uint8_t best_mode_indices[MAX_REF_LINE_IDX];
-
-  int8_t tmp_best_mode;
-  int8_t tmp_best_trafo;
-  double tmp_best_cost;
-  bool tmp_mip_flag = false;
-  bool tmp_mip_transp = false;
-
-  for (int line = 0; line < lines; ++line) {
-    best_mode_indices[line] = select_best_mode_index(modes[line], costs[line], number_of_modes[line]);
-    if (best_line_mode_cost > costs[line][best_mode_indices[line]]) {
-      best_line_mode_cost = costs[line][best_mode_indices[line]];
-      best_line = line;
+  else {
+    double best_cost = MAX_INT;
+    int best_mode = 0;
+    for (int mode = 0; mode < number_of_modes; mode++) {
+      if (search_data[mode].cost < best_cost) {
+        best_cost = search_data[mode].cost;
+        best_mode = mode;
+      }
     }
+    search_data[0] = search_data[best_mode];
   }
-
-  tmp_best_mode = modes[best_line][best_mode_indices[best_line]];
-  tmp_best_trafo = trafo[best_line][best_mode_indices[best_line]];
-  tmp_best_cost = costs[best_line][best_mode_indices[best_line]];
-
-  if (num_mip_modes) {
-    best_mip_mode_idx = select_best_mode_index(mip_modes, mip_costs, num_mip_modes);
-    if (tmp_best_cost > mip_costs[best_mip_mode_idx]) {
-      tmp_best_mode = mip_modes[best_mip_mode_idx];
-      tmp_best_trafo = mip_trafo[best_mip_mode_idx];
-      tmp_best_cost = mip_costs[best_mip_mode_idx];
-      tmp_mip_flag = true;
-      tmp_mip_transp = (tmp_best_mode >= (num_mip_modes >> 1)) ? 1 : 0;
-    }
-  }
-
-  if (tmp_mip_flag) {
-    // Transform best mode index to proper form.
-    // Max mode index is half of max number of modes - 1 (i. e. for size id 2, max mode id is 5)
-    tmp_best_mode = (tmp_mip_transp ? tmp_best_mode - (num_mip_modes >> 1) : tmp_best_mode);
-  }
-
-  *mode_out =  tmp_best_mode;
-  *trafo_out = tmp_best_trafo;
-  *cost_out =  tmp_best_cost;
-  *mip_flag_out = tmp_mip_flag;
-  *mip_transposed_out = tmp_mip_transp;
-  *multi_ref_idx_out = tmp_mip_flag ? 0 : best_line;
+  *mode_out = search_data[0];
 }
diff --git a/src/search_intra.h b/src/search_intra.h
index ea73156b..7bcb6480 100644
--- a/src/search_intra.h
+++ b/src/search_intra.h
@@ -43,24 +43,21 @@
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"
 
-double uvg_luma_mode_bits(const encoder_state_t *state, 
-                          int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id);
+double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu);
                        
 double uvg_chroma_mode_bits(const encoder_state_t *state,
                         int8_t chroma_mode, int8_t luma_mode);
 
 int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
                               const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm);
+                              const int depth, lcu_t *lcu, intra_search_data_t* best_cclm);
 
-void uvg_search_cu_intra(encoder_state_t * const state,
-                         const int x_px, const int y_px,
-                         const int depth, lcu_t *lcu,
-                         int8_t *mode_out,
-                         int8_t *trafo_out, 
-                         double *cost_out,
-                         uint8_t *multi_ref_idx_out,
-                         bool *mip_flag,
-                         bool *mip_transp);
+void uvg_search_cu_intra(
+  encoder_state_t * const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  intra_search_data_t* search_data,
+  lcu_t *lcu);
 
 #endif // SEARCH_INTRA_H_
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index 6ab6994d..a4ea1d58 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -225,39 +225,40 @@ int uvg_quant_cbcr_residual_generic(
   int64_t best_cost = INT64_MAX;
 
   // This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM
-  for(int cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) {
+  for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) {
     int64_t d1 = 0;
+    const int cbf_mask = i * (state->frame->jccr_sign ? -1 : 1);
     for (int y = 0; y < width; y++)
     {
       for (int x = 0; x < width; x++)
       {
         int cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
-        if (cbf_mask == 1)
+        if (cbf_mask == 2)
         {
-          u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (u1_residual[cbf_mask / 2][x + y * width] >> 1));
+          u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1));
         }
-        else if (cbf_mask == -1)
+        else if (cbf_mask == -2)
         {
-          u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (-u1_residual[cbf_mask / 2][x + y * width] >> 1));
+          u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1));
         }
         else if (cbf_mask == 3)
         {
-          u1_residual[cbf_mask / 2][x + y * width] = ((cbx + crx) / 2);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - u1_residual[cbf_mask / 2][x + y * width]);
+          u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]);
         }
         else if (cbf_mask == -3)
         {
-          u1_residual[cbf_mask / 2][x + y * width] = ((cbx - crx) / 2);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx + u1_residual[cbf_mask / 2][x + y * width]);
+          u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]);
         }
-        else if (cbf_mask == 2)
+        else if (cbf_mask == 1)
         {
           v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5);
           d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
         }
-        else if (cbf_mask == -2)
+        else if (cbf_mask == -1)
         {
           v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5);
           d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
@@ -270,19 +271,19 @@ int uvg_quant_cbcr_residual_generic(
       }
     }
     if (d1 < best_cost) {
-      best_cbf_mask = cbf_mask;
+      best_cbf_mask = i;
       best_cost = d1;
     }
   }
 
-  uvg_transform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu);
+  uvg_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
 
   if (state->encoder_control->cfg.rdoq_enable &&
     (width > 4 || !state->encoder_control->cfg.rdoq_skip))
   {
     int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
     tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
+    uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
       scan_order, cur_cu->type, tr_depth, cur_cu->cbf);
   }
   else if (state->encoder_control->cfg.rdoq_enable && false) {
@@ -290,7 +291,7 @@ int uvg_quant_cbcr_residual_generic(
       scan_order);
   }
   else {
-    uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
+    uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
       scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
   }
 
@@ -309,10 +310,10 @@ int uvg_quant_cbcr_residual_generic(
     int y, x;
 
     // Get quantized residual. (coeff_out -> coeff -> residual)
-    uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
+    uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
       cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
     
-    uvg_itransform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu);
+    uvg_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
     
 
     //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
@@ -333,32 +334,32 @@ int uvg_quant_cbcr_residual_generic(
     //    }
     //  }
     //}
-
+    const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1);
     // Get quantized reconstruction. (residual + pred_in -> rec_out)
     for (int y = 0; y < width; y++) {
       for (int x = 0; x < width; x++) {
-        if (best_cbf_mask == 1) {
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
-          v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width] >> 1;
+        if (temp == 2) {
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
+          v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
         }
-        else if (best_cbf_mask == -1) {
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
-          v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width] >> 1;
+        else if (temp == -2) {
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
+          v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
         }
-        else if (best_cbf_mask == 3) {
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
-          v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
+        else if (temp == 3) {
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
+          v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
         }
-        else if (best_cbf_mask == -3) {
+        else if (temp == -3) {
           // non-normative clipping to prevent 16-bit overflow
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
-          v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width];
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
+          v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width];
         }
-        else if (best_cbf_mask == 2) {
+        else if (temp == 1) {
           u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
           v_residual[x + y * width] = v1_residual[x + y * width];
         }
-        else if (best_cbf_mask == -2) {
+        else if (temp == -1) {
           u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
           v_residual[x + y * width] = -v1_residual[x + y * width];
         }
diff --git a/src/transform.c b/src/transform.c
index 925964f2..4ca02c72 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -260,11 +260,9 @@ int uvg_quantize_residual_trskip(
   struct {
     uvg_pixel rec[LCU_WIDTH * LCU_WIDTH];
     coeff_t coeff[LCU_WIDTH * LCU_WIDTH];
-    uint32_t cost;
+    double cost;
     int has_coeffs;
   } skip, *best;
-
-  const int bit_cost = (int)(state->lambda + 0.5);
   
   //noskip.has_coeffs = uvg_quantize_residual(
   //    state, cur_cu, width, color, scan_order,
@@ -278,7 +276,7 @@ int uvg_quantize_residual_trskip(
     1, in_stride, width,
     ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj);
   skip.cost = uvg_pixels_calc_ssd(ref_in, skip.rec, in_stride, width, width);
-  skip.cost += uvg_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * bit_cost;
+  skip.cost += uvg_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * state->frame->lambda;
 
 /*  if (noskip.cost <= skip.cost) {
     *trskip_out = 0;
@@ -481,15 +479,17 @@ static void quantize_tr_residual(encoder_state_t * const state,
  * - lcu->cbf               coded block flags for the area
  * - lcu->cu.intra.tr_skip  tr skip flags for the area (in case of luma)
  */
-void uvg_quantize_lcu_residual(encoder_state_t * const state,
-                               const bool luma,
-                               const bool chroma,
-                               const int32_t x,
-                               const int32_t y,
-                               const uint8_t depth,
-                               cu_info_t *cur_pu,
-                               lcu_t* lcu,
-                               bool early_skip)
+void uvg_quantize_lcu_residual(
+  encoder_state_t * const state,
+  const bool luma,
+  const bool chroma,
+  const bool jccr,
+  const int32_t x,
+  const int32_t y,
+  const uint8_t depth,
+  cu_info_t *cur_pu,
+  lcu_t* lcu,
+  bool early_skip)
 {
   const int32_t width = LCU_WIDTH >> depth;
   const vector2d_t lcu_px  = { SUB_SCU(x), SUB_SCU(y) };
@@ -511,7 +511,7 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state,
   if (luma) {
     cbf_clear(&cur_pu->cbf, depth, COLOR_Y);
   }
-  if (chroma) {
+  if (chroma || jccr) {
     cbf_clear(&cur_pu->cbf, depth, COLOR_U);
     cbf_clear(&cur_pu->cbf, depth, COLOR_V);
   }
@@ -523,10 +523,11 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state,
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    uvg_quantize_lcu_residual(state, luma, chroma, x,  y,  depth + 1, NULL, lcu, early_skip);
-    uvg_quantize_lcu_residual(state, luma, chroma, x2, y,  depth + 1, NULL, lcu, early_skip);
-    uvg_quantize_lcu_residual(state, luma, chroma, x,  y2, depth + 1, NULL, lcu, early_skip);
-    uvg_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu, early_skip);
+    // jccr is currently not supported if transform is split
+    uvg_quantize_lcu_residual(state, luma, chroma, 0,  x,  y, depth + 1, NULL, lcu, early_skip);
+    uvg_quantize_lcu_residual(state, luma, chroma, 0, x2,  y, depth + 1, NULL, lcu, early_skip);
+    uvg_quantize_lcu_residual(state, luma, chroma, 0,  x, y2, depth + 1, NULL, lcu, early_skip);
+    uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
@@ -548,10 +549,10 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state,
     }
     if (chroma) {
       quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip);
-      quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip);
-      if(state->encoder_control->cfg.jccr && cur_pu->tr_depth == cur_pu->depth){
-        quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip);
-      }
+      quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip);   
+    }
+    if (jccr && cur_pu->tr_depth == cur_pu->depth) {
+      quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip);
     }
   }
 }
diff --git a/src/transform.h b/src/transform.h
index a7fa232e..6a4f0bb9 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -67,14 +67,16 @@ void uvg_itransform2d(const encoder_control_t * const encoder,
 
 int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale);
 
-void uvg_quantize_lcu_residual(encoder_state_t *state,
-                               bool luma,
-                               bool chroma,
-                               int32_t x,
-                               int32_t y,
-                               uint8_t depth,
-                               cu_info_t *cur_cu,
-                               lcu_t* lcu,
-                               bool early_skip);
+void uvg_quantize_lcu_residual(
+  encoder_state_t *state,
+  bool luma,
+  bool chroma,
+  const bool jccr,
+  int32_t x,
+  int32_t y,
+  uint8_t depth,
+  cu_info_t *cur_cu,
+  lcu_t* lcu,
+  bool early_skip);
 
 #endif
diff --git a/src/uvg266.h b/src/uvg266.h
index 4ecc8d48..0593a605 100644
--- a/src/uvg266.h
+++ b/src/uvg266.h
@@ -267,6 +267,12 @@ enum uvg_amvr_resolution
   UVG_IMV_HPEL    = 3
 };
 
+enum uvg_roi_format
+{
+  UVG_ROI_TXT = 0, /*!< \brief ROI file in text format (.txt per the README; also the fallback for unknown extensions). */
+  UVG_ROI_BIN = 1  /*!< \brief ROI file in binary format (.bin per the README). */
+};
+
 // Map from input format to chroma format.
 #define UVG_FORMAT2CSP(format) ((enum uvg_chroma_format)format)
 
@@ -408,10 +414,9 @@ typedef struct uvg_config
   int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */
 
   struct {
-    int32_t width;
-    int32_t height;
-    int8_t *dqps;
-  } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */
+    char *file_path;
+    enum uvg_roi_format format;
+  } roi; /*!< \brief Specify delta QPs for region of interest coding. */
 
   unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */
 
@@ -524,6 +529,12 @@ typedef struct uvg_config
   int8_t cclm;
 
   int8_t amvr; /* \brief Adaptive motion vector resolution parameter */
+
+  /** \brief Whether to try combining intra CUs at the lower depth when search
+   *         is not performed at that depth. */
+  uint8_t combine_intra_cus;
+
+  uint8_t force_inter;
 } uvg_config;
 
 /**
@@ -555,6 +566,14 @@ typedef struct uvg_picture {
   enum uvg_chroma_format chroma_format;
 
   int32_t ref_pocs[16];
+
+  struct
+  {
+    int width;
+    int height;
+    int8_t *roi_array;
+  } roi;
+
 } uvg_picture;
 
 /**
@@ -780,6 +799,9 @@ typedef struct uvg_api {
    * the bitstream, length of the bitstream, the reconstructed frame, the
    * original frame and frame info in data_out, len_out, pic_out, src_out and
    * info_out, respectively. Otherwise, set the output parameters to NULL.
+   * 
+   * A region of interest (ROI) / delta QP map can be specified in the input
+   * picture's roi field, but only when a ROI file is not used.
    *
    * After passing all of the input frames, the caller should keep calling this
    * function with pic_in set to NULL, until no more data is returned in the
diff --git a/tests/test_slices.sh b/tests/test_slices.sh
index 512888b0..a4166036 100755
--- a/tests/test_slices.sh
+++ b/tests/test_slices.sh
@@ -3,6 +3,6 @@
 set -eu
 . "${0%/*}/util.sh"
 
-valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --tiles=2x2
+valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --gop 0 --tiles=2x2
 #valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp
 #if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi