Merge branch 'improve-intra-search'

2024-11-23 18:14:06 +00:00 · 2022-05-30 12:11:48 +03:00 · 2022-05-30 12:11:48 +03:00 · 153afc6739
parent a23f1c7035 ede7603361
commit 153afc6739
40 changed files with 2954 additions and 2057 deletions
--- a/README.md
+++ b/README.md
@ -145,11 +145,20 @@ Video structure:
                                   - frametile: Constrain within the tile.
                                   - frametilemargin: Constrain even more.
      --roi <filename>       : Use a delta QP map for region of interest.
-                               Reads an array of delta QP values from a text
-                               file. The file format is: width and height of
-                               the QP delta map followed by width*height delta
-                               QP values in raster order. The map can be of any
-                               size and will be scaled to the video size.
+                               Reads an array of delta QP values from a file.
+                               Text and binary files are supported and detected
+                               from the file extension (.txt/.bin). If a known
+                               extension is not found, the file is treated as
+                               a text file. The file can include one or many
+                               ROI frames each in the following format:
+                               width and height of the QP delta map followed
+                               by width * height delta QP values in raster
+                               order. In binary format, width and height are
+                               32-bit integers whereas the delta QP values are
+                               signed 8-bit values. The map can be of any size
+                               and will be scaled to the video size. The file
+                               reading will loop if end of the file is reached.
+                               See roi.txt in the examples folder.
      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.
                               in PPS and slice_qp_delta in slize header zero.
      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with
--- a/doc/uvg266.1
+++ b/doc/uvg266.1
@ -164,11 +164,20 @@ Constrain movement vectors. [none]
 .TP
 \fB\-\-roi <filename>      
 Use a delta QP map for region of interest.
-Reads an array of delta QP values from a text
-file. The file format is: width and height of
-the QP delta map followed by width*height delta
-QP values in raster order. The map can be of any
-size and will be scaled to the video size.
+Reads an array of delta QP values from a file.
+Text and binary files are supported and detected
+from the file extension (.txt/.bin). If a known
+extension is not found, the file is treated as
+a text file. The file can include one or many
+ROI frames each in the following format:
+width and height of the QP delta map followed
+by width * height delta QP values in raster
+order. In binary format, width and height are
+32\-bit integers whereas the delta QP values are
+signed 8\-bit values. The map can be of any size
+and will be scaled to the video size. The file
+reading will loop if end of the file is reached.
+See roi.txt in the examples folder.
 .TP
 \fB\-\-set\-qp\-in\-cu        
 Set QP at CU level keeping pic_init_qp_minus26.
--- a/src/alf.c
+++ b/src/alf.c
@ -1236,19 +1236,19 @@ static void code_alf_ctu_filter_index(encoder_state_t * const state,
      assert(filter_set_idx < num_available_filt_sets); //"temporal non-latest set"
      if (num_aps > 1)
      {
-        uvg_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS);
+        uvg_cabac_encode_trunc_bin(cabac, filter_set_idx - ALF_NUM_FIXED_FILTER_SETS, num_available_filt_sets - ALF_NUM_FIXED_FILTER_SETS, NULL);
      }
    }
    else
    {
      assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //"fixed set larger than temporal"
-      uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS);
+      uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL);
    }
  }
  else
  {
    assert(filter_set_idx < ALF_NUM_FIXED_FILTER_SETS); //Fixed set numavail < num_fixed
-    uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS);
+    uvg_cabac_encode_trunc_bin(cabac, filter_set_idx, ALF_NUM_FIXED_FILTER_SETS, NULL);
  }
 }

--- a/src/bitstream.c
+++ b/src/bitstream.c
@ -33,6 +33,7 @@
 #include "bitstream.h"

 #include <math.h>
+#include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>

--- a/src/cabac.c
+++ b/src/cabac.c
@ -70,6 +70,7 @@ void uvg_cabac_start(cabac_data_t * const data)
  data->num_buffered_bytes = 0;
  data->buffered_byte = 0xff;
  data->only_count = 0; // By default, write bits out
+  data->update = 0; 
 }

 /**
@ -199,7 +200,7 @@ void uvg_cabac_encode_bin_trm(cabac_data_t * const data, const uint8_t bin_value
 /**
 * \brief encode truncated binary code
 */
-void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value) {
+void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_value, const uint32_t max_value, double* bits_out) {
  int thresh;
  int symbol = bin_value;
  if (max_value > 256) {
@ -219,9 +220,11 @@ void uvg_cabac_encode_trunc_bin(cabac_data_t * const data, const uint32_t bin_va
  int b = max_value - val;
  if (symbol < val - b) {
    CABAC_BINS_EP(data, symbol, thresh, "TruncSymbols");
+    if (bits_out) *bits_out += thresh;
  } else {
    symbol += val - b;
    CABAC_BINS_EP(data, symbol, thresh + 1, "TruncSymbols");
+    if (bits_out) *bits_out += thresh + 1;
  }
 }

@ -349,7 +352,12 @@ void uvg_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t rem
 /**
 * \brief
 */
-void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol)
+void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, 
+  cabac_ctx_t * const ctx, 
+  uint32_t symbol,
+  const int32_t offset,
+  const uint32_t max_symbol, 
+  double* bits_out)
 {
  int8_t code_last = max_symbol > symbol;

@ -357,18 +365,17 @@ void uvg_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * c

  if (!max_symbol) return;
  
-  data->cur_ctx = ctx;
-  CABAC_BIN(data, symbol, "ums");
+  CABAC_FBITS_UPDATE(data, ctx, symbol, *bits_out, "ums");

  if (!symbol) return;

  data->cur_ctx = &ctx[offset];

  while (--symbol) {
-    CABAC_BIN(data, 1, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 1, *bits_out, "ums");
  }
  if (code_last) {
-    CABAC_BIN(data, 0, "ums");
+    CABAC_FBITS_UPDATE(data, &ctx[offset], 0,*bits_out, "ums");
  }
 }

@ -405,7 +412,7 @@ void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t * const data, unsigned int
 /**
 * \brief
 */
-void uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state,
+uint32_t uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state,
                                  cabac_data_t * const data,
                                  uint32_t symbol,
                                  uint32_t count)
@ -426,4 +433,5 @@ void uvg_cabac_write_ep_ex_golomb(encoder_state_t * const state,
  num_bins += count;

  CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb");
+  return num_bins;
 }
--- a/src/cabac.h
+++ b/src/cabac.h
@ -59,7 +59,8 @@ typedef struct
  uint32_t   buffered_byte;
  int32_t    num_buffered_bytes;
  int32_t    bits_left;
-  int8_t     only_count;
+  int8_t     only_count : 4;
+  int8_t     update : 4;
  bitstream_t *stream;

  // CONTEXTS
@ -133,18 +134,18 @@ extern const uint8_t uvg_g_auc_renorm_table[32];
 void uvg_cabac_start(cabac_data_t *data);
 void uvg_cabac_encode_bin(cabac_data_t *data, uint32_t bin_value);
 void uvg_cabac_encode_bin_ep(cabac_data_t *data, uint32_t bin_value);
-void uvg_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value);
+void uvg_cabac_encode_trunc_bin(cabac_data_t *data, uint32_t bin_value, uint32_t max_value, double* bits_out);
 void uvg_cabac_encode_bins_ep(cabac_data_t *data, uint32_t bin_values, int num_bins);
 void uvg_cabac_encode_bin_trm(cabac_data_t *data, uint8_t bin_value);
 void uvg_cabac_write(cabac_data_t *data);
 void uvg_cabac_finish(cabac_data_t *data);
 void uvg_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
                              uint32_t r_param, const unsigned int cutoff);
-void uvg_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
+uint32_t uvg_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
                uint32_t symbol, uint32_t count);
 void uvg_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
-                                  uint32_t symbol, int32_t offset,
-                                  uint32_t max_symbol);
+                                      uint32_t symbol, int32_t offset,
+                                      uint32_t max_symbol, double* bits_out);
 void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol);

 #define CTX_PROB_BITS 15
@ -153,6 +154,18 @@ void uvg_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol
 #define CTX_MASK_0 (~(~0u << CTX_PROB_BITS_0) << (CTX_PROB_BITS - CTX_PROB_BITS_0))
 #define CTX_MASK_1 (~(~0u << CTX_PROB_BITS_1) << (CTX_PROB_BITS - CTX_PROB_BITS_1))

+// Floating point fractional bits, derived from kvz_entropy_bits
+extern const float uvg_f_entropy_bits[512];
+#define CTX_ENTROPY_FBITS(ctx, val) uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]
+
+/**
+ * \brief Account for and/or encode one context-coded bin.
+ *
+ * When (cabac)->only_count is set, the entry of uvg_f_entropy_bits selected
+ * by the state of ctx and by val is added to bits. When (cabac)->update is
+ * set, ctx becomes the current context and the bin is passed to CABAC_BIN.
+ * Both steps run when both flags are set; the bit cost is read before the
+ * bin is encoded. cabac, ctx and val are expanded more than once, so they
+ * must be free of side effects.
+ */
+#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \
+  if((cabac)->only_count) (bits) += uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]; \
+  if((cabac)->update) {\
+    (cabac)->cur_ctx = (ctx);\
+    CABAC_BIN((cabac), (val), (name));\
+  } \
+} while(0)
+
 // Macros
 #define CTX_GET_STATE(ctx) ( (ctx)->state[0]+(ctx)->state[1] )
 #define CTX_STATE(ctx) ( CTX_GET_STATE(ctx)>>8 )
@ -185,23 +198,23 @@ extern uint32_t uvg_cabac_bins_count;
 extern bool uvg_cabac_bins_verbose;
 #define CABAC_BIN(data, value, name) { \
    uint32_t prev_state = CTX_STATE(data->cur_ctx); \
-    if(uvg_cabac_bins_verbose && !data->only_count) {printf("%d %d  [%d:%d]  %s = %u, range = %u LPS = %u state = %u -> ", \
-           uvg_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS(data->cur_ctx,(data)->range), CTX_LPS(data->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS(data->cur_ctx,(data)->range), prev_state); }\
+    if(uvg_cabac_bins_verbose && !(data)->only_count) {printf("%d %d  [%d:%d]  %s = %u, range = %u LPS = %u state = %u -> ", \
+           uvg_cabac_bins_count++, (data)->range, (data)->range-CTX_LPS((data)->cur_ctx,(data)->range), CTX_LPS((data)->cur_ctx,(data)->range), (name), (uint32_t)(value), (data)->range, CTX_LPS((data)->cur_ctx,(data)->range), prev_state); }\
    uvg_cabac_encode_bin((data), (value)); \
-    if(uvg_cabac_bins_verbose && !data->only_count) printf("%u\n", CTX_STATE(data->cur_ctx)); }
+    if(uvg_cabac_bins_verbose && !(data)->only_count) printf("%u\n", CTX_STATE((data)->cur_ctx)); }
    

  #define CABAC_BINS_EP(data, value, bins, name) { \
-    uint32_t prev_state = CTX_STATE(data->cur_ctx); \
+    uint32_t prev_state = (!(data)->only_count) ? CTX_STATE(data->cur_ctx) : 0; \
    uvg_cabac_encode_bins_ep((data), (value), (bins)); \
    if(uvg_cabac_bins_verbose && !data->only_count) { printf("%d %s = %u(%u bins), state = %u -> %u\n", \
-           uvg_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE(data->cur_ctx));  uvg_cabac_bins_count+=bins;}}
+           uvg_cabac_bins_count, (name), (uint32_t)(value), (bins), prev_state, CTX_STATE((data)->cur_ctx));  uvg_cabac_bins_count+=(bins);}}

  #define CABAC_BIN_EP(data, value, name) { \
-    uint32_t prev_state = CTX_STATE(data->cur_ctx); \
+    uint32_t prev_state = (!(data)->only_count) ? CTX_STATE((data)->cur_ctx) : 0;; \
    uvg_cabac_encode_bin_ep((data), (value)); \
-    if(uvg_cabac_bins_verbose && !data->only_count) {printf("%d %s = %u, state = %u -> %u\n", \
-           uvg_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE(data->cur_ctx)); }}
+    if(uvg_cabac_bins_verbose && !(data)->only_count) {printf("%d %s = %u, state = %u -> %u\n", \
+           uvg_cabac_bins_count++, (name), (uint32_t)(value), prev_state, CTX_STATE((data)->cur_ctx)); }}
 #else
  #define CABAC_BIN(data, value, name) \
    uvg_cabac_encode_bin((data), (value));
--- a/src/cfg.c
+++ b/src/cfg.c
@ -147,9 +147,9 @@ int uvg_config_init(uvg_config *cfg)
  cfg->gop_lp_definition.t = 1;
  cfg->open_gop = true;

-  cfg->roi.width = 0;
-  cfg->roi.height = 0;
-  cfg->roi.dqps = NULL;
+  cfg->roi.file_path = NULL;
+  cfg->roi.format = UVG_ROI_TXT;
+
  cfg->set_qp_in_cu = false;

  cfg->erp_aqp = false;
@ -212,6 +212,9 @@ int uvg_config_init(uvg_config *cfg)

  cfg->cclm = 0;

+
+  cfg->combine_intra_cus = 1;
+  cfg->force_inter = 0;
  return 1;
 }

@ -219,11 +222,11 @@ int uvg_config_destroy(uvg_config *cfg)
 {
  if (cfg) {
    FREE_POINTER(cfg->cqmfile);
+    FREE_POINTER(cfg->roi.file_path);
    FREE_POINTER(cfg->fast_coeff_table_fn);
    FREE_POINTER(cfg->tiles_width_split);
    FREE_POINTER(cfg->tiles_height_split);
    FREE_POINTER(cfg->slice_addresses_in_ts);
-    FREE_POINTER(cfg->roi.dqps);
    FREE_POINTER(cfg->fastrd_learning_outdir_fn);
  }
  free(cfg);
@ -1269,60 +1272,29 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
  }
  else if OPT("implicit-rdpcm")
    cfg->implicit_rdpcm = (bool)atobool(value);
+
  else if OPT("roi") {
-    // The ROI description is as follows:
-    // First number is width, second number is height,
-    // then follows width * height number of dqp values.
-    FILE* f = fopen(value, "rb");
-    if (!f) {
-      fprintf(stderr, "Could not open ROI file.\n");
+    static enum uvg_roi_format const formats[] = { UVG_ROI_TXT, UVG_ROI_BIN };
+    static const char * const format_names[] = { "txt", "bin", NULL };
+
+    char *roi_file = strdup(value);
+    if (!roi_file) {
+      fprintf(stderr, "Failed to allocate memory for ROI file name.\n");
      return 0;
    }
+    FREE_POINTER(cfg->roi.file_path);
+    cfg->roi.file_path = roi_file;

-    int width = 0;
-    int height = 0;
-    if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) {
-      fprintf(stderr, "Failed to read ROI size.\n");
-      fclose(f);
-      return 0;
+    // Get file extension or the substring after the last dot
+    char *maybe_extension = strrchr(cfg->roi.file_path, '.');
+    if (!maybe_extension) {
+      cfg->roi.format = UVG_ROI_TXT;
+    } else {
+      maybe_extension++;
+      int8_t format;
+      bool unknown_format = !parse_enum(maybe_extension, format_names, &format);
+      cfg->roi.format = unknown_format ? UVG_ROI_TXT : formats[format];
    }
-
-    if (width <= 0 || height <= 0) {
-      fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height);
-      fclose(f);
-      return 0;
-    }
-
-    if (width > 10000 || height > 10000) {
-      fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
-      fclose(f);
-      return 0;
-    }
-
-    const unsigned size = width * height;
-    int8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps[0]));
-    if (!dqp_array) {
-      fprintf(stderr, "Failed to allocate memory for ROI table.\n");
-      fclose(f);
-      return 0;
-    }
-
-    FREE_POINTER(cfg->roi.dqps);
-    cfg->roi.dqps   = dqp_array;
-    cfg->roi.width  = width;
-    cfg->roi.height = height;
-
-    for (int i = 0; i < size; ++i) {
-      int number; // Need a pointer to int for fscanf
-      if (fscanf(f, "%d", &number) != 1) {
-        fprintf(stderr, "Reading ROI file failed.\n");
-        fclose(f);
-        return 0;
-      }
-      dqp_array[i] = CLIP(-51, 51, number);
-    }
-
-    fclose(f);
  }
  else if OPT("set-qp-in-cu") {
    cfg->set_qp_in_cu = (bool)atobool(value);
@ -1476,6 +1448,12 @@ int uvg_config_parse(uvg_config *cfg, const char *name, const char *value)
  else if OPT("cclm") {
    cfg->cclm = (bool)atobool(value);
  }
+  else if OPT("combine-intra-cus") {
+    cfg->combine_intra_cus = atobool(value);
+  }
+  else if OPT("force-inter") {
+    cfg->force_inter = atobool(value);
+  }
  else {
    return 0;
  }
--- a/src/cli.c
+++ b/src/cli.c
@ -141,6 +141,7 @@ static const struct option long_options[] = {
  { "force-level",        required_argument, NULL, 0 },
  { "high-tier",                no_argument, NULL, 0 },
  { "me-steps",           required_argument, NULL, 0 },
+  { "roi-file",           required_argument, NULL, 0 },
  { "fast-residual-cost", required_argument, NULL, 0 },
  { "set-qp-in-cu",             no_argument, NULL, 0 },
  { "open-gop",                 no_argument, NULL, 0 },
@ -179,6 +180,10 @@ static const struct option long_options[] = {
  { "no-amvr",                  no_argument, NULL, 0 },
  { "cclm",                     no_argument, NULL, 0 },
  { "no-cclm",                  no_argument, NULL, 0 },
+  { "combine-intra-cus",        no_argument, NULL, 0 },
+  { "no-combine-intra-cus",     no_argument, NULL, 0 },
+  { "force-inter",              no_argument, NULL, 0 },
+  { "no-force-inter",           no_argument, NULL, 0 },
  {0, 0, 0, 0}
 };

@ -499,11 +504,20 @@ void print_help(void)
    "                                   - frametile: Constrain within the tile.\n"
    "                                   - frametilemargin: Constrain even more.\n"
    "      --roi <filename>       : Use a delta QP map for region of interest.\n"
-    "                               Reads an array of delta QP values from a text\n"
-    "                               file. The file format is: width and height of\n"
-    "                               the QP delta map followed by width*height delta\n"
-    "                               QP values in raster order. The map can be of any\n"
-    "                               size and will be scaled to the video size.\n"
+    "                               Reads an array of delta QP values from a file.\n"
+    "                               Text and binary files are supported and detected\n"
+    "                               from the file extension (.txt/.bin). If a known\n"
+    "                               extension is not found, the file is treated as\n"
+    "                               a text file. The file can include one or many\n"
+    "                               ROI frames each in the following format:\n"
+    "                               width and height of the QP delta map followed\n"
+    "                               by width * height delta QP values in raster\n"
+    "                               order. In binary format, width and height are\n"
+    "                               32-bit integers whereas the delta QP values are\n"
+    "                               signed 8-bit values. The map can be of any size\n"
+    "                               and will be scaled to the video size. The file\n"
+    "                               reading will loop if end of the file is reached.\n"
+    "                               See roi.txt in the examples folder.\n"
    "      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.\n"
    "                               in PPS and slice_qp_delta in slize header zero.\n"
    "      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with\n"
@ -587,6 +601,16 @@ void print_help(void)
    "      --ml-pu-depth-intra    : Predict the pu-depth-intra using machine\n"
    "                                learning trees, overrides the\n"
    "                                --pu-depth-intra parameter. [disabled]\n"
+    "      --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n"
+    "                                   on lower depth even when search is not\n"
+    "                                   performed on said depth. Should only\n"
+    "                                   be disabled if cus absolutely must not\n"
+    "                                   be larger than limited by the search.\n"
+    "                                   [enabled]\n"
+    "      --force-inter          : Force the encoder to use inter always.\n"
+    "                               This is mostly for debugging and is not\n"
+    "                               guaranteed to produce sensible bitstream or\n"
+    "                               work at all. [disabled]\n"
    "      --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n"
    "      --(no-)bipred          : Bi-prediction [disabled]\n"
    "      --cu-split-termination <string> : CU split search termination [zero]\n"
--- a/src/cu.h
+++ b/src/cu.h
@ -148,7 +148,7 @@ typedef struct
  uint8_t merge_idx   : 3; //!< \brief merge index
  uint8_t tr_skip     : 1; //!< \brief transform skip flag
  uint8_t tr_idx      : 3; //!< \brief transform index
-  uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding 
+  uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding 

  uint16_t cbf;

@ -183,6 +183,16 @@ typedef struct
  };
 } cu_info_t;

+/**
+ * \brief Coordinates and dimensions of a block.
+ *
+ * NOTE(review): presumably describes a CU in pixel units, with width/height
+ * referring to luma and chroma_width/chroma_height to the chroma planes --
+ * confirm against the users of this type. The int8_t size fields cannot hold
+ * values above 127.
+ */
+typedef struct {
+  int16_t x;
+  int16_t y;
+  int8_t width;
+  int8_t height;
+  int8_t chroma_width;
+  int8_t chroma_height;
+} cu_loc_t;
+
+
 #define CU_GET_MV_CAND(cu_info_ptr, reflist) \
  (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1)

--- a/src/encmain.c
+++ b/src/encmain.c
@ -441,6 +441,7 @@ int main(int argc, char *argv[])
  FILE *input  = NULL; //!< input file (YUV)
  FILE *output = NULL; //!< output file (HEVC NAL stream)
  FILE *recout = NULL; //!< reconstructed YUV output, --debug
+  FILE *roifile = NULL;
  clock_t start_time = clock();
  clock_t encoding_start_cpu_time;
  UVG_CLOCK_T encoding_start_real_time;
@ -587,7 +588,7 @@ int main(int argc, char *argv[])
    // Give arguments via struct to the input thread
    input_handler_args in_args = {
      .available_input_slots = available_input_slots,
-      .filled_input_slots    = filled_input_slots,
+      .filled_input_slots = filled_input_slots,

      .input = input,
      .api = api,
@ -828,6 +829,7 @@ done:
  if (input)  fclose(input);
  if (output) fclose(output);
  if (recout) fclose(recout);
+  if (roifile) fclose(roifile);

  DBG_YUVIEW_CLEANUP();
  CHECKPOINTS_FINALIZE();
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@ -56,7 +56,33 @@ void uvg_encode_ts_residual(encoder_state_t* const state,
 void uvg_encode_mvd(encoder_state_t * const state,
                    cabac_data_t *cabac,
                    int32_t mvd_hor,
-                    int32_t mvd_ver);
+                    int32_t mvd_ver,
+                    double* bits_out);
+
+double uvg_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu);
+
+int uvg_encode_inter_prediction_unit(encoder_state_t* const state,
+                                      cabac_data_t* const cabac,
+                                      const cu_info_t* const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, 
+                                      lcu_t* lcu,
+                                      double* bits_out);
+
+void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state,
+  cabac_data_t* const cabac,
+  const cu_info_t* const cur_cu,
+  int x, int y, int depth, const lcu_t* lcu, double* bits_out);
+
+
+bool uvg_write_split_flag(const encoder_state_t* const state, cabac_data_t* cabac,
+  const cu_info_t* left_cu, const cu_info_t* above_cu,
+  uint8_t split_flag,
+  int depth, int cu_width, int x, int y, double* bits_out);

 void uvg_encode_last_significant_xy(cabac_data_t * const cabac,
  uint8_t lastpos_x, uint8_t lastpos_y,
--- a/src/encoder.c
+++ b/src/encoder.c
@ -32,7 +32,6 @@

 #include "encoder.h"

-// This define is required for M_PI on Windows.
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <stdio.h>
@ -45,14 +44,6 @@
 #include "uvg_math.h"
 #include "fast_coeff_cost.h"

-/**
- * \brief Strength of QP adjustments when using adaptive QP for 360 video.
- *
- * Determined empirically.
- */
-static const double ERP_AQP_STRENGTH = 3.0;
-
-
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);

 static unsigned cfg_num_threads(void)
@ -136,22 +127,6 @@ static int get_max_parallelism(const encoder_control_t *const encoder)
 }


-/**
- * \brief Return weight for 360 degree ERP video
- *
- * Returns the scaling factor of area from equirectangular projection to
- * spherical surface.
- *
- * \param y   y-coordinate of the pixel
- * \param h   height of the picture
- */
-static double ws_weight(int y, int h)
-{
-  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
-}
-
-
-
 /**
 * \brief Update ROI QPs for 360 video with equirectangular projection.
 *
@ -162,55 +137,6 @@ static double ws_weight(int y, int h)
 * \param orig_width    width of orig_roi
 * \param orig_height   height of orig_roi
 */
-static void init_erp_aqp_roi(encoder_control_t* encoder,
-                             int8_t *orig_roi,
-                             int32_t orig_width,
-                             int32_t orig_height)
-{
-  // Update ROI with WS-PSNR delta QPs.
-  int height = encoder->in.height_in_lcu;
-  int width  = orig_roi ? orig_width : 1;
-
-  int frame_height = encoder->in.real_height;
-
-  encoder->cfg.roi.width  = width;
-  encoder->cfg.roi.height = height;
-  encoder->cfg.roi.dqps   = calloc(width * height, sizeof(orig_roi[0]));
-
-  double total_weight = 0.0;
-  for (int y = 0; y < frame_height; y++) {
-    total_weight += ws_weight(y, frame_height);
-  }
-
-  for (int y_lcu = 0; y_lcu < height; y_lcu++) {
-    int y_orig = LCU_WIDTH * y_lcu;
-    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
-
-    double lcu_weight = 0.0;
-    for (int y = y_orig; y < y_orig + lcu_height; y++) {
-      lcu_weight += ws_weight(y, frame_height);
-    }
-    // Normalize.
-    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
-
-    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
-
-    if (orig_roi) {
-      // If a ROI array already exists, we copy the existing values to the
-      // new array while adding qp_delta to each.
-      int y_roi = y_lcu * orig_height / height;
-      for (int x = 0; x < width; x++) {
-        encoder->cfg.roi.dqps[x + y_lcu * width] =
-          CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta);
-      }
-
-    } else {
-      // Otherwise, simply write qp_delta to the ROI array.
-      encoder->cfg.roi.dqps[y_lcu] = qp_delta;
-    }
-  }
-}
-

 static int8_t* derive_chroma_QP_mapping_table(const uvg_config* const cfg, int i)
 {
@ -394,6 +320,16 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
    encoder->scaling_list.use_default_list = 1;
  }

+  // ROI / delta QP
+  if (cfg->roi.file_path) {
+    const char *mode[2] = { "r", "rb" };
+    encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]);
+    if (!encoder->roi_file) {
+      fprintf(stderr, "Could not open ROI file.\n");
+      goto init_failed;
+    }
+  }
+
  if (cfg->fast_coeff_table_fn) {
    FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
    if (fast_coeff_table_f == NULL) {
@ -435,32 +371,10 @@ encoder_control_t* uvg_encoder_control_init(const uvg_config *const cfg)
    goto init_failed;
  }

-  if (cfg->erp_aqp) {
-    init_erp_aqp_roi(encoder,
-                     cfg->roi.dqps,
-                     cfg->roi.width,
-                     cfg->roi.height);
-
-  } else if (cfg->roi.dqps) {
-    // Copy delta QP array for ROI coding.
-    const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height;
-    encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0]));
-    memcpy(encoder->cfg.roi.dqps,
-           cfg->roi.dqps,
-           roi_size * sizeof(*cfg->roi.dqps));
-
-  }
-
  // NOTE: When tr_depth_inter is equal to 0, the transform is still split
  // for SMP and AMP partition units.
  encoder->tr_depth_inter = 0;

-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
-    encoder->max_qp_delta_depth = 0;
-  } else {
-    encoder->max_qp_delta_depth = -1;
-  }
-
  //Tiles
  encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                          encoder->cfg.tiles_height_count > 1;
@ -761,7 +675,7 @@ void uvg_encoder_control_free(encoder_control_t *const encoder)

  FREE_POINTER(encoder->tiles_tile_id);

-  FREE_POINTER(encoder->cfg.roi.dqps);
+  FREE_POINTER(encoder->cfg.roi.file_path);

  uvg_scalinglist_destroy(&encoder->scaling_list);

@ -773,6 +687,10 @@ void uvg_encoder_control_free(encoder_control_t *const encoder)

  uvg_close_rdcost_outfiles();

+  if (encoder->roi_file) {
+    fclose(encoder->roi_file);
+  }
+
  free(encoder);
 }

--- a/src/encoder.h
+++ b/src/encoder.h
@ -130,7 +130,7 @@ typedef struct encoder_control_t
  //! Picture weights when GOP is used.
  double gop_layer_weights[MAX_GOP_LAYERS];

-  int8_t max_qp_delta_depth;
+  FILE *roi_file;

  int tr_depth_inter;

--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -805,7 +805,7 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream,
  WRITE_U(stream, 0, 1, "pps_ref_wraparound_enabled_flag");

  WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pps_init_qp_minus26");
-  WRITE_U(stream, encoder->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");
+  WRITE_U(stream, state->frame->max_qp_delta_depth >= 0 ? 1:0, 1, "pps_cu_qp_delta_enabled_flag");

  WRITE_U(stream, 0,1, "pps_chroma_tool_offsets_present_flag");
  /* // If chroma_tool_offsets_present
@ -1037,8 +1037,8 @@ static void uvg_encoder_state_write_bitstream_picture_header(
  const int poc_lsb = state->frame->poc & ((1 << encoder->poc_lsb_bits) - 1);
  WRITE_U(stream, poc_lsb, encoder->poc_lsb_bits, "ph_pic_order_cnt_lsb");

-  if (encoder->max_qp_delta_depth >= 0) {
-    WRITE_UE(stream, encoder->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_intra_slice");
+  if (state->frame->max_qp_delta_depth >= 0) {
+    WRITE_UE(stream, state->frame->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_intra_slice");
  }

  // alf enable flags and aps IDs
@ -1118,8 +1118,8 @@ static void uvg_encoder_state_write_bitstream_picture_header(
    || state->frame->pictype == UVG_NAL_IDR_N_LP) {
  }
  else {
-    if (encoder->max_qp_delta_depth >= 0) {
-      WRITE_UE(stream, encoder->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_inter_slice");
+    if (state->frame->max_qp_delta_depth >= 0) {
+      WRITE_UE(stream, state->frame->max_qp_delta_depth, "ph_cu_qp_delta_subdiv_inter_slice");
    }
    if (state->encoder_control->cfg.tmvp_enable) {
      WRITE_U(stream, state->encoder_control->cfg.tmvp_enable, 1, "ph_pic_temporal_mvp_enabled_flag");
@ -1128,7 +1128,7 @@ static void uvg_encoder_state_write_bitstream_picture_header(
  }

  if (encoder->cfg.jccr) {
-    WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag");
+    WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag");
  }
  // END PICTURE HEADER

--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -32,6 +32,9 @@

 #include "encoderstate.h"

+ // This define is required for M_PI on Windows.
+#define _USE_MATH_DEFINES
+#include <ctype.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -53,6 +56,12 @@

 #include "strategies/strategies-picture.h"

+/**
+ * \brief Strength of QP adjustments when using adaptive QP for 360 video.
+ *
+ * Determined empirically.
+ */
+static const double ERP_AQP_STRENGTH = 3.0;

 int uvg_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
  int i;
@ -572,7 +581,7 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
  cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y);
  const int cu_width = LCU_WIDTH >> depth;

-  if (depth <= state->encoder_control->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
    *prev_qp = -1;
  }

@ -624,6 +633,38 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
  }
 }

+
+/**
+ * \brief Choose the joint Cb-Cr sign flag of the frame.
+ *
+ * When the picture has chroma, both chroma planes are high-pass filtered
+ * with a 3x3 kernel (12 at the centre, -2 on the four edge neighbours and
+ * -1 on the four corners) and the products of the filtered Cb and Cr samples
+ * are accumulated. The flag is set when that sum is negative. For
+ * UVG_CSP_400 the flag is always set.
+ *
+ * The result is written to state->frame->jccr_sign.
+ *
+ * \param state   encoder state whose frame receives the flag
+ * \param pic     picture providing the u and v planes
+ */
+static void set_joint_cb_cr_modes(encoder_state_t* state, uvg_picture* pic)
+{
+  if (state->encoder_control->chroma_format == UVG_CSP_400) {
+    // No chroma to analyse, keep the default value.
+    state->frame->jccr_sign = true;
+    return;
+  }
+
+  // Both chroma planes are addressed with half of the picture stride.
+  const int chroma_stride = pic->stride / 2;
+  const int x_end = pic->width / 2 - 1;
+  const int y_end = pic->height / 2 - 1;
+  int64_t correlation = 0;
+
+  // Rows and columns with index 0 are skipped and the loops stop before
+  // x_end and y_end, so all eight neighbours are read inside these bounds.
+  for (int row = 1; row < y_end; row++) {
+    const uvg_pixel* const cb_line = pic->u + row * chroma_stride;
+    const uvg_pixel* const cr_line = pic->v + row * chroma_stride;
+
+    for (int col = 1; col < x_end; col++) {
+      const uvg_pixel* const u = cb_line + col;
+      const uvg_pixel* const v = cr_line + col;
+
+      const int cb_edges   = (int)u[-1] + (int)u[1] + (int)u[-chroma_stride] + (int)u[chroma_stride];
+      const int cb_corners = (int)u[-1 - chroma_stride] + (int)u[1 - chroma_stride]
+                           + (int)u[-1 + chroma_stride] + (int)u[1 + chroma_stride];
+      const int cr_edges   = (int)v[-1] + (int)v[1] + (int)v[-chroma_stride] + (int)v[chroma_stride];
+      const int cr_corners = (int)v[-1 - chroma_stride] + (int)v[1 - chroma_stride]
+                           + (int)v[-1 + chroma_stride] + (int)v[1 + chroma_stride];
+
+      const int cb = 12 * (int)u[0] - 2 * cb_edges - cb_corners;
+      const int cr = 12 * (int)v[0] - 2 * cr_edges - cr_corners;
+      correlation += cb * cr;
+    }
+  }
+
+  state->frame->jccr_sign = (correlation < 0);
+}
+
 static void encoder_state_worker_encode_lcu_bitstream(void* opaque);

 static void encoder_state_worker_encode_lcu_search(void * opaque)
@ -665,7 +706,7 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)

  encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);

-  if (encoder->max_qp_delta_depth >= 0) {
+  if (state->frame->max_qp_delta_depth >= 0) {
    int last_qp = state->last_qp;
    int prev_qp = -1;
    set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
@ -716,6 +757,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
  const uint64_t existing_bits = uvg_bitstream_tell(&state->stream);

  //Encode SAO
+  state->cabac.update = 1;
  if (encoder->cfg.sao_type) {
    encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
  }
@ -771,6 +813,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
      uvg_cabac_start(&state->cabac);
    }
  }
+  state->cabac.update = 0;


  pthread_mutex_lock(&state->frame->rc_lock);
@ -1421,6 +1464,154 @@ static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64)
  }
 }

+
+/**
+ * \brief Weight of a pixel row in 360 degree ERP video
+ *
+ * Gives the factor by which area shrinks when a row of the equirectangular
+ * picture is mapped onto the sphere: the cosine of the angle of the row
+ * centre, where the picture height spans an angle of pi.
+ *
+ * \param y   y-coordinate of the pixel
+ * \param h   height of the picture
+ * \return    cos((y - h / 2 + 0.5) * pi / h)
+ */
+static double ws_weight(int y, int h)
+{
+  // Distance of the row centre from the vertical middle of the picture.
+  const double rows_from_middle = y - 0.5 * h + 0.5;
+  const double radians_per_row = M_PI / h;
+  return cos(rows_from_middle * radians_per_row);
+}
+
+
+/**
+ * \brief Update ROI QPs for 360 video with equirectangular projection.
+ *
+ * Replaces frame->roi with a new map that has one row per LCU row of the
+ * picture. Each row gets a delta QP derived from the mean ws_weight() of its
+ * pixel rows relative to the mean weight of the whole frame. If the frame
+ * already has a ROI map, the new map keeps its width and the delta is added
+ * to the vertically resampled existing values (result clipped to [-51, 51]).
+ * Otherwise the new map is a single column. The old array is freed.
+ *
+ * If the new map cannot be allocated, an error is printed and the existing
+ * ROI data is left untouched.
+ *
+ * \param encoder       encoder control
+ * \param frame         frame that will have the ROI map
+ */
+static void init_erp_aqp_roi(const encoder_control_t *encoder, uvg_picture *frame)
+{
+  int8_t *orig_roi    = frame->roi.roi_array;
+  int32_t orig_width  = frame->roi.width;
+  int32_t orig_height = frame->roi.height;
+
+  // Update ROI with WS-PSNR delta QPs.
+  int new_height = encoder->in.height_in_lcu;
+  int new_width = orig_roi ? orig_width : 1;
+  int8_t *new_array = calloc((size_t)new_width * (size_t)new_height, sizeof(orig_roi[0]));
+  if (!new_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    assert(0);
+    // Only reached when asserts are compiled out: keep the old ROI data.
+    return;
+  }
+
+  int frame_height = encoder->in.real_height;
+
+  double total_weight = 0.0;
+  for (int y = 0; y < frame_height; y++) {
+    total_weight += ws_weight(y, frame_height);
+  }
+
+  for (int y_lcu = 0; y_lcu < new_height; y_lcu++) {
+    int y_orig = LCU_WIDTH * y_lcu;
+    // The last LCU row may extend past the bottom of the picture.
+    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
+
+    double lcu_weight = 0.0;
+    for (int y = y_orig; y < y_orig + lcu_height; y++) {
+      lcu_weight += ws_weight(y, frame_height);
+    }
+    // Normalize: mean weight of this LCU row divided by the mean weight of
+    // the whole frame.
+    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
+
+    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
+
+    if (orig_roi) {
+      // If a ROI array already exists, we copy the existing values to the
+      // new array while adding qp_delta to each.
+      int y_roi = y_lcu * orig_height / new_height;
+      for (int x = 0; x < new_width; x++) {
+        new_array[x + y_lcu * new_width] =
+          CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta);
+      }
+
+    } else {
+      // Otherwise, simply write qp_delta to the ROI array.
+      new_array[y_lcu] = qp_delta;
+    }
+  }
+
+  // Update new values
+  frame->roi.width = new_width;
+  frame->roi.height = new_height;
+  frame->roi.roi_array = new_array;
+  FREE_POINTER(orig_roi);
+}
+
+
+/**
+ * \brief Read the next ROI frame from a ROI file into frame->roi.
+ *
+ * A ROI frame is the width and the height of the delta QP map followed by
+ * width * height delta QP values. In the text format every value is read
+ * with "%d" and the delta QPs are clipped to [-51, 51]. In the binary format
+ * the two dimensions are read as 4-byte integers and each delta QP as one
+ * byte, stored as read.
+ *
+ * When the stream position can be queried and the end of the file has been
+ * reached, the file is rewound so that reading loops from the beginning.
+ *
+ * A previously set frame->roi.roi_array is freed and replaced. On any error
+ * a message is printed to stderr, the file is closed and assert(0) is hit.
+ *
+ * \param frame   picture that receives the ROI map
+ * \param file    open ROI file
+ * \param format  UVG_ROI_TXT or UVG_ROI_BIN
+ */
+static void next_roi_frame_from_file(uvg_picture *frame, FILE *file, enum uvg_roi_format format) {
+  // Rewind the (seekable) ROI file when end of file is reached.
+  // Allows a single ROI frame to be used for a whole sequence
+  // and looping with --loop-input. Skips possible whitespace.
+  if (ftell(file) != -1L) {
+    int c = fgetc(file);
+    while (format == UVG_ROI_TXT && isspace(c)) c = fgetc(file);
+    ungetc(c, file);
+    if (c == EOF) rewind(file);
+  }
+
+  int *width  = &frame->roi.width;
+  int *height = &frame->roi.height;
+
+  bool failed = false;
+
+  if (format == UVG_ROI_TXT) {
+    // fscanf returns EOF, which is non-zero, on input failure. The result
+    // must therefore be compared against the number of expected conversions.
+    failed = fscanf(file, "%d", width) != 1 || fscanf(file, "%d", height) != 1;
+  } else if (format == UVG_ROI_BIN) {
+    // Read through a local buffer so that the layout of frame->roi does not
+    // matter.
+    int32_t dims[2] = { 0, 0 };
+    failed = fread(dims, sizeof(dims[0]), 2, file) != 2;
+    if (!failed) {
+      *width  = dims[0];
+      *height = dims[1];
+    }
+  }
+
+  // The returns after assert(0) are only reached when asserts are compiled
+  // out. They prevent using the closed file or a missing array.
+  if (failed) {
+    fprintf(stderr, "Failed to read ROI size.\n");
+    fclose(file);
+    assert(0);
+    return;
+  }
+
+  if (*width <= 0 || *height <= 0) {
+    fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height);
+    fclose(file);
+    assert(0);
+    return;
+  }
+
+  if (*width > 10000 || *height > 10000) {
+    fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
+    fclose(file);
+    assert(0);
+    return;
+  }
+
+  const size_t size = (size_t)(*width) * (size_t)(*height);
+  int8_t *dqp_array = calloc(size, sizeof(frame->roi.roi_array[0]));
+  if (!dqp_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    fclose(file);
+    assert(0);
+    return;
+  }
+
+  FREE_POINTER(frame->roi.roi_array);
+  frame->roi.roi_array = dqp_array;
+
+  if (format == UVG_ROI_TXT) {
+    for (size_t i = 0; i < size; ++i) {
+      int number; // Need a pointer to int for fscanf
+      if (fscanf(file, "%d", &number) != 1) {
+        fprintf(stderr, "Reading ROI file failed.\n");
+        fclose(file);
+        assert(0);
+        return;
+      }
+      dqp_array[i] = CLIP(-51, 51, number);
+    }
+  } else if (format == UVG_ROI_BIN) {
+    if (fread(dqp_array, 1, size, file) != size) {
+      fprintf(stderr, "Reading ROI file failed.\n");
+      fclose(file);
+      assert(0);
+      return;
+    }
+  }
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_picture* frame) {
  assert(state->type == ENCODER_STATE_TYPE_MAIN);

@ -1437,6 +1628,21 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict
    memset(state->tile->frame->hmvp_size, 0, sizeof(uint8_t) * state->tile->frame->height_in_lcu);
  }

+  // ROI / delta QP maps
+  if (frame->roi.roi_array && cfg->roi.file_path) {
+    assert(0 && "Conflict: Other ROI data was supplied when a ROI file was specified.");
+  }
+
+  // Read frame from the file. If no file is specified,
+  // ROI data should be already set by the application.
+  if (cfg->roi.file_path) {
+    next_roi_frame_from_file(frame, state->encoder_control->roi_file, cfg->roi.format);
+  }
+  
+  if (cfg->erp_aqp) {
+    init_erp_aqp_roi(state->encoder_control, state->tile->frame->source);
+  }
+
  // Variance adaptive quantization
  if (cfg->vaq) {
    const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
@ -1523,6 +1729,12 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, uvg_pict
  }
  // Variance adaptive quantization - END

+  if (cfg->target_bitrate > 0 || frame->roi.roi_array || cfg->set_qp_in_cu || cfg->vaq) {
+    state->frame->max_qp_delta_depth = 0;
+  } else {
+    state->frame->max_qp_delta_depth = -1;
+  }
+
  // Use this flag to handle closed gop irap picture selection.
  // If set to true, irap is already set and we avoid
  // setting it based on the intra period
@ -1689,6 +1901,7 @@ void uvg_encode_one_frame(encoder_state_t * const state, uvg_picture* frame)


  encoder_state_init_new_frame(state, frame);
+  if(state->encoder_control->cfg.jccr) set_joint_cb_cr_modes(state, frame);
  
  // Create a separate job for ALF done after everything else, and only then do final bitstream writing (for ALF parameters)
  if (state->encoder_control->cfg.alf_type && state->encoder_control->cfg.wpp) {
@ -1834,10 +2047,9 @@ lcu_stats_t* uvg_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y)

 int uvg_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp)
 {
-  const encoder_control_t *ctrl = state->encoder_control;
  const cu_array_t *cua = state->tile->frame->cu_array;
  // Quantization group width
-  const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth);
+  const int qg_width = LCU_WIDTH >> MIN(state->frame->max_qp_delta_depth, uvg_cu_array_at_const(cua, x, y)->depth);

  // Coordinates of the top-left corner of the quantization group
  const int x_qg = x & ~(qg_width - 1);
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@ -179,6 +179,8 @@ typedef struct encoder_state_config_frame_t {
  */
  double *aq_offsets;

+  int8_t max_qp_delta_depth;
+
  /**
   * \brief Whether next NAL is the first NAL in the access unit.
   */
@ -193,6 +195,7 @@ typedef struct encoder_state_config_frame_t {

  cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row
  uint8_t* hmvp_size; //!< \brief HMVP LUT size
+  bool jccr_sign; 

 } encoder_state_config_frame_t;

@ -320,6 +323,7 @@ typedef struct encoder_state_t {
  
  bitstream_t stream;
  cabac_data_t cabac;
+  cabac_data_t search_cabac;

  uint32_t stats_bitstream_length; //Bitstream length written in bytes

@ -402,10 +406,10 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
 */
 static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) return false;
+  if (state->frame->max_qp_delta_depth < 0) return false;

  const int cu_width = LCU_WIDTH >> depth;
-  const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth;
+  const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
  const int right  = x + cu_width;
  const int bottom = y + cu_width;
  return (right % qg_width == 0 || right >= state->tile->frame->width) &&
--- a/src/fast_coeff_cost.c
+++ b/src/fast_coeff_cost.c
@ -40,7 +40,7 @@ static uint16_t to_q88(float f)
  return (uint16_t)(f * 256.0f + 0.5f);
 }

-static uint64_t to_4xq88(const float f[4])
+static uint64_t to_4xq88(const double f[4])
 {
  int i;
  uint64_t result = 0;
@ -58,9 +58,9 @@ int uvg_fast_coeff_table_parse(fast_coeff_table_t *fast_coeff_table, FILE *fast_
  uint64_t *wts_by_qp = fast_coeff_table->wts_by_qp;

  for (i = 0; i < MAX_FAST_COEFF_COST_QP; i++) {
-    float curr_wts[4];
+    double curr_wts[4];

-    if (fscanf(fast_coeff_table_f, "%f %f %f %f\n", curr_wts + 0,
+    if (fscanf(fast_coeff_table_f, "%lf %lf %lf %lf\n", curr_wts + 0,
                                                    curr_wts + 1,
                                                    curr_wts + 2,
                                                    curr_wts + 3) != 4) {
--- a/src/fast_coeff_cost.h
+++ b/src/fast_coeff_cost.h
@ -45,7 +45,7 @@ typedef struct {

 // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from
 // 0 to MAX_FAST_COEFF_COST_QP
-static const float default_fast_coeff_cost_wts[][4] = {
+static const double default_fast_coeff_cost_wts[][4] = {
  // Just extend it by stretching the first actual values..
  {0.164240f, 4.161530f, 3.509033f, 6.928047f},
  {0.164240f, 4.161530f, 3.509033f, 6.928047f},
--- a/src/filter.c
+++ b/src/filter.c
@ -339,7 +339,7 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir)

 static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
 {
-  if (state->encoder_control->max_qp_delta_depth < 0) {
+  if (state->frame->max_qp_delta_depth < 0) {
    return state->qp;
  }

--- a/src/image.c
+++ b/src/image.c
@ -106,6 +106,10 @@ uvg_picture * uvg_image_alloc(enum uvg_chroma_format chroma_format, const int32_

  im->interlacing = UVG_INTERLACING_NONE;

+  im->roi.roi_array = NULL;
+  im->roi.width = 0;
+  im->roi.height = 0;
+
  return im;
 }

@ -132,6 +136,7 @@ void uvg_image_free(uvg_picture *const im)
    uvg_image_free(im->base_image);
  } else {
    free(im->fulldata_buf);
+    if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array);
  }

  // Make sure freed data won't be used.
@ -192,6 +197,8 @@ uvg_picture *uvg_image_make_subimage(uvg_picture *const orig_image,
  im->pts = 0;
  im->dts = 0;

+  im->roi = orig_image->roi;
+
  return im;
 }

--- a/src/inter.c
+++ b/src/inter.c
@ -624,7 +624,9 @@ void uvg_inter_pred_pu(const encoder_state_t * const state,
                       int i_pu)

 {
-  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y));
+  const int x_scu = SUB_SCU(x);
+  const int y_scu = SUB_SCU(y);
+  cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu);
  const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu);
  const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu);
  const int pu_w = PU_GET_W(cu->part_size, width, i_pu);
@ -673,6 +675,12 @@ void uvg_inter_pred_pu(const encoder_state_t * const state,
      NULL,
      predict_luma, predict_chroma);
  }
+
+  if (predict_chroma && state->encoder_control->cfg.jccr) {
+    const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C;
+    uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+    uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C);
+  }
 }

 /**
@ -1290,7 +1298,7 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state,
                                        int32_t width,
                                        int32_t height,
                                        const merge_candidates_t *merge_cand,
-                                        const cu_info_t *cur_cu,
+                                        const cu_info_t * const cur_cu,
                                        int8_t reflist,
                                        mv_t mv_cand[2][2])
 {
@ -1396,7 +1404,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state,
                           int32_t width,
                           int32_t height,
                           mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t  * const cur_cu,
                           lcu_t *lcu,
                           int8_t reflist)
 {
--- a/src/inter.h
+++ b/src/inter.h
@ -96,7 +96,7 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state,
                           int32_t width,
                           int32_t height,
                           mv_t mv_cand[2][2],
-                           cu_info_t* cur_cu,
+                           const cu_info_t* cur_cu,
                           lcu_t *lcu,
                           int8_t reflist);

--- a/src/intra.c
+++ b/src/intra.c
@ -82,6 +82,17 @@ static const uint8_t num_ref_pixels_left[16][16] = {
  { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
 };

+
+// Forward declaration of the matrix weighted intra prediction function.
+// The definition is further down in this file.
+static void mip_predict(
+  const encoder_state_t* const state,
+  const uvg_intra_references* const refs,
+  const uint16_t pred_block_width,
+  const uint16_t pred_block_height,
+  uvg_pixel* dst,
+  const int mip_mode,
+  const bool mip_transp);
+
+
 int8_t uvg_intra_get_dir_luma_predictor(
  const uint32_t x,
  const uint32_t y,
@ -452,7 +463,7 @@ static void get_cclm_parameters(
  }
 }

-static void linear_transform_cclm(cclm_parameters_t* cclm_params, uvg_pixel * src, uvg_pixel * dst, int stride, int height) {
+static void linear_transform_cclm(const cclm_parameters_t* cclm_params, uvg_pixel * src, uvg_pixel * dst, int stride, int height) {
  int scale = cclm_params->a;
  int shift = cclm_params->shift;
  int offset = cclm_params->b;
@ -468,7 +479,7 @@ static void linear_transform_cclm(cclm_parameters_t* cclm_params, uvg_pixel * sr
 }


-void uvg_predict_cclm(
+static void predict_cclm(
  encoder_state_t const* const state,
  const color_t color,
  const int8_t width,
@ -477,7 +488,7 @@ void uvg_predict_cclm(
  const int16_t y0,
  const int16_t stride,
  const int8_t mode,
-  lcu_t* const lcu,
+  const lcu_t* const lcu,
  uvg_intra_references* chroma_ref,
  uvg_pixel* dst,
  cclm_parameters_t* cclm_params
@ -498,6 +509,7 @@ void uvg_predict_cclm(


  uvg_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;
+  const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA);

  // Essentially what this does is that it uses 6-tap filtering to downsample
  // the luma intra references down to match the resolution of the chroma channel.
@ -508,12 +520,12 @@ void uvg_predict_cclm(
  if (y0) {
    for (; available_above_right < width / 2; available_above_right++) {
      int x_extension = x_scu + width * 2 + 4 * available_above_right;
-      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
+      const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
      if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
    }
    if(y_scu == 0) {
      if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
-      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(uvg_pixel) * (width + available_above_right * 2));
+      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride2 / 2)], sizeof(uvg_pixel) * (width + available_above_right * 2));
    }
    else {
      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
@ -533,16 +545,16 @@ void uvg_predict_cclm(
  if(x0) {
    for (; available_left_below < height / 2; available_left_below++) {
      int y_extension = y_scu + height * 2 + 4 * available_left_below;
-      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
+      const cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
      if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
      if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
    }
    for(int i = 0; i < height + available_left_below * 2; i++) {
-      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1];
+      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride2/2) + x0 / 2 - 1];
    }    
  }

-  uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width);
+  uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride2) / 4], sampled_luma, width, height, stride2 / 2, width);

  int16_t a, b, shift;
  get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
@ -727,12 +739,17 @@ void uvg_mip_pred_upsampling_1D(int* const dst, const int* const src, const int*
 }


+
 /** \brief Matrix weighted intra prediction.
 */
-void uvg_mip_predict(encoder_state_t const* const state, uvg_intra_references* const refs,
-                     const uint16_t pred_block_width, const uint16_t pred_block_height,
-                     uvg_pixel* dst,
-                     const int mip_mode, const bool mip_transp)
+static void mip_predict(
+  const encoder_state_t* const state,
+  const uvg_intra_references* const refs,
+  const uint16_t pred_block_width,
+  const uint16_t pred_block_height,
+  uvg_pixel* dst,
+  const int mip_mode,
+  const bool mip_transp)
 {
  // MIP prediction uses int values instead of uvg_pixel as some temp values may be negative
  
@ -875,14 +892,13 @@ void uvg_mip_predict(encoder_state_t const* const state, uvg_intra_references* c
 }


-void uvg_intra_predict(
-  encoder_state_t *const state,
+static void intra_predict_regular(
+  const encoder_state_t* const state,
  uvg_intra_references *refs,
  int_fast8_t log2_width,
  int_fast8_t mode,
  color_t color,
  uvg_pixel *dst,
-  bool filter_boundary,
  const uint8_t multi_ref_idx)
 {
  const int_fast8_t width = 1 << log2_width;
@ -1350,18 +1366,66 @@ void uvg_intra_build_reference(
  }
 }

+
+/**
+ * \brief Generate the intra prediction of one colour component of a block.
+ *
+ * Dispatches to MIP, regular intra prediction or CCLM based on the mode
+ * stored in data->pred_cu. Luma uses intra.mode and chroma uses
+ * intra.mode_chroma.
+ *
+ * \param state   encoder state
+ * \param refs    reference pixels of the block
+ * \param cu_loc  position and luma/chroma dimensions of the block
+ * \param color   colour component to predict
+ * \param dst     destination buffer for the prediction
+ * \param data    search data holding the prediction parameters
+ * \param lcu     containing LCU, passed on to the CCLM prediction
+ */
+void uvg_intra_predict(
+  const encoder_state_t* const state,
+  uvg_intra_references* const refs,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
+  uvg_pixel* dst,
+  const intra_search_data_t* data,
+  const lcu_t* lcu
+  )
+{
+  // TODO: what is this used for?
+  // const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
+  const bool is_luma = color == COLOR_Y;
+  const int width  = is_luma ? cu_loc->width  : cu_loc->chroma_width;
+  const int height = is_luma ? cu_loc->height : cu_loc->chroma_height;
+  int8_t intra_mode = is_luma ? data->pred_cu.intra.mode : data->pred_cu.intra.mode_chroma;
+
+  // MIP is always used for luma, but for chroma only with the 444 chroma
+  // format. Chroma that cannot use MIP is predicted with mode 0 instead.
+  bool use_mip = false;
+  if (data->pred_cu.intra.mip_flag) {
+    use_mip = is_luma || state->encoder_control->chroma_format == UVG_CSP_444;
+    if (!use_mip) {
+      intra_mode = 0;
+    }
+  }
+
+  if (intra_mode < 68) {
+    if (use_mip) {
+      assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
+      mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
+    }
+    else {
+      intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx);
+    }
+    return;
+  }
+
+  // Modes from 68 upwards take the CCLM path, which starts from the
+  // cclm_luma_rec buffer of the frame.
+  const int stride = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA);
+  const int x = cu_loc->x;
+  const int y = cu_loc->y;
+  const cclm_parameters_t* const params = &data->cclm_parameters[color == COLOR_U ? 0 : 1];
+
+  uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width);
+
+  const bool derive_params = data->pred_cu.depth != data->pred_cu.tr_depth || params->b <= 0;
+  if (derive_params) {
+    // The parameters are passed as a non-const output argument.
+    predict_cclm(
+      state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst,
+      (cclm_parameters_t*)params);
+  }
+  else {
+    linear_transform_cclm(params, dst, dst, width, width);
+  }
+}
+
+
 static void intra_recon_tb_leaf(
-  encoder_state_t *const state,
+  encoder_state_t* const state,
  int x,
  int y,
  int depth,
-  int8_t intra_mode,
-  cclm_parameters_t *cclm_params,
  lcu_t *lcu,
  color_t color,
-  uint8_t multi_ref_idx,
-  bool mip_flag,
-  bool mip_transp)
+  const intra_search_data_t* search_data)
 {
  const uvg_config *cfg = &state->encoder_control->cfg;
  const int shift = color == COLOR_Y ? 0 : 1;
@ -1383,7 +1447,7 @@ static void intra_recon_tb_leaf(
  int x_scu = SUB_SCU(x);
  int y_scu = SUB_SCU(y);
  const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };
-  uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0;
+  uint8_t multi_ref_index = color == COLOR_Y ? search_data->pred_cu.intra.multi_ref_idx: 0;

  uvg_intra_references refs;
  // Extra reference lines for use with MRL. Extra lines needed only for left edge.
@ -1406,42 +1470,14 @@ static void intra_recon_tb_leaf(
  uvg_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index);

  uvg_pixel pred[32 * 32];
-  int stride = state->tile->frame->source->stride;
-  const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
-  bool use_mip = false;
-  if (mip_flag) {
-    if (color == COLOR_Y) {
-      use_mip = true;
-    } else {
-      // MIP can be used for chroma if the chroma scheme is 444
-      if (state->encoder_control->chroma_format == UVG_CSP_444) {
-        use_mip = true;
-      } else {
-        // If MIP cannot be used for chroma, set mode to planar
-        intra_mode = 0;
-      }
-    }
-  }

-  if(intra_mode < 68) {
-    if (use_mip) {
-      assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
-      uvg_mip_predict(state, &refs, width, height, pred, intra_mode, mip_transp);
-    }
-    else {
-      uvg_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index);
-    }
-  } else {
-    uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width);
-    if(cclm_params == NULL) {
-      cclm_parameters_t temp_params;
-      uvg_predict_cclm(
-        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
-    }
-    else {
-      linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
-    }
-  }
+  cu_loc_t loc = {
+    x, y,
+    width, height,
+    width, height,
+  };
+
+  uvg_intra_predict(state, &refs, &loc, color, pred, search_data, lcu);

  const int index = lcu_px.x + lcu_px.y * lcu_width;
  uvg_pixel *block = NULL;
@ -1483,17 +1519,12 @@ static void intra_recon_tb_leaf(
 * \param lcu           containing LCU
 */
 void uvg_intra_recon_cu(
-  encoder_state_t *const state,
+  encoder_state_t* const state,
  int x,
  int y,
  int depth,
-  int8_t mode_luma,
-  int8_t mode_chroma,
+  intra_search_data_t* search_data,
  cu_info_t *cur_cu,
-  cclm_parameters_t *cclm_params,
-  uint8_t multi_ref_idx,
-  bool mip_flag,
-  bool mip_transp,
  lcu_t *lcu)
 {
  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
@ -1501,12 +1532,16 @@ void uvg_intra_recon_cu(
  if (cur_cu == NULL) {
    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
  }
-  uint8_t multi_ref_index = multi_ref_idx;
-  bool use_mip = mip_flag;
-  bool mip_transposed = mip_transp;
+  const int8_t mode_luma = search_data->pred_cu.intra.mode;
+  const int8_t mode_chroma= search_data->pred_cu.intra.mode_chroma;
+
+  if(mode_chroma != -1 && mode_luma == -1) {
+    x &= ~7;
+    y &= ~7;
+  }
  
  if (mode_luma != -1 && mode_chroma != -1) {
-    if (use_mip) {
+    if (search_data->pred_cu.intra.mip_flag) {
      assert(mode_luma == mode_chroma && "Chroma mode must be derived from luma mode if block uses MIP.");
    }
  }
@ -1527,10 +1562,10 @@ void uvg_intra_recon_cu(
    const int32_t x2 = x + offset;
    const int32_t y2 = y + offset;

-    uvg_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
-    uvg_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
-    uvg_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
-    uvg_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
+    uvg_intra_recon_cu(state, x,   y,   depth + 1, search_data, NULL, lcu);
+    uvg_intra_recon_cu(state, x2,  y,   depth + 1, search_data, NULL, lcu);
+    uvg_intra_recon_cu(state, x,   y2,  depth + 1, search_data, NULL, lcu);
+    uvg_intra_recon_cu(state, x2,  y2,  depth + 1, search_data, NULL, lcu);

    // Propagate coded block flags from child CUs to parent CU.
    uint16_t child_cbfs[3] = {
@ -1552,13 +1587,15 @@ void uvg_intra_recon_cu(
   
    // Process a leaf TU.
    if (has_luma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_Y, search_data);
    }
    if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_U, search_data);
+      intra_recon_tb_leaf(state, x, y, depth, lcu, COLOR_V, search_data);
    }

-    uvg_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
+    uvg_quantize_lcu_residual(state, has_luma, has_chroma && !(search_data->pred_cu.joint_cb_cr & 3),
+      search_data->pred_cu.joint_cb_cr != 4 && state->encoder_control->cfg.jccr && (x % 8 == 0 && y % 8 == 0),
+      x, y, depth, cur_cu, lcu, false);
  }
 }
--- a/src/intra.h
+++ b/src/intra.h
@ -63,6 +63,18 @@ typedef struct
  int16_t b;
 } cclm_parameters_t;

+/**
+ * \brief Data describing one intra prediction candidate. Passed to
+ *        uvg_intra_predict and uvg_intra_recon_cu.
+ */
+typedef struct {
+  cu_info_t pred_cu; //!< \brief CU info; uvg_intra_predict reads the intra modes, MIP flags and multi_ref_idx from here
+  cclm_parameters_t cclm_parameters[2]; //!< \brief CCLM parameters: index 0 is used for COLOR_U, index 1 for the other chroma component
+  double cost; //!< NOTE(review): presumably the total cost of the candidate; confirm against the search code
+  double bits; //!< NOTE(review): presumably the estimated bit count of the candidate; confirm against the search code
+  double coeff_bits; //!< NOTE(review): presumably the bits spent on coefficients; confirm against the search code
+  double distortion; //!< NOTE(review): presumably the distortion of the candidate; confirm against the search code
+} intra_search_data_t ;
+
+
+#define UVG_NUM_INTRA_MODES 67
+
 /**
 * \brief Function for deriving intra luma predictions
 * \param x          x-coordinate of the PU in pixels
@ -114,53 +126,22 @@ void uvg_intra_build_reference(
 * \param filter_boundary Whether to filter the boundary on modes 10 and 26.
 */
 void uvg_intra_predict(
-  encoder_state_t *const state,
-  uvg_intra_references *refs,
-  int_fast8_t log2_width,
-  int_fast8_t mode,
-  color_t color,
-  uvg_pixel *dst,
-  bool filter_boundary,
-  const uint8_t multi_ref_idx);
+  const encoder_state_t* const state,
+  uvg_intra_references* const refs,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
+  uvg_pixel* dst,
+  const intra_search_data_t* data,
+  const lcu_t* lcu
+);

 void uvg_intra_recon_cu(
-  encoder_state_t *const state,
+  encoder_state_t* const state,
  int x,
  int y,
  int depth,
-  int8_t mode_luma,
-  int8_t mode_chroma,
+  intra_search_data_t* search_data,
  cu_info_t *cur_cu,
-  cclm_parameters_t* cclm_params,
-  uint8_t multi_ref_idx,
-  bool mip_flag,
-  bool mip_transp,
  lcu_t *lcu);

-
-void uvg_predict_cclm(
-  encoder_state_t const* const state,
-  const color_t color,
-  const int8_t width,
-  const int8_t height,
-  const int16_t x0,
-  const int16_t y0,
-  const int16_t stride,
-  const int8_t mode,
-  lcu_t* const lcu,
-  uvg_intra_references* chroma_ref,
-  uvg_pixel* dst,
-  cclm_parameters_t* cclm_params
-);
-
 int uvg_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a);
-
-void uvg_mip_predict(
-  encoder_state_t const * const state,
-  uvg_intra_references * refs,
-  const uint16_t width,
-  const uint16_t height,
-  uvg_pixel* dst,
-  const int mip_mode,
-  const bool mip_transp
-);
--- a/src/rate_control.c
+++ b/src/rate_control.c
@ -1088,17 +1088,20 @@ void uvg_set_lcu_lambda_and_qp(encoder_state_t * const state,
  const encoder_control_t * const ctrl = state->encoder_control;
  lcu_stats_t *lcu = uvg_get_lcu_stats(state, pos.x, pos.y);

-  if (ctrl->cfg.roi.dqps != NULL) {
-    vector2d_t lcu = {
+  if (state->tile->frame->source->roi.roi_array) {
+    vector2d_t lcu_vec = {
      pos.x + state->tile->lcu_offset_x,
      pos.y + state->tile->lcu_offset_y
    };
    vector2d_t roi = {
-      lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu,
-      lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu
+      lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu,
+      lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu
    };
-    int roi_index = roi.x + roi.y * ctrl->cfg.roi.width;
-    int dqp = ctrl->cfg.roi.dqps[roi_index];
+    int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width;
+    int dqp = state->tile->frame->source->roi.roi_array[roi_index];
+    if(dqp != 0) {
+      pos.x = 0;
+    }
    state->qp = CLIP_TO_QP(state->frame->QP + dqp);
    state->lambda = qp_to_lambda(state, state->qp);
    state->lambda_sqrt = sqrt(state->lambda);
--- a/src/rdo.c
+++ b/src/rdo.c
@ -315,12 +315,12 @@ static INLINE uint32_t get_coeff_cabac_cost(
  // Take a copy of the CABAC so that we don't overwrite the contexts when
  // counting the bits.
  cabac_data_t cabac_copy;
-  memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));

  // Clear bytes and bits and set mode to "count"
  cabac_copy.only_count = 1;
-  cabac_copy.num_buffered_bytes = 0;
-  cabac_copy.bits_left = 23;
+  int num_buffered_bytes = cabac_copy.num_buffered_bytes;
+  int bits_left = cabac_copy.bits_left;

  // Execute the coding function.
  // It is safe to drop the const modifier since state won't be modified
@ -343,8 +343,10 @@ static INLINE uint32_t get_coeff_cabac_cost(
      type,
      scan_mode);
  }
-
-  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
+  if(cabac_copy.update) {
+    memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy));
+  }
+  return (bits_left - cabac_copy.bits_left) + ((cabac_copy.num_buffered_bytes - num_buffered_bytes) << 3);
 }

 static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
@ -1741,37 +1743,33 @@ void uvg_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
 /**
 * Calculate cost of actual motion vectors using CABAC coding
 */
-uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       const int32_t mvd_hor,
-                                       const int32_t mvd_ver)
+double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     const int32_t mvd_hor,
+                                     const int32_t mvd_ver)
 {
  cabac_data_t cabac_copy = *cabac;
  cabac_copy.only_count = 1;
-
+  double bits = 0;  // bit estimate, handed to uvg_encode_mvd() by pointer below
  // It is safe to drop const here because cabac->only_count is set.
-  uvg_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
+  uvg_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits);  // runs on the local copy, not on *cabac

-  uint32_t bitcost =
-    ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
-    ((23 - cabac->bits_left)     + (cabac->num_buffered_bytes << 3));
-
-  return bitcost;
+  return bits;  // returned as a double; presumably fractional -- confirm in uvg_encode_mvd()
 }

 /** MVD cost calculation with CABAC
 * \returns the bit cost scaled by state->lambda_sqrt, as a double
 * Calculates Motion Vector cost and related costs using CABAC coding
 */
-uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
-                                 int x,
-                                 int y,
-                                 int mv_shift,
-                                 mv_t mv_cand[2][2],
-                                 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                                 int16_t num_cand,
-                                 int32_t ref_idx,
-                                 uint32_t *bitcost)
+double uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
+                               int x,
+                               int y,
+                               int mv_shift,
+                               mv_t mv_cand[2][2],
+                               inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                               int16_t num_cand,
+                               int32_t ref_idx,
+                               double* bitcost)
 {
  cabac_data_t state_cabac_copy;
  cabac_data_t* cabac;
@ -1798,14 +1796,13 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
  }

  // Store cabac state and contexts
-  memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
+  memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t));

  // Clear bytes and bits and set mode to "count"
  state_cabac_copy.only_count = 1;
-  state_cabac_copy.num_buffered_bytes = 0;
-  state_cabac_copy.bits_left = 23;

  cabac = &state_cabac_copy;
+  double bits = 0;

  if (!merged) {
    vector2d_t mvd1 = {
@ -1820,8 +1817,8 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
    uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd1);
    uvg_change_precision_vector2d(INTERNAL_MV_PREC, 2, &mvd2);

-    uint32_t cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
-    uint32_t cand2_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
+    double cand1_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
+    double cand2_cost = uvg_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);

    // Select candidate 1 if it has lower cost
    if (cand2_cost < cand1_cost) {
@ -1834,7 +1831,7 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,

  cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);

-  CABAC_BIN(cabac, merged, "MergeFlag");
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag");
  num_cand = state->encoder_control->cfg.max_merge;
  if (merged) {
    if (num_cand > 1) {
@ -1842,10 +1839,10 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
      for (ui = 0; ui < num_cand - 1; ui++) {
        int32_t symbol = (ui != merge_idx);
        if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
        } else {
          CABAC_BIN_EP(cabac, symbol, "MergeIndex");
+          bits += 1;
        }
        if (symbol == 0) break;
      }
@ -1869,23 +1866,22 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
          // parseRefFrmIdx
          int32_t ref_frame = ref_idx;
          
-          cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-          CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");

          if (ref_frame > 0) {
            int32_t i;
            int32_t ref_num = ref_list[ref_list_idx] - 2;
            
-            cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
            ref_frame--;

            for (i = 0; i < ref_num; ++i) {
              const uint32_t symbol = (i == ref_frame) ? 0 : 1;

              if (i == 0) {
-                CABAC_BIN(cabac, symbol, "ref_idx_lX");
+                CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[1]), symbol, bits, "ref_idx_lX");
              } else {
                CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
+                bits += 1;
              }
              if (symbol == 0) break;
            }
@ -1895,7 +1891,7 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
        // ToDo: Bidir vector support
        if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) {
          // It is safe to drop const here because cabac->only_count is set.
-          uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y);
+          uvg_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y, &bits);
        }

        // Signal which candidate MV to use
@ -1905,10 +1901,10 @@ uint32_t uvg_calc_mvd_cost_cabac(const encoder_state_t * state,
    }
  }

-  *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
+  *bitcost = bits;

  // Store bitcost before restoring cabac
-  return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
+  return *bitcost * state->lambda_sqrt;
 }

 void uvg_close_rdcost_outfiles(void)
--- a/src/rdo.h
+++ b/src/rdo.h
@ -77,10 +77,10 @@ uint32_t uvg_get_coded_level(encoder_state_t * state, double* coded_cost, double

 uvg_mvd_cost_func uvg_calc_mvd_cost_cabac;

-uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       int32_t mvd_hor,
-                                       int32_t mvd_ver);
+double uvg_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     int32_t mvd_hor,
+                                     int32_t mvd_ver);

 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
@ -90,8 +90,5 @@ uint32_t uvg_get_mvd_coding_cost_cabac(const encoder_state_t *state,
 extern const uint32_t uvg_entropy_bits[512];
 #define CTX_ENTROPY_BITS(ctx, val) uvg_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]

-// Floating point fractional bits, derived from uvg_entropy_bits
-extern const float uvg_f_entropy_bits[512];
-#define CTX_ENTROPY_FBITS(ctx, val) uvg_f_entropy_bits[(CTX_STATE(ctx)<<1) ^ (val)]

 #endif
--- a/src/sao.c
+++ b/src/sao.c
@ -49,63 +49,64 @@ static void init_sao_info(sao_info_t *sao) {
 }


-static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
+static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;  // accumulated by the CABAC_FBITS_UPDATE calls below
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;  // const dropped; presumably CABAC_FBITS_UPDATE may modify the context -- confirm
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  if (sao_left != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");  // merge flag = 0, counted only when a left candidate exists
  }
  if (sao_top != NULL) {    
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");  // merge flag = 0, counted only when a top candidate exists
  }

  // TR coded type_idx_, none = 0
  ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+  CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type");  // type bin 0, i.e. "none" per the comment above

  return mode_bits;
 }

-static float sao_mode_bits_merge(const encoder_state_t * const state,
+static double sao_mode_bits_merge(const encoder_state_t * const state,
                                 int8_t merge_cand) {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;  // accumulated by the CABAC_FBITS_UPDATE calls below
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;  // const dropped; presumably CABAC_FBITS_UPDATE may modify the context -- confirm
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  ctx = &(cabac->ctx.sao_merge_flag_model);

-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag");  // first flag: 1 when candidate 1 is chosen
  if (merge_cand == 1) return mode_bits;
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag");  // second flag, counted only when the first was 0
  return mode_bits;
 }


-static float sao_mode_bits_edge(const encoder_state_t * const state,
+static double sao_mode_bits_edge(const encoder_state_t * const state,
                              int edge_class, int offsets[NUM_SAO_EDGE_CATEGORIES],
                              sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  if (sao_left != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }
  if (sao_top != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }

  // TR coded type_idx_, edge = 2 = cMax
  ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;

  // TR coded offsets.
  for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) {
@ -126,26 +127,27 @@ static float sao_mode_bits_edge(const encoder_state_t * const state,
 }


-static float sao_mode_bits_band(const encoder_state_t * const state,
+static double sao_mode_bits_band(const encoder_state_t * const state,
                              int band_position[2], int offsets[10],
                              sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
  // FL coded merges.
  if (sao_left != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }
  if (sao_top != NULL) {
    ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
  }

  // TR coded sao_type_idx_, band = 1
  ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;

  // TR coded offsets and possible FL coded offset signs.
  for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++)
@ -552,7 +554,8 @@ static void sao_search_best_mode(const encoder_state_t * const state, const uvg_
  // Choose between SAO and doing nothing, taking into account the
  // rate-distortion cost of coding do nothing.
  {
-    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5);
+    float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left);
+    int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5);
    if (sao_out->ddistortion >= cost_of_nothing) {
      sao_out->type = SAO_TYPE_NONE;
      merge_cost[0] = cost_of_nothing;
--- a/src/search.c
+++ b/src/search.c
@ -37,6 +37,7 @@

 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "imagelist.h"
 #include "inter.h"
 #include "intra.h"
@ -59,14 +60,6 @@
 // Cost threshold for doing intra search in inter frames with --rd=0.
 static const int INTRA_THRESHOLD = 8;

-// Modify weight of luma SSD.
-#ifndef LUMA_MULT
-# define LUMA_MULT 0.8
-#endif
-// Modify weight of chroma SSD.
-#ifndef CHROMA_MULT
-# define CHROMA_MULT 1.5
-#endif

 static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
@ -225,16 +218,16 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
  const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);

  double ssd = 0.0;
-  ssd += LUMA_MULT * uvg_pixels_calc_ssd(
+  ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
    &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
    LCU_WIDTH, LCU_WIDTH, cu_width
    );
  if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
-    ssd += CHROMA_MULT * uvg_pixels_calc_ssd(
+    ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
      &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
      LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
      );
-    ssd += CHROMA_MULT * uvg_pixels_calc_ssd(
+    ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
      &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
      LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
      );
@ -251,7 +244,8 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
  int x_scu = SUB_SCU(x);
  int y_scu = SUB_SCU(y);
  y_rec += x_scu + y_scu * LCU_WIDTH;
-  int stride = state->tile->frame->source->stride;
+  const int stride = state->tile->frame->rec->stride;
+  const int stride2 = (((state->tile->frame->width + 7) & ~7) + FRAME_PADDING_LUMA);

  for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) {
    for (int x_ = 0; x_ < width; x_++) {
@ -265,13 +259,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
      s += y_rec[2 * x_ + LCU_WIDTH] * 2;
      s += y_rec[2 * x_ + 1 + LCU_WIDTH];
      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
-      int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2;
+      int index = x / 2 + x_ + (y / 2 + y_ )* stride2 / 2;
      state->tile->frame->cclm_luma_rec[index] = s >> 3;
    }
    y_rec += LCU_WIDTH * 2;
  }
  if((y + height * 2) % 64 == 0) {
-    int line = y / 64 * stride / 2;
+    int line = y / 64 * stride2 / 2;
    y_rec -= LCU_WIDTH;
    for (int i = 0; i < width; ++i) {
      int s = 2;
@ -294,11 +288,13 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
 * prediction unit data needs to be coded.
 */
 double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu)
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu)
 {
  const int width = LCU_WIDTH >> depth;
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;

  // cur_cu is used for TU parameters.
  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
@ -324,14 +320,36 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
    return sum + tr_tree_bits * state->lambda;
  }

+
+  if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) {
+    // Because these need to be coded before the luma cbf they also need to be counted
+    // before the cabac state changes. However, since this branch is only executed when
+    // calculating the last RD cost it is not problem to include the chroma cbf costs in
+    // luma, because the chroma cost is calculated right after the luma cost.
+    // However, if we have different tr_depth, the bits cannot be written in correct
+    // order anyways so do not touch the chroma cbf here.
+    if (state->encoder_control->chroma_format != UVG_CSP_400) {
+      cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+      cabac->cur_ctx = cr_ctx;
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
+      cr_ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
+    }
+  }
+
  // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
  if (pred_cu->type == CU_INTRA ||
-      tr_depth > 0 ||
+      is_tr_split ||
      cbf_is_set(tr_cu->cbf, depth, COLOR_U) ||
      cbf_is_set(tr_cu->cbf, depth, COLOR_V))
  {
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[0]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y));
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[0]);
+    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
  }

  // SSD between reconstruction and original
@ -343,7 +361,8 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
                                        width);
  }

-  {
+
+  if (!skip_residual_coding) {
    int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
    const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];

@ -351,23 +370,22 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
  }

  double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LUMA_MULT + bits * state->lambda;
+  return (double)ssd * UVG_LUMA_MULT + bits * state->lambda;
 }


 double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu)
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu)
 {
  const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
  const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);

  double tr_tree_bits = 0;
-  double joint_cbcr_tr_tree_bits = 0;
  double coeff_bits = 0;
-  double joint_coeff_bits = 0;

  assert(x_px >= 0 && x_px < LCU_WIDTH);
  assert(y_px >= 0 && y_px < LCU_WIDTH);
@ -378,30 +396,28 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
    return 0;
  }

-  if (depth < MAX_PU_DEPTH) {
+  // See luma for why the second condition
+  if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) {
    const int tr_depth = depth - pred_cu->depth;
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_cb[0]);
+    cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+    cabac->cur_ctx = ctx;
    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
-    }
-    if(state->encoder_control->cfg.jccr) {
-      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1);
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
    }
    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
-    ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]);
+    ctx = &(cabac->ctx.qt_cbf_model_cr[is_set]);
    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
-    }
-    if(state->encoder_control->cfg.jccr) {
-      ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
-      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
    }
  }


  if (tr_cu->tr_depth > depth) {
    int offset = LCU_WIDTH >> (depth + 1);
-    int sum = 0;
+    double sum = 0;

    sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
    sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
@ -418,15 +434,10 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
      ctx = &(state->cabac.ctx.joint_cb_cr[cbf_mask]);
      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 0);      
    }
-    if(pred_cu->joint_cb_cr) {
-      ctx = &(state->cabac.ctx.joint_cb_cr[(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1]);
-      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 1);
-    }
  }

  // Chroma SSD
  int ssd = 0;
-  int joint_ssd = 0;
  if (!state->encoder_control->cfg.lossless) {
    int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
    int ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
@ -436,12 +447,226 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
                                    LCU_WIDTH_C,        LCU_WIDTH_C,
                                    width);
    ssd = ssd_u + ssd_v;
+  }

-    if(state->encoder_control->cfg.jccr) {
+  if (!skip_residual_coding)
+  {
+    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+    const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
+
+    coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0);
+    coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0);
+  }
+
+
+  double bits = tr_tree_bits + coeff_bits;
+
+  return (double)ssd * UVG_CHROMA_MULT + bits * state->c_lambda;
+}
+
+/**
+ * \brief Estimate the combined luma and chroma RD cost of a block, walking
+ *        the transform tree and counting the flag bits on the way.
+ *
+ * Flag bits are accumulated with CABAC_FBITS_UPDATE on state->search_cabac
+ * (accessed through a const-dropping cast). While tr_cu->tr_depth is deeper
+ * than depth, the function recurses into the four quadrants and returns
+ * their summed cost plus the flag bits counted at this level.
+ *
+ * \param state    encoder state; lambda, config and search_cabac are read
+ * \param x_px     x-coordinate inside the LCU, asserted to be in [0, LCU_WIDTH)
+ * \param y_px     y-coordinate inside the LCU, asserted to be in [0, LCU_WIDTH)
+ * \param depth    depth of the block being costed
+ * \param pred_cu  CU whose type, cbf, modes and joint_cb_cr are costed
+ * \param lcu      LCU holding reference, reconstruction and coefficients
+ * \return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT
+ *         + bits * state->lambda
+ */
+static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
+                                           const int x_px, const int y_px, const int depth,
+                                           const cu_info_t* const pred_cu,
+                                           lcu_t* const lcu) {
+  const int width = LCU_WIDTH >> depth;
+
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
+  // tr_cu is used for TU parameters.
+  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+
+  double coeff_bits = 0;
+  double tr_tree_bits = 0;
+
+  // Check that the coordinates are inside the LCU.
+  assert(x_px >= 0 && x_px < LCU_WIDTH);
+  assert(y_px >= 0 && y_px < LCU_WIDTH);
+
+  const uint8_t tr_depth = tr_cu->tr_depth - depth;
+
+  const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U);
+  const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V);
+
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+
+  {
+    int cbf = cbf_is_set_any(pred_cu->cbf, depth);
+    // Only need to signal coded block flag if not skipped or merged
+    // skip = no coded residual, merge = coded residual
+    if (pred_cu->type == CU_INTER && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf");
+    }
+
+  }
+
+  // Chroma cbfs: counted at the CU's own depth, or when the parent level had
+  // the corresponding cbf set.
+  if(state->encoder_control->chroma_format != UVG_CSP_400 && !skip_residual_coding) {
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
+    } 
+    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr");
+    } 
+  }
+
+  if (tr_depth > 0) {
+    int offset = LCU_WIDTH >> (depth + 1);
+    double sum = 0;
+
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    return sum + tr_tree_bits * state->lambda;
+  }
+  const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) ;
+
+  // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = depth - tr_cu->depth;
+  if ((pred_cu->type == CU_INTRA ||
+    is_tr_split ||
+    cb_flag_u ||
+    cb_flag_v) 
+      && !skip_residual_coding)
+  {
+    cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
+  }
+
+  if (cb_flag_y | cb_flag_u | cb_flag_v) {
+    // TODO qp_delta_sign_flag
+
+    if ((cb_flag_u | cb_flag_v) && x_px % 8 == 0 && y_px % 8 == 0 && state->encoder_control->cfg.jccr) {
+      CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag");
+    }
+  }
+
+
+  // SSD between reconstruction and original
+  unsigned luma_ssd = 0;
+  if (!state->encoder_control->cfg.lossless) {
+    int index = y_px * LCU_WIDTH + x_px;
+    luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
+      LCU_WIDTH, LCU_WIDTH,
+      width);
+  }
+
+  {
+    int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+    const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
+
+    coeff_bits += uvg_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode, tr_cu->tr_skip);
+  }
+
+  unsigned chroma_ssd = 0;
+  if(state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 != 0 && y_px % 8 != 0))) {
+    const vector2d_t lcu_px = { (x_px & ~7 ) / 2, (y_px & ~7) / 2 };
+    const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1));
+    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+    // Coefficient buffers are indexed in z-order, pixel buffers in raster
+    // order, so the two indices must be kept apart.
+    const unsigned coeff_index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
+    const int px_index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
+    if(pred_cu->joint_cb_cr == 0) {
+      if (!state->encoder_control->cfg.lossless) {
+        unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[px_index], &lcu->rec.u[px_index],
+          LCU_WIDTH_C, LCU_WIDTH_C,
+          chroma_width);
+        unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[px_index], &lcu->rec.v[px_index],
+          LCU_WIDTH_C, LCU_WIDTH_C,
+          chroma_width);
+        chroma_ssd = ssd_u + ssd_v;
+      }
+
+      {
+
+        coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[coeff_index], chroma_width, 2, scan_order, 0);
+        coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[coeff_index], chroma_width, 2, scan_order, 0);
+      }
+    } else {
-      int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
-        LCU_WIDTH_C, LCU_WIDTH_C,
-        width);
-      int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
+      // Joint Cb/Cr: both planes are measured over the chroma block, using
+      // the raster pixel index and the chroma width.
+      int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[px_index], &lcu->rec.joint_u[px_index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        chroma_width);
+      int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[px_index], &lcu->rec.joint_v[px_index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        chroma_width);
+      chroma_ssd = ssd_u_joint + ssd_v_joint;
+      coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[coeff_index], chroma_width, 2, scan_order, 0);
+    }
+  }
+
+  double bits = tr_tree_bits + coeff_bits;
+  return luma_ssd * UVG_LUMA_MULT + chroma_ssd * UVG_CHROMA_MULT + bits * state->lambda;
+}
+
+
+void uvg_select_jccr_mode(
+  const encoder_state_t* const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  cu_info_t* pred_cu,
+  lcu_t* const lcu,
+  double* cost_out)
+{
+  const vector2d_t lcu_px = { (SUB_SCU(x_px) & ~7) / 2, (SUB_SCU(y_px) & ~7) / 2 };
+  const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+  if (pred_cu == NULL) pred_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x_px), SUB_SCU(y_px));
+  assert(pred_cu->depth == pred_cu->tr_depth && "jccr does not support transform splitting");
+  if (cost_out == NULL && pred_cu->joint_cb_cr == 0) {
+    return;
+  }
+
+  double tr_tree_bits = 0;
+  double joint_cbcr_tr_tree_bits = 0;
+  double coeff_bits = 0;
+  double joint_coeff_bits = 0;
+
+  assert(lcu_px.x >= 0 && lcu_px.x < LCU_WIDTH_C);
+  assert(lcu_px.y >= 0 && lcu_px.y < LCU_WIDTH_C);
+
+  if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) {
+    // For MAX_PU_DEPTH calculate chroma for previous depth for the first
+    // block and return 0 cost for all others.
+    return;
+  }
+
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t* ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+  cabac->cur_ctx = ctx;
+  int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+  CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");   
+  ctx = &(cabac->ctx.qt_cbf_model_cr[u_is_set]);
+  int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+  CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search");
+
+  int cbf_mask = u_is_set * 2 + v_is_set - 1;
+  if((cbf_mask != -1 && pred_cu->type == CU_INTRA) || cbf_mask == 2)
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag");
+
+  if(pred_cu->joint_cb_cr) {
+    const int u_jccr = (pred_cu->joint_cb_cr >> 1) & 1;
+    ctx = &(cabac->ctx.qt_cbf_model_cb[0]);
+    CABAC_FBITS_UPDATE(cabac, ctx, u_jccr, joint_cbcr_tr_tree_bits, "cbf_cb_search");
+    ctx = &(cabac->ctx.qt_cbf_model_cr[u_jccr]);
+    CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cr_search");
+    cbf_mask = pred_cu->joint_cb_cr - 1;
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag");
+  }
+  unsigned ssd = 0;
+  unsigned joint_ssd = 0;
+  if (!state->encoder_control->cfg.lossless) {
+    const int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
+    const unsigned ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
+      LCU_WIDTH_C, LCU_WIDTH_C,
+      width);
+    const unsigned ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
+      LCU_WIDTH_C, LCU_WIDTH_C,
+      width);
+    ssd = ssd_u + ssd_v;
+
+    if (pred_cu->joint_cb_cr) {
+      const unsigned ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        width);
+      const unsigned ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
        LCU_WIDTH_C, LCU_WIDTH_C,
        width);
      joint_ssd = ssd_u_joint + ssd_v_joint;      
@ -455,34 +680,33 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
    coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0);
    coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0);
    
-    if(state->encoder_control->cfg.jccr) {
-      joint_coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0);
-    }
+    joint_coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0);    
  }


  double bits = tr_tree_bits + coeff_bits;
  double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits;

-  double cost = (double)ssd + bits * state->c_lambda;
-  double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda;
+  double cost = (double)ssd * UVG_CHROMA_MULT + bits * state->c_lambda;
+  double joint_cost = (double)joint_ssd * UVG_CHROMA_MULT + joint_bits * state->c_lambda;
  if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) {
    pred_cu->joint_cb_cr = 0;
-    return cost;    
+    if (cost_out) *cost_out += cost;
+    return;
  }
  cbf_clear(&pred_cu->cbf, depth, COLOR_U);
  cbf_clear(&pred_cu->cbf, depth, COLOR_V);
-  if (pred_cu->joint_cb_cr & 1) {
+  if (pred_cu->joint_cb_cr & 2) {
    cbf_set(&pred_cu->cbf, depth, COLOR_U);
  }
-  if (pred_cu->joint_cb_cr & 2) {
+  if (pred_cu->joint_cb_cr & 1) {
    cbf_set(&pred_cu->cbf, depth, COLOR_V);
  }
  int lcu_width = LCU_WIDTH_C;
  const int index = lcu_px.x + lcu_px.y * lcu_width;
  uvg_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width);
  uvg_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width);
-  return joint_cost;
+  if (cost_out) *cost_out += joint_cost;
 }


@ -492,23 +716,9 @@ static double calc_mode_bits(const encoder_state_t *state,
                             const cu_info_t * cur_cu,
                             int x, int y, int depth)
 {
-  int x_local = SUB_SCU(x);
-  int y_local = SUB_SCU(y);
-
  assert(cur_cu->type == CU_INTRA);

-  int8_t candidate_modes[INTRA_MPM_COUNT];
-  {
-    const cu_info_t *left_cu  = ((x >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local - SCU_WIDTH, y_local) : NULL);
-    const cu_info_t *above_cu = ((y >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local, y_local - SCU_WIDTH) : NULL);
-    uvg_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);
-  }
-
-  int width = LCU_WIDTH >> depth;
-  int height = width; // TODO: height for non-square blocks
-  int num_mip_modes_half = NUM_MIP_MODES_HALF(width, height);
-  int mip_flag_ctx_id = uvg_get_mip_flag_context(x, y, width, height, lcu, NULL);
-  double mode_bits = uvg_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, num_mip_modes_half, mip_flag_ctx_id);
+  double mode_bits = uvg_luma_mode_bits(state, cur_cu, x, y, depth, lcu);

  if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != UVG_CSP_400) {
    mode_bits += uvg_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode);
@ -518,6 +728,7 @@ static double calc_mode_bits(const encoder_state_t *state,
 }


+// TODO: replace usages of this by the uvg_sort_indices_by_cost function.
 /**
 * \brief Sort modes and costs to ascending order according to costs.
 */
@ -567,16 +778,25 @@ void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict traf
  }
 }

-
-
-static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth)
+/**
+ * \brief Sort map->keys into ascending order of map->cost[key]; only the keys array is permuted.
+ */
+void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map)
 {
-  vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) };
-  bool condA = x >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x - 1, lcu_cu.y    )->depth > depth;
-  bool condL = y >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x,     lcu_cu.y - 1)->depth > depth;
-  return condA + condL;
+  // Insertion sort over the first map->size keys. The arrays are expected to be "small", so no faster algorithm is needed.
+  for (uint8_t i = 1; i < map->size; ++i) {
+    const int8_t cur_indx = map->keys[i];
+    const double cur_cost = map->cost[cur_indx]; // cost is looked up through the key, not by position
+    uint8_t j = i;
+    while (j > 0 && cur_cost < map->cost[map->keys[j - 1]]) { // strict '<' keeps equal-cost keys in their existing order
+      map->keys[j] = map->keys[j - 1];
+      --j;
+    }
+    map->keys[j] = cur_indx;
+  }
 }

+
 /**
 * Search every mode from 0 to MAX_PU_DEPTH and return cost of best mode.
 * - The recursion is started at depth 0 and goes in Z-order to MAX_PU_DEPTH.
@ -592,10 +812,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
  const encoder_control_t* ctrl = state->encoder_control;
  const videoframe_t * const frame = state->tile->frame;
  int cu_width = LCU_WIDTH >> depth;
-  double cost = MAX_INT;
-  double inter_zero_coeff_cost = MAX_INT;
-  uint32_t inter_bitcost = MAX_INT;
+  double cost = MAX_DOUBLE;
+  double inter_zero_coeff_cost = MAX_DOUBLE;
+  double inter_bitcost = MAX_INT;
  cu_info_t *cur_cu;
+  cabac_data_t pre_search_cabac;
+  memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac));

  const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH);
  const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS;
@ -626,7 +848,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

  // Assign correct depth limit
  constraint_t* constr = state->constraint;
- if(constr->ml_intra_depth_ctu) {
+  if(constr->ml_intra_depth_ctu) {
    pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8];
    pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8];
  }
@ -670,7 +892,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

    if (can_use_inter) {
      double mode_cost;
-      uint32_t mode_bitcost;
+      double mode_bitcost;
      uvg_search_cu_inter(state,
                          x, y,
                          depth,
@ -693,33 +915,34 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,

    int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max;
    bool can_use_intra =
-        WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
+      (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) ||
        // When the split was forced because the CTU is partially outside
        // the frame, we permit intra coding even if pu_depth_intra would
        // otherwise forbid it.
        (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width ||
-        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height;
+        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height) &&
+      !(state->encoder_control->cfg.force_inter && state->frame->slicetype != UVG_SLICE_I);

+    intra_search_data_t intra_search;
    if (can_use_intra && !skip_intra) {
-      int8_t intra_mode;
-      int8_t intra_trafo;
-      double intra_cost;
-      uint8_t multi_ref_index = 0;
-      bool mip_flag = false;
-      bool mip_transposed = false;
-      uvg_search_cu_intra(state, x, y, depth, lcu,
-                          &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed);
-      if (intra_cost < cost) {
-        cost = intra_cost;
+      intra_search.pred_cu = *cur_cu;
+      intra_search.pred_cu.joint_cb_cr = 4;
+      uvg_search_cu_intra(state, x, y, depth, &intra_search,
+                          lcu);
+#ifdef COMPLETE_PRED_MODE_BITS
+      // Technically counting these bits would be correct, however counting
+      // them universally degrades quality so this block is disabled by default
+      if(state->frame->slicetype != UVG_SLICE_I) {
+        double pred_mode_type_bits = 0;
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_pred_mode_model, 1, pred_mode_type_bits, "pred_mode_flag");
+        CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0, pred_mode_type_bits, "skip_flag");
+        intra_cost += pred_mode_type_bits * state->lambda;
+      }
+#endif
+      if (intra_search.cost < cost) {
+        cost = intra_search.cost;
+        *cur_cu = intra_search.pred_cu;
        cur_cu->type = CU_INTRA;
-        cur_cu->part_size = depth > MAX_DEPTH ? SIZE_NxN : SIZE_2Nx2N;
-        cur_cu->intra.mode = intra_mode;
-        cur_cu->intra.multi_ref_idx = multi_ref_index;
-        cur_cu->intra.mip_flag = mip_flag;
-        cur_cu->intra.mip_is_transposed = mip_transposed;
-
-        //If the CU is not split from 64x64 block, the MTS is disabled for that CU.
-        cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0;
      }
    }

@ -727,20 +950,19 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // mode search of adjacent CUs.
    if (cur_cu->type == CU_INTRA) {
      assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN);
-      cur_cu->intra.mode_chroma = cur_cu->intra.mode;

+      intra_search.pred_cu.intra.mode_chroma = -1; // don't reconstruct chroma before search is performed for it
      lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
      uvg_intra_recon_cu(state,
                         x, y,
-                         depth,
-                         cur_cu->intra.mode, -1, // skip chroma
-                         NULL, NULL, cur_cu->intra.multi_ref_idx, 
-                         cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, 
+                         depth, &intra_search,
+                         NULL, 
                         lcu);

      downsample_cclm_rec(
        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
      );
+      cur_cu->joint_cb_cr = 0;

      // TODO: This heavily relies to square CUs
      if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400) {
@ -748,19 +970,47 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        // rd2. Possibly because the luma mode search already takes chroma
        // into account, so there is less of a chanse of luma mode being
        // really bad for chroma.
-        cclm_parameters_t cclm_params[2];
+        intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; // skip luma
        if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) {
-          cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params);
+          cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search);
+
+          if (intra_search.pred_cu.joint_cb_cr == 0) intra_search.pred_cu.joint_cb_cr = 4;
+          else cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr;
+
          lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
        }
-
+        intra_search.pred_cu.intra.mode = -1; // skip luma
        uvg_intra_recon_cu(state,
-                           x & ~7, y & ~7, // TODO: as does this
-                           depth,
-                           -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, cclm_params, 0, 
-                           cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           x, y, // TODO: as does this
+                           depth, &intra_search,
+                           NULL,
                           lcu);
+        if(depth != 0 && state->encoder_control->cfg.jccr && ctrl->cfg.rdo < 3) {
+          uvg_select_jccr_mode(state,
+                               x, y,
+                               depth,
+                               NULL,
+                               lcu,
+                               NULL);
+        }
+        else if(depth != 0 && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr & 3) {
+          assert(cur_cu->joint_cb_cr < 4);
+          cbf_clear(&cur_cu->cbf, depth, COLOR_U);
+          cbf_clear(&cur_cu->cbf, depth, COLOR_V);
+          if (cur_cu->joint_cb_cr & 2) {
+            cbf_set(&cur_cu->cbf, depth, COLOR_U);
+          }
+          if (cur_cu->joint_cb_cr & 1) {
+            cbf_set(&cur_cu->cbf, depth, COLOR_V);
+          }
+          const vector2d_t lcu_px = { (x_local & ~7) / 2, (y_local & ~7) / 2 };
+          int lcu_width = LCU_WIDTH_C;
+          const int index = lcu_px.x + lcu_px.y * lcu_width;
+          const int width = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+          uvg_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width);
+          uvg_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width);
+
+        }
      }
    } else if (cur_cu->type == CU_INTER) {

@ -788,11 +1038,20 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        }

        uvg_quantize_lcu_residual(state,
-          true, has_chroma,
-          x, y, depth,
-          NULL,
-          lcu,
-          false);
+                                  true, has_chroma,
+                                  state->encoder_control->cfg.jccr, x, y,
+                                  depth,
+                                  NULL,
+                                  lcu,
+                                  false);
+        if (cur_cu->depth == cur_cu->tr_depth && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr) {
+          uvg_select_jccr_mode(state,
+            x, y,
+            depth,
+            NULL,
+            lcu,
+            NULL);
+        }

        int cbf = cbf_is_set_any(cur_cu->cbf, depth);

@ -800,9 +1059,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
          cur_cu->merged = 0;
          cur_cu->skipped = 1;
          // Selecting skip reduces bits needed to code the CU
-          if (inter_bitcost > 1) {
-            inter_bitcost -= 1;
-          }
+          int skip_ctx = uvg_get_skip_context(x, y, lcu, NULL, NULL);
+          inter_bitcost = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_ctx], 1);
+          inter_bitcost += CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), cur_cu->merge_idx != 0);
+          inter_bitcost += cur_cu->merge_idx;        
        }
      }
      lcu_fill_inter(lcu, x_local, y_local, cu_width);
@ -811,19 +1071,25 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
  }

  if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
-    cost = uvg_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-    if (state->encoder_control->chroma_format != UVG_CSP_400) {
-      cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
+    double bits = 0;
+    cabac_data_t* cabac  = &state->search_cabac;
+    cabac->update = 1;
+
+    if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) {
+      bits += uvg_mock_encode_coding_unit(
+        state,
+        cabac,
+        x, y, depth,
+        lcu,
+        cur_cu);
+    }
+    else {
+      assert(0);
    }
    
-    double mode_bits;
-    if (cur_cu->type == CU_INTRA) {
-      mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
-    } else {
-      mode_bits = inter_bitcost;
-    }
+    cost = bits * state->lambda;

-    cost += mode_bits * state->lambda;
+    cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
    
    if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) {
      cost = inter_zero_coeff_cost;
@ -846,13 +1112,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
      cur_cu->cbf = 0;
      lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu);
    }
+    cabac->update = 0;
  } 

  bool can_split_cu =
    // If the CU is partially outside the frame, we need to split it even
    // if pu_depth_intra and pu_depth_inter would not permit it.
    cur_cu->type == CU_NOTSET ||
-    depth < pu_depth_intra.max ||
+    (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) ||
    (state->frame->slicetype != UVG_SLICE_I &&
      depth < pu_depth_inter.max);

@ -861,21 +1128,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    int half_cu = cu_width / 2;
    double split_cost = 0.0;
    int cbf = cbf_is_set_any(cur_cu->cbf, depth);
+    cabac_data_t post_seach_cabac;
+    memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+    memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac));
+    state->search_cabac.update = 1;
+
+    double split_bits = 0;

    if (depth < MAX_DEPTH) {
      // Add cost of cu_split_flag.
-      uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-      cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-      split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;
+      uvg_write_split_flag(state, &state->search_cabac, 
+        x > 0 ? LCU_GET_CU_AT_PX(lcu,SUB_SCU(x) -1, SUB_SCU(y)): NULL,
+        y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL,
+        1, depth, cu_width, x, y, &split_bits);
    }

-    if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) {
-      // Add cost of intra part_size.
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]);
-      cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;  // 2Nx2N
-      split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;  // NxN
-    }
+    state->search_cabac.update = 0;
+    split_cost += split_bits * state->lambda;

    // If skip mode was selected for the block, skip further search.
    // Skip mode means there's no coefficients in the block, so splitting
@ -897,13 +1166,23 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // searching.
    
    if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height 
+        && state->encoder_control->cfg.combine_intra_cus)
    {
+
      cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);

      // If the best CU in depth+1 is intra and the biggest it can be, try it.
      if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) {
+        cabac_data_t temp_cabac;
+        memcpy(&temp_cabac, &state->search_cabac, sizeof(temp_cabac));
+        memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac));
        cost = 0;
+        double bits = 0;
+        uvg_write_split_flag(state, &state->search_cabac,
+          x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL,
+          y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL,
+          0, depth, cu_width, x, y, & split_bits);

        cur_cu->intra = cu_d1->intra;
        cur_cu->type = CU_INTRA;
@ -915,28 +1194,24 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth);
        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
        
-        const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
-        const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1;
+        intra_search_data_t proxy;
+        FILL(proxy, 0);
+        proxy.pred_cu = *cur_cu;
+
        uvg_intra_recon_cu(state,
                           x, y,
                           depth,
-                           cur_cu->intra.mode, mode_chroma,
-                           NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           &proxy,
+                           NULL,
                           lcu);

-        cost += uvg_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
-        if (has_chroma) {
-          cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
-        }
-
-        // Add the cost of coding no-split.
-        uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
-        const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-        cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
-
-        // Add the cost of coding intra mode only once.
-        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth);
+        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y, depth) + bits;
        cost += mode_bits * state->lambda;
+
+        cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu);
+
+        memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
+        memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac));
      }
    }

@ -950,6 +1225,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    } else if (depth > 0) {
      // Copy this CU's mode all the way down for use in adjacent CUs mode
      // search.
+      memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac));
      work_tree_copy_down(x_local, y_local, depth, work_tree);
      downsample_cclm_rec(
        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
@ -962,6 +1238,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        uvg_hmvp_add_mv(state, x, y, cu_width, cu_width, cur_cu);
      }
    }
+    else {
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
+      );      
+    }
  } else if (depth >= 0 && depth < MAX_PU_DEPTH) {
    // Need to copy modes down since the lower level of the work tree is used
    // when searching SMP and AMP blocks.
@ -1139,6 +1420,8 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i
 */
 void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * const hor_buf, const yuv_t * const ver_buf, lcu_coeff_t *coeff)
 {
+  memcpy(&state->search_cabac, &state->cabac, sizeof(cabac_data_t));
+  state->search_cabac.only_count = 1;
  assert(x % LCU_WIDTH == 0);
  assert(y % LCU_WIDTH == 0);

--- a/src/search.h
+++ b/src/search.h
@ -44,22 +44,62 @@
 #include "image.h"
 #include "constraint.h"

-#define NUM_MIP_MODES_FULL(width, height) ((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12)
-#define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL((width), (height)) >> 1
+#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS)
+
+ // Weight applied to luma SSD; define UVG_LUMA_MULT at build time to override this default.
+#ifndef UVG_LUMA_MULT
+#define UVG_LUMA_MULT 0.8
+#endif
+// Weight applied to chroma SSD; define UVG_CHROMA_MULT at build time to override this default.
+#ifndef UVG_CHROMA_MULT
+#define UVG_CHROMA_MULT 1.5
+#endif
+
+ /**
+  *  \brief Per-unit statistics collected during search processes.
+  *
+  *         unit[i], cost[i] and bits[i] all describe the same searched
+  *         coding/prediction unit i. Read them through the "keys" array
+  *         (unit[keys[k]], cost[keys[k]], ...): uvg_sort_keys_by_cost()
+  *         reorders only "keys" into ascending RD cost and leaves the
+  *         other arrays in place.
+  */
+typedef struct unit_stats_map_t {
+
+  cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units
+  double    cost[MAX_UNIT_STATS_MAP_SIZE]; //!< RD cost of the unit at the same index
+  double    bits[MAX_UNIT_STATS_MAP_SIZE]; //!< bit cost of the unit at the same index
+  int8_t    keys[MAX_UNIT_STATS_MAP_SIZE]; //!< indices into the other arrays; the only array reordered by sorting
+  int       size;                    //!< number of active elements in the lists
+} unit_stats_map_t;
+
+#define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12))
+#define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1)

 void uvg_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
 void uvg_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length);

+void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map);
+
 void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);

 double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu);
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu);
 double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         cu_info_t * pred_cu,
-                         lcu_t *const lcu);
+                             const int x_px, const int y_px, const int depth,
+                             cu_info_t *const pred_cu,
+                             lcu_t *const lcu);
+void uvg_select_jccr_mode(
+  const encoder_state_t* const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  cu_info_t* const pred_cu,
+  lcu_t* const lcu,
+  double* cost_out);
+
 void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);

 void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
--- a/src/search_inter.c
+++ b/src/search_inter.c
--- a/src/search_inter.h
+++ b/src/search_inter.h
@ -64,20 +64,34 @@ enum hpel_position {
  HPEL_POS_DIA = 2
 };

-typedef uint32_t uvg_mvd_cost_func(const encoder_state_t *state,
+typedef double uvg_mvd_cost_func(const encoder_state_t *state,
                                  int x, int y,
                                  int mv_shift,
                                  mv_t mv_cand[2][2],
                                  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                                  int16_t num_cand,
                                  int32_t ref_idx,
-                                  uint32_t *bitcost);
+                                  double *bitcost);

 void uvg_search_cu_inter(encoder_state_t * const state,
                         int x, int y, int depth,
                         lcu_t *lcu,
                         double *inter_cost,
-                         uint32_t *inter_bitcost);
+                         double* inter_bitcost);


+
+unsigned uvg_inter_satd_cost(const encoder_state_t* state,
+                             const lcu_t *lcu,
+                             int x,
+                             int y);
+void uvg_cu_cost_inter_rd2(encoder_state_t* const state,
+  int x, int y, int depth,
+  cu_info_t* cur_cu,
+  lcu_t* lcu,
+  double* inter_cost,
+  double* inter_bitcost);
+
+int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx);
+
 #endif // SEARCH_INTER_H_
--- a/src/search_intra.c
+++ b/src/search_intra.c
--- a/src/search_intra.h
+++ b/src/search_intra.h
@ -43,24 +43,21 @@
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"

-double uvg_luma_mode_bits(const encoder_state_t *state, 
-                          int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id);
+double uvg_luma_mode_bits(const encoder_state_t *state, const cu_info_t* const cur_cu, int x, int y, int8_t depth, const lcu_t* lcu);
                       
 double uvg_chroma_mode_bits(const encoder_state_t *state,
                        int8_t chroma_mode, int8_t luma_mode);

 int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
                              const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm);
+                              const int depth, lcu_t *lcu, intra_search_data_t* best_cclm);

-void uvg_search_cu_intra(encoder_state_t * const state,
-                         const int x_px, const int y_px,
-                         const int depth, lcu_t *lcu,
-                         int8_t *mode_out,
-                         int8_t *trafo_out, 
-                         double *cost_out,
-                         uint8_t *multi_ref_idx_out,
-                         bool *mip_flag,
-                         bool *mip_transp);
+void uvg_search_cu_intra(
+  encoder_state_t * const state,
+  const int x_px,
+  const int y_px,
+  const int depth,
+  intra_search_data_t* search_data,
+  lcu_t *lcu);

 #endif // SEARCH_INTRA_H_
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@ -225,39 +225,40 @@ int uvg_quant_cbcr_residual_generic(
  int64_t best_cost = INT64_MAX;

  // This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM
-  for(int cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) {
+  for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) {
    int64_t d1 = 0;
+    const int cbf_mask = i * (state->frame->jccr_sign ? -1 : 1);
    for (int y = 0; y < width; y++)
    {
      for (int x = 0; x < width; x++)
      {
        int cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
-        if (cbf_mask == 1)
+        if (cbf_mask == 2)
        {
-          u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (u1_residual[cbf_mask / 2][x + y * width] >> 1));
+          u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1));
        }
-        else if (cbf_mask == -1)
+        else if (cbf_mask == -2)
        {
-          u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (-u1_residual[cbf_mask / 2][x + y * width] >> 1));
+          u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1));
        }
        else if (cbf_mask == 3)
        {
-          u1_residual[cbf_mask / 2][x + y * width] = ((cbx + crx) / 2);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - u1_residual[cbf_mask / 2][x + y * width]);
+          u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]);
        }
        else if (cbf_mask == -3)
        {
-          u1_residual[cbf_mask / 2][x + y * width] = ((cbx - crx) / 2);
-          d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx + u1_residual[cbf_mask / 2][x + y * width]);
+          u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2);
+          d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]);
        }
-        else if (cbf_mask == 2)
+        else if (cbf_mask == 1)
        {
          v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5);
          d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
        }
-        else if (cbf_mask == -2)
+        else if (cbf_mask == -1)
        {
          v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5);
          d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
@ -270,19 +271,19 @@ int uvg_quant_cbcr_residual_generic(
      }
    }
    if (d1 < best_cost) {
-      best_cbf_mask = cbf_mask;
+      best_cbf_mask = i;
      best_cost = d1;
    }
  }

-  uvg_transform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu);
+  uvg_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);

  if (state->encoder_control->cfg.rdoq_enable &&
    (width > 4 || !state->encoder_control->cfg.rdoq_skip))
  {
    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
    tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
+    uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
      scan_order, cur_cu->type, tr_depth, cur_cu->cbf);
  }
  else if (state->encoder_control->cfg.rdoq_enable && false) {
@ -290,7 +291,7 @@ int uvg_quant_cbcr_residual_generic(
      scan_order);
  }
  else {
-    uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
+    uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
      scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
  }

@ -309,10 +310,10 @@ int uvg_quant_cbcr_residual_generic(
    int y, x;

    // Get quantized residual. (coeff_out -> coeff -> residual)
-    uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
+    uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
      cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
    
-    uvg_itransform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu);
+    uvg_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
    

    //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
@ -333,32 +334,32 @@ int uvg_quant_cbcr_residual_generic(
    //    }
    //  }
    //}
-
+    const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1);
    // Get quantized reconstruction. (residual + pred_in -> rec_out)
    for (int y = 0; y < width; y++) {
      for (int x = 0; x < width; x++) {
-        if (best_cbf_mask == 1) {
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
-          v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width] >> 1;
+        if (temp == 2) {
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
+          v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
        }
-        else if (best_cbf_mask == -1) {
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
-          v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width] >> 1;
+        else if (temp == -2) {
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
+          v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
        }
-        else if (best_cbf_mask == 3) {
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
-          v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
+        else if (temp == 3) {
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
+          v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
        }
-        else if (best_cbf_mask == -3) {
+        else if (temp == -3) {
          // non-normative clipping to prevent 16-bit overflow
-          u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
-          v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width];
+          u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
+          v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width];
        }
-        else if (best_cbf_mask == 2) {
+        else if (temp == 1) {
          u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
          v_residual[x + y * width] = v1_residual[x + y * width];
        }
-        else if (best_cbf_mask == -2) {
+        else if (temp == -1) {
          u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
          v_residual[x + y * width] = -v1_residual[x + y * width];
        }
--- a/src/transform.c
+++ b/src/transform.c
@ -260,12 +260,10 @@ int uvg_quantize_residual_trskip(
  struct {
    uvg_pixel rec[LCU_WIDTH * LCU_WIDTH];
    coeff_t coeff[LCU_WIDTH * LCU_WIDTH];
-    uint32_t cost;
+    double cost;
    int has_coeffs;
  } skip, *best;
  
-  const int bit_cost = (int)(state->lambda + 0.5);
-  
  //noskip.has_coeffs = uvg_quantize_residual(
  //    state, cur_cu, width, color, scan_order,
  //    0, in_stride, 4,
@ -278,7 +276,7 @@ int uvg_quantize_residual_trskip(
    1, in_stride, width,
    ref_in, pred_in, skip.rec, skip.coeff, false, lmcs_chroma_adj);
  skip.cost = uvg_pixels_calc_ssd(ref_in, skip.rec, in_stride, width, width);
-  skip.cost += uvg_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * bit_cost;
+  skip.cost += uvg_get_coeff_cost(state, skip.coeff, width, 0, scan_order, 1) * state->frame->lambda;

 /*  if (noskip.cost <= skip.cost) {
    *trskip_out = 0;
@ -481,15 +479,17 @@ static void quantize_tr_residual(encoder_state_t * const state,
 * - lcu->cbf               coded block flags for the area
 * - lcu->cu.intra.tr_skip  tr skip flags for the area (in case of luma)
 */
-void uvg_quantize_lcu_residual(encoder_state_t * const state,
-                               const bool luma,
-                               const bool chroma,
-                               const int32_t x,
-                               const int32_t y,
-                               const uint8_t depth,
-                               cu_info_t *cur_pu,
-                               lcu_t* lcu,
-                               bool early_skip)
+void uvg_quantize_lcu_residual(
+  encoder_state_t * const state,
+  const bool luma,
+  const bool chroma,
+  const bool jccr,
+  const int32_t x,
+  const int32_t y,
+  const uint8_t depth,
+  cu_info_t *cur_pu,
+  lcu_t* lcu,
+  bool early_skip)
 {
  const int32_t width = LCU_WIDTH >> depth;
  const vector2d_t lcu_px  = { SUB_SCU(x), SUB_SCU(y) };
@ -511,7 +511,7 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state,
  if (luma) {
    cbf_clear(&cur_pu->cbf, depth, COLOR_Y);
  }
-  if (chroma) {
+  if (chroma || jccr) {
    cbf_clear(&cur_pu->cbf, depth, COLOR_U);
    cbf_clear(&cur_pu->cbf, depth, COLOR_V);
  }
@ -523,10 +523,11 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state,
    const int32_t x2 = x + offset;
    const int32_t y2 = y + offset;

-    uvg_quantize_lcu_residual(state, luma, chroma, x,  y,  depth + 1, NULL, lcu, early_skip);
-    uvg_quantize_lcu_residual(state, luma, chroma, x2, y,  depth + 1, NULL, lcu, early_skip);
-    uvg_quantize_lcu_residual(state, luma, chroma, x,  y2, depth + 1, NULL, lcu, early_skip);
-    uvg_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu, early_skip);
+    // jccr is currently not supported if transform is split
+    uvg_quantize_lcu_residual(state, luma, chroma, 0,  x,  y, depth + 1, NULL, lcu, early_skip);
+    uvg_quantize_lcu_residual(state, luma, chroma, 0, x2,  y, depth + 1, NULL, lcu, early_skip);
+    uvg_quantize_lcu_residual(state, luma, chroma, 0,  x, y2, depth + 1, NULL, lcu, early_skip);
+    uvg_quantize_lcu_residual(state, luma, chroma, 0, x2, y2, depth + 1, NULL, lcu, early_skip);

    // Propagate coded block flags from child CUs to parent CU.
    uint16_t child_cbfs[3] = {
@ -549,9 +550,9 @@ void uvg_quantize_lcu_residual(encoder_state_t * const state,
    if (chroma) {
      quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip);
      quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip);   
-      if(state->encoder_control->cfg.jccr && cur_pu->tr_depth == cur_pu->depth){
-        quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip);
-      }
+    }
+    if (jccr && cur_pu->tr_depth == cur_pu->depth) {
+      quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip);
    }
  }
 }
--- a/src/transform.h
+++ b/src/transform.h
@ -67,14 +67,16 @@ void uvg_itransform2d(const encoder_control_t * const encoder,

 int32_t uvg_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale);

-void uvg_quantize_lcu_residual(encoder_state_t *state,
-                               bool luma,
-                               bool chroma,
-                               int32_t x,
-                               int32_t y,
-                               uint8_t depth,
-                               cu_info_t *cur_cu,
-                               lcu_t* lcu,
-                               bool early_skip);
+void uvg_quantize_lcu_residual(
+  encoder_state_t *state,
+  bool luma,
+  bool chroma,  /* quantize the separate COLOR_U and COLOR_V residuals */
+  const bool jccr, /* quantize the joint COLOR_UV residual; not applied when the transform is split */
+  int32_t x,
+  int32_t y,
+  uint8_t depth, /* processed block width is LCU_WIDTH >> depth */
+  cu_info_t *cur_cu,
+  lcu_t* lcu,
+  bool early_skip);

 #endif
--- a/src/uvg266.h
+++ b/src/uvg266.h
@ -267,6 +267,12 @@ enum uvg_amvr_resolution
  UVG_IMV_HPEL    = 3
 };

+enum uvg_roi_format
+{
+  UVG_ROI_TXT = 0, /*!< \brief ROI file is in text format. */
+  UVG_ROI_BIN = 1  /*!< \brief ROI file is in binary format. */
+};
+
 // Map from input format to chroma format.
 #define UVG_FORMAT2CSP(format) ((enum uvg_chroma_format)format)

@ -408,10 +414,9 @@ typedef struct uvg_config
  int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */

  struct {
-    int32_t width;
-    int32_t height;
-    int8_t *dqps;
-  } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */
+    char *file_path; /*!< \brief Path of the ROI (delta QP) file. */
+    enum uvg_roi_format format; /*!< \brief Format of the ROI file (text or binary). */
+  } roi; /*!< \brief Specify delta QPs for region of interest coding. */

  unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */

@ -524,6 +529,12 @@ typedef struct uvg_config
  int8_t cclm;

  int8_t amvr; /* \brief Adaptive motion vector resolution parameter */
+
+  /** \brief Whether to try combining intra CUs at the lower depth when search
+   *         is not performed at said depth. */
+  uint8_t combine_intra_cus;
+
+  uint8_t force_inter; /* NOTE(review): undocumented; presumably forces inter prediction -- confirm. */
 } uvg_config;

 /**
@ -555,6 +566,14 @@ typedef struct uvg_picture {
  enum uvg_chroma_format chroma_format;

  int32_t ref_pocs[16];
+
+  struct
+  {
+    int width;  /*!< \brief Width of the delta QP map. */
+    int height; /*!< \brief Height of the delta QP map. */
+    int8_t *roi_array; /*!< \brief Delta QP values; presumably width * height entries in raster order -- TODO confirm. */
+  } roi; /*!< \brief Region of interest (ROI) / delta QP map for this picture. */
+
 } uvg_picture;

 /**
@ -781,6 +800,9 @@ typedef struct uvg_api {
   * original frame and frame info in data_out, len_out, pic_out, src_out and
   * info_out, respectively. Otherwise, set the output parameters to NULL.
   * 
+   * Region of interest (ROI) / delta QP map can be specified in the input
+   * picture's ROI field but only when a ROI file is not used.
+   *
   * After passing all of the input frames, the caller should keep calling this
   * function with pic_in set to NULL, until no more data is returned in the
   * output parameters.
--- a/tests/test_slices.sh
+++ b/tests/test_slices.sh
@ -3,6 +3,6 @@
 set -eu
 . "${0%/*}/util.sh"

-valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --tiles=2x2
+valgrind_test 512x256 10 yuv420p --threads=2 --owf=1 --preset=ultrafast --gop 0 --tiles=2x2
 #valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp
 #if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi