From e591e89ade3b6b9c39d671fe58a7c4e4f13fb842 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 15 Sep 2014 15:46:00 +0300
Subject: [PATCH 01/28] Add prediction mode to chroma reconstruction
 parameters.

- Just like in luma.
---
 src/intra.c  | 14 +++++++-------
 src/intra.h  |  2 +-
 src/search.c |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 84833a58..b4e90b56 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -703,7 +703,7 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int
   }
 }
 
-void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, lcu_t *lcu)
+void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   const vector2d lcu_px = { x & 0x3f, y & 0x3f };
@@ -714,10 +714,10 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i
   if (depth == 0 || cur_cu->tr_depth > depth) {
     int offset = width / 2;
 
-    intra_recon_lcu_chroma(encoder_state, x,          y,          depth+1, lcu);
-    intra_recon_lcu_chroma(encoder_state, x + offset, y,          depth+1, lcu);
-    intra_recon_lcu_chroma(encoder_state, x,          y + offset, depth+1, lcu);
-    intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, lcu);
+    intra_recon_lcu_chroma(encoder_state, x,          y,          depth+1, intra_mode, lcu);
+    intra_recon_lcu_chroma(encoder_state, x + offset, y,          depth+1, intra_mode, lcu);
+    intra_recon_lcu_chroma(encoder_state, x,          y + offset, depth+1, intra_mode, lcu);
+    intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu);
 
     if (depth < MAX_DEPTH) {
       cu_info *cu_a =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) +  (lcu_px.y>>3)        *LCU_T_CU_WIDTH];
@@ -757,7 +757,7 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i
                   width_c,
                   recbase_u,
                   rec_stride >> 1,
-                  cur_cu->intra[0].mode_chroma,
+                  intra_mode,
                   1);
 
       intra_build_reference_border(encoder, x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 2,
@@ -768,7 +768,7 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i
                   width_c,
                   recbase_v,
                   rec_stride >> 1,
-                  cur_cu->intra[0].mode_chroma,
+                  intra_mode,
                   2);
     }
 
diff --git a/src/intra.h b/src/intra.h
index b6d4de42..e01733ae 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -49,6 +49,6 @@ void intra_get_angular_pred(const encoder_control *encoder, pixel* src, int32_t
 void intra_recon(const encoder_control *encoder, pixel* rec, int32_t rec_stride, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
 
 void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu);
-void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, lcu_t *lcu);
+void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu);
 
 #endif
diff --git a/src/search.c b/src/search.c
index 01d7084f..98bccbeb 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1385,7 +1385,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
                          intra_mode_chroma,
                          cur_cu->part_size);
       intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
-      intra_recon_lcu_chroma(encoder_state, x, y, depth, &work_tree[depth]);
+      intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
     } else if (cur_cu->type == CU_INTER) {
       int cbf;
       inter_recon_lcu(encoder_state, encoder_state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);

From 549ac96438e90d3009601fc699a3cd90a31e578a Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 17 Sep 2014 11:52:22 +0300
Subject: [PATCH 02/28] Change costs to doubles to avoid rounding intermediate
 results.

- Helps with debugging.
---
 src/rdo.c    |  18 +++----
 src/search.c | 137 ++++++++++++++++++++++++---------------------------
 2 files changed, 74 insertions(+), 81 deletions(-)

diff --git a/src/rdo.c b/src/rdo.c
index 7bf664f3..ff493990 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -196,8 +196,6 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
     int16_t temp_block[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient temp_coeff[LCU_WIDTH*LCU_WIDTH>>2];
-    uint32_t cost = 0;
-    uint32_t coeffcost = 0;
     int8_t luma_scan_mode = SCAN_DIAG;
 
     int i = 0,x,y;
@@ -225,26 +223,28 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     dequant(encoder_state, temp_coeff, pre_quant_coeff, width, width, 0, CU_INTRA);
     itransform2d(encoder, temp_block,pre_quant_coeff,width,0);
 
+    unsigned ssd = 0;
     // SSD between original and reconstructed
     for (i = 0; i < width*width; i++) {
       int diff = temp_block[i]-block[i];
-      cost += diff*diff;
+      ssd += diff*diff;
     }
 
+    double coeff_bits = 0;
     // Simple RDO
     if(encoder->rdo == 1) {
       // SSD between reconstruction and original + sum of coeffs
+      int coeff_abs = 0;
       for (i = 0; i < width*width; i++) {
-        coeffcost += abs((int)temp_coeff[i]);
+        coeff_abs += abs((int)temp_coeff[i]);
       }
-      cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
+      coeff_bits += 1 + 1.5 * coeff_abs;
       // Full RDO
     } else if(encoder->rdo >= 2) {
-      coeffcost = get_coeff_cost(encoder_state, temp_coeff, width, 0, luma_scan_mode);
-
-      cost  += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5);
+      coeff_bits = get_coeff_cost(encoder_state, temp_coeff, width, 0, luma_scan_mode);
     }
-    return cost;
+
+    return (uint32_t)(0.5 + ssd + coeff_bits * encoder_state->global->cur_lambda_cost);
 }
 
 
diff --git a/src/search.c b/src/search.c
index 98bccbeb..a0fb37c7 100644
--- a/src/search.c
+++ b/src/search.c
@@ -742,7 +742,7 @@ static void lcu_set_coeff(lcu_t *lcu, int x_px, int y_px, int depth, cu_info *cu
 * Takes into account SSD of reconstruction and the cost of encoding whatever
 * prediction unit data needs to be coded.
 */
-static int cu_rd_cost_luma(const encoder_state *const encoder_state,
+static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   const int x_px, const int y_px, const int depth,
   const cu_info *const pred_cu,
   lcu_t *const lcu)
@@ -753,76 +753,70 @@ static int cu_rd_cost_luma(const encoder_state *const encoder_state,
   // cur_cu is used for TU parameters.
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH];
 
-  int x, y;
-  int cost = 0;
+  double coeff_bits = 0;
+  double trtree_bits = 0;
 
   // Check that lcu is not in 
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
 
-  {
-    int trtree_bits = 0;
-    if (width <= TR_MAX_WIDTH
+  if (width <= TR_MAX_WIDTH
       && width > TR_MIN_WIDTH
-      && pred_cu->part_size != SIZE_NxN) {
-      trtree_bits += 1; // split_transform_flag
-    }
-    cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+      && pred_cu->part_size != SIZE_NxN)
+  {
+    trtree_bits += 1; // split_transform_flag
   }
+
   if (tr_cu->tr_depth > depth) {
     int offset = width / 2;
+    double sum = 0;
 
-    cost += cu_rd_cost_luma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu);
-    cost += cu_rd_cost_luma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
-    cost += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
-    cost += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_luma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_luma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
 
-    return cost;
+    return sum + trtree_bits * encoder_state->global->cur_lambda_cost;
   }
 
   if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) {
-    int trtree_bits = 1; // cbf_luma
-    cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+    trtree_bits += 1;  // cbf_luma
   }
 
+  unsigned ssd = 0;
   // SSD between reconstruction and original
-  for (y = y_px; y < y_px + width; ++y) {
-    for (x = x_px; x < x_px + width; ++x) {
+  for (int y = y_px; y < y_px + width; ++y) {
+    for (int x = x_px; x < x_px + width; ++x) {
       int diff = (int)lcu->rec.y[y * LCU_WIDTH + x] - (int)lcu->ref.y[y * LCU_WIDTH + x];
-      cost += diff*diff;
+      ssd += diff*diff;
     }
   }
 
   if (rdo == 1) {
-    int coeff_cost = 0;
+    int coeff_abs = 0;
 
     // Estimate coding cost to be 1.5 * summ of abs coeffs.
-    for (y = y_px; y < y_px + width; ++y) {
-      for (x = x_px; x < x_px + width; ++x) {
-        coeff_cost += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]);
+    for (int y = y_px; y < y_px + width; ++y) {
+      for (int x = x_px; x < x_px + width; ++x) {
+        coeff_abs += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]);
       }
     }
-    cost += (coeff_cost + (coeff_cost >> 1)) * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
-
+    coeff_bits += 1.5 * coeff_abs;
   } else if (rdo >= 2) {
-    int coeff_cost = 0;
-
     coefficient coeff_temp[32 * 32];
     int8_t luma_scan_mode = get_scan_order(pred_cu->type, pred_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth);
 
     // Code coeffs using cabac to get a better estimate of real coding costs.
     coefficients_blit(&lcu->coeff.y[(y_px*LCU_WIDTH) + x_px], coeff_temp, width, width, LCU_WIDTH, width);
-    coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode);
-
-    // Multiply bit count with lambda to get RD-cost
-    cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+    coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode);
   }
 
-  return cost;
+  double bits = trtree_bits + coeff_bits;
+  return ssd + bits * encoder_state->global->cur_lambda_cost;
 }
 
 
-static int cu_rd_cost_chroma(const encoder_state *const encoder_state,
+static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
   const int x_px, const int y_px, const int depth,
   const cu_info *const pred_cu,
   lcu_t *const lcu)
@@ -832,15 +826,13 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state,
   const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x / 4) + (lcu_px.y / 4)*LCU_T_CU_WIDTH];
 
-  int x, y;
-
-  int cost = 0;
+  double trtree_bits = 0;
+  double coeff_bits = 0;
 
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
 
   if (depth < MAX_PU_DEPTH) {
-    int trtree_bits = 0;
     // cbf_c bits are present only when log2TrafoSize > 2
     if (tr_cu->tr_depth == depth) {
       // cbf_c bits are always present at transform depth 0.
@@ -850,7 +842,6 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state,
       trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1);
       trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1);
     }
-    cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
   } else if (PU_INDEX(x_px / 4, y_px / 4) != 0) {
     // For MAX_PU_DEPTH calculate chroma for previous depth for the first
     // block and return 0 cost for all others.
@@ -859,57 +850,58 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state,
 
   if (tr_cu->tr_depth > depth) {
     int offset = LCU_WIDTH >> (depth + 1);
+    double sum = 0;  // double, as in cu_rd_cost_luma: an int would truncate each child's fractional cost
 
-    cost += cu_rd_cost_chroma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu);
-    cost += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
-    cost += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
-    cost += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_chroma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+    sum += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
 
-    return cost;
+    return sum + trtree_bits * encoder_state->global->cur_lambda_cost;
   }
 
   // Chroma SSD
-  for (y = lcu_px.y; y < lcu_px.y + width; ++y) {
-    for (x = lcu_px.x; x < lcu_px.x + width; ++x) {
+  int ssd = 0;
+  for (int y = lcu_px.y; y < lcu_px.y + width; ++y) {
+    for (int x = lcu_px.x; x < lcu_px.x + width; ++x) {
       int diff = (int)lcu->rec.u[y * LCU_WIDTH_C + x] - (int)lcu->ref.u[y * LCU_WIDTH_C + x];
-      cost += diff * diff;
-      diff = (int)lcu->rec.v[y * LCU_WIDTH_C + x] - (int)lcu->ref.v[y * LCU_WIDTH_C + x];
-      cost += diff * diff;
+      ssd += diff * diff;
+    }
+  }
+  for (int y = lcu_px.y; y < lcu_px.y + width; ++y) {
+    for (int x = lcu_px.x; x < lcu_px.x + width; ++x) {
+      int diff = (int)lcu->rec.v[y * LCU_WIDTH_C + x] - (int)lcu->ref.v[y * LCU_WIDTH_C + x];
+      ssd += diff * diff;
     }
   }
 
   if (rdo == 1) {
-    int coeff_cost = 0;
+    int coeff_abs = 0;
 
     // Estimate coding cost to be 1.5 * summ of abs coeffs.
-    for (y = lcu_px.y; y < lcu_px.y + width; ++y) {
-      for (x = lcu_px.x; x < lcu_px.x + width; ++x) {
-        coeff_cost += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]);
-        coeff_cost += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]);
+    for (int y = lcu_px.y; y < lcu_px.y + width; ++y) {
+      for (int x = lcu_px.x; x < lcu_px.x + width; ++x) {
+        coeff_abs += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]);
+        coeff_abs += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]);
       }
     }
 
-    cost += (coeff_cost + (coeff_cost >> 1)) * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+    coeff_bits = 1.5 * coeff_abs;
   } else if (rdo >= 2) {
     coefficient coeff_temp[16 * 16];
     int8_t scan_order = get_scan_order(pred_cu->type, pred_cu->intra[0].mode_chroma, depth);
-
-    int coeff_cost = 0;
-
+    
     coefficients_blit(&lcu->coeff.u[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x],
-      coeff_temp, width, width, LCU_WIDTH_C, width);
-    coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
-
+                      coeff_temp, width, width, LCU_WIDTH_C, width);
+    coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
 
     coefficients_blit(&lcu->coeff.v[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x],
-      coeff_temp, width, width, LCU_WIDTH_C, width);
-    coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
-
-    // Multiply bit count with lambda to get RD-cost
-    cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+                      coeff_temp, width, width, LCU_WIDTH_C, width);
+    coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
   }
 
-  return cost;
+  double bits = trtree_bits + coeff_bits;
+  return ssd + bits * encoder_state->global->cur_lambda_cost;
 }
 
 
@@ -925,7 +917,7 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state,
 * \param intra_mode  Intra prediction mode.
 * \param cost_treshold  RD cost at which search can be stopped.
 */
-static int32_t search_intra_trdepth(encoder_state * const encoder_state,
+static double search_intra_trdepth(encoder_state * const encoder_state,
   int x_px, int y_px, int depth, int max_depth,
   int intra_mode, int cost_treshold,
   const cu_info *const pred_cu,
@@ -938,8 +930,8 @@ static int32_t search_intra_trdepth(encoder_state * const encoder_state,
 
   pixel nosplit_pixels[TR_MAX_WIDTH*TR_MAX_WIDTH];
 
-  int32_t split_cost = INT32_MAX;
-  int32_t nosplit_cost = INT32_MAX;
+  double split_cost = INT32_MAX;
+  double nosplit_cost = INT32_MAX;
 
   assert(width >= TR_MIN_WIDTH);
 
@@ -962,7 +954,7 @@ static int32_t search_intra_trdepth(encoder_state * const encoder_state,
   }
 
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+    split_cost = 3 * encoder_state->global->cur_lambda_cost;
 
     split_cost += search_intra_trdepth(encoder_state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
     if (split_cost < nosplit_cost) {
@@ -1200,7 +1192,8 @@ static void search_intra_rdo(encoder_state * const encoder_state,
       // Reset transform split data in lcu.cu for this area.
       lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
 
-      costs[rdo_mode] += search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
+      double mode_cost = search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
+      costs[rdo_mode] += (uint32_t)(0.5 + mode_cost);
     }
   }
 

From c164978e210ec7090b40df919ad232a9382c160e Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 17 Sep 2014 12:09:15 +0300
Subject: [PATCH 03/28] Add FULL_CU_SPLIT_SEARCH macro for disabling cu split
 optimization.

---
 src/search.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/search.c b/src/search.c
index a0fb37c7..efdbeae6 100644
--- a/src/search.c
+++ b/src/search.c
@@ -47,6 +47,13 @@
   && (x) + (block_width) <= (width) \
   && (y) + (block_height) <= (height))
 
+#ifndef CU_SPLIT_COST
+#  define CU_SPLIT_COST 9
+#endif
+#ifndef FULL_CU_SPLIT_SEARCH
+#  define FULL_CU_SPLIT_SEARCH false
+#endif
+
 /**
  * This is used in the hexagon_search to select 3 points to search.
  *
@@ -1404,11 +1411,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
     // Bitcost
     cost += (cur_cu->type == CU_INTER ? cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
   }
-
-#ifndef CU_SPLIT_COST
-#  define CU_SPLIT_COST 9
-#endif
-
+  
   // Recursively split all the way to max search depth.
   if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) {
     int half_cu = cu_width / 2;
@@ -1419,7 +1422,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
     // If skip mode was selected for the block, skip further search.
     // Skip mode means there's no coefficients in the block, so splitting
     // might not give any better results but takes more time to do.
-    if(cur_cu->type == CU_NOTSET || cbf) {
+    if (cur_cu->type == CU_NOTSET || cbf || FULL_CU_SPLIT_SEARCH) {
       split_cost += search_cu(encoder_state, x,           y,           depth + 1, work_tree);
       split_cost += search_cu(encoder_state, x + half_cu, y,           depth + 1, work_tree);
       split_cost += search_cu(encoder_state, x,           y + half_cu, depth + 1, work_tree);

From a0ab469c89fe9973066ef8db7da3b7a69ee65750 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 17 Sep 2014 12:26:26 +0300
Subject: [PATCH 04/28] Disable rdo_cost_intra.

---
 src/search.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.c b/src/search.c
index efdbeae6..a9765351 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1184,7 +1184,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
     int rdo_bitcost = intra_pred_ratecost(modes[rdo_mode], intra_preds);
     costs[rdo_mode] = rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5);
 
-    if (tr_depth == depth) {
+    if (0 && tr_depth == depth) {
       // The reconstruction is calculated again here, it could be saved from before..
       intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[rdo_mode], 0);
       costs[rdo_mode] += rdo_cost_intra(encoder_state, pred, orig_block, width, modes[rdo_mode], width == 4 ? 1 : 0);

From ccc575e2c6e0814199b39523c3e442d36cb576fe Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 23 Sep 2014 09:47:59 +0300
Subject: [PATCH 05/28] Disable transform tree bits.

---
 src/search.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/search.c b/src/search.c
index a9765351..2416b252 100644
--- a/src/search.c
+++ b/src/search.c
@@ -771,7 +771,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
       && width > TR_MIN_WIDTH
       && pred_cu->part_size != SIZE_NxN)
   {
-    trtree_bits += 1; // split_transform_flag
+    //trtree_bits += 1; // split_transform_flag
   }
 
   if (tr_cu->tr_depth > depth) {
@@ -787,7 +787,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   }
 
   if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) {
-    trtree_bits += 1;  // cbf_luma
+    //trtree_bits += 1;  // cbf_luma
   }
 
   unsigned ssd = 0;
@@ -843,11 +843,11 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
     // cbf_c bits are present only when log2TrafoSize > 2
     if (tr_cu->tr_depth == depth) {
       // cbf_c bits are always present at transform depth 0.
-      trtree_bits += 2;
+      //trtree_bits += 2;
     } else {
       // cbf_c bits are not present if cbf has already been set to 0.
-      trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1);
-      trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1);
+      //trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1);
+      //trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1);
     }
   } else if (PU_INDEX(x_px / 4, y_px / 4) != 0) {
     // For MAX_PU_DEPTH calculate chroma for previous depth for the first

From bc7d7d5cb6a68d541fbadf17ea48a2b4b796e840 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 23 Sep 2014 14:41:25 +0300
Subject: [PATCH 06/28] Add cu_info* as parameter to reconstruction functions.

- This is required so these functions can be used for searching. When NULL
  is given they take the CU from LCU struct as they did previously.

Conflicts:
	src/search.c
---
 src/intra.c     | 33 +++++++++++++++++++--------------
 src/intra.h     |  4 ++--
 src/search.c    | 10 +++++-----
 src/transform.c | 28 ++++++++++++++++------------
 src/transform.h |  4 ++--
 5 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index b4e90b56..49236100 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -656,20 +656,22 @@ void intra_get_planar_pred(pixel* src, int32_t srcstride, uint32_t width, pixel*
   }
 }
 
-void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu)
+void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   const vector2d lcu_px = { x & 0x3f, y & 0x3f };
-  cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
+  if (cur_cu == NULL) {
+    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+  }
   const int8_t width = LCU_WIDTH >> depth;
 
   if (depth == 0 || cur_cu->tr_depth > depth) {
     int offset = width / 2;
 
-    intra_recon_lcu_luma(encoder_state, x,          y,          depth+1, intra_mode, lcu);
-    intra_recon_lcu_luma(encoder_state, x + offset, y,          depth+1, intra_mode, lcu);
-    intra_recon_lcu_luma(encoder_state, x,          y + offset, depth+1, intra_mode, lcu);
-    intra_recon_lcu_luma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu);
+    intra_recon_lcu_luma(encoder_state, x,          y,          depth+1, intra_mode, NULL, lcu);
+    intra_recon_lcu_luma(encoder_state, x + offset, y,          depth+1, intra_mode, NULL, lcu);
+    intra_recon_lcu_luma(encoder_state, x,          y + offset, depth+1, intra_mode, NULL, lcu);
+    intra_recon_lcu_luma(encoder_state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
 
     if (depth < MAX_DEPTH) {
       cu_info *cu_a =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) +  (lcu_px.y>>3)        *LCU_T_CU_WIDTH];
@@ -699,25 +701,28 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int
     intra_recon(encoder, rec_shift, width * 2 + 8,
                 width, recbase_y, rec_stride, intra_mode, 0);
 
-    quantize_lcu_luma_residual(encoder_state, x, y, depth, lcu);
+    quantize_lcu_luma_residual(encoder_state, x, y, depth, NULL, lcu);
   }
 }
 
-void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu)
+void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   const vector2d lcu_px = { x & 0x3f, y & 0x3f };
-  cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
   const int8_t width = LCU_WIDTH >> depth;
   const int8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
 
+  if (cur_cu == NULL) {
+    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+  }
+
   if (depth == 0 || cur_cu->tr_depth > depth) {
     int offset = width / 2;
 
-    intra_recon_lcu_chroma(encoder_state, x,          y,          depth+1, intra_mode, lcu);
-    intra_recon_lcu_chroma(encoder_state, x + offset, y,          depth+1, intra_mode, lcu);
-    intra_recon_lcu_chroma(encoder_state, x,          y + offset, depth+1, intra_mode, lcu);
-    intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu);
+    intra_recon_lcu_chroma(encoder_state, x,          y,          depth+1, intra_mode, NULL, lcu);
+    intra_recon_lcu_chroma(encoder_state, x + offset, y,          depth+1, intra_mode, NULL, lcu);
+    intra_recon_lcu_chroma(encoder_state, x,          y + offset, depth+1, intra_mode, NULL, lcu);
+    intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
 
     if (depth < MAX_DEPTH) {
       cu_info *cu_a =  &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) +  (lcu_px.y>>3)        *LCU_T_CU_WIDTH];
@@ -772,6 +777,6 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i
                   2);
     }
 
-    quantize_lcu_chroma_residual(encoder_state, x, y, depth, lcu);
+    quantize_lcu_chroma_residual(encoder_state, x, y, depth, NULL, lcu);
   }
 }
diff --git a/src/intra.h b/src/intra.h
index e01733ae..3df46c9f 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -48,7 +48,7 @@ void intra_get_angular_pred(const encoder_control *encoder, pixel* src, int32_t
 
 void intra_recon(const encoder_control *encoder, pixel* rec, int32_t rec_stride, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
 
-void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu);
-void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu);
+void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu);
+void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu);
 
 #endif
diff --git a/src/search.c b/src/search.c
index 2416b252..f993750e 100644
--- a/src/search.c
+++ b/src/search.c
@@ -944,7 +944,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 
   if (depth > 0) {
     tr_cu->tr_depth = depth;
-    intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, lcu);
+    intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
     nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
 
     // Clear cbf bits because they have been set by the reconstruction.
@@ -1384,13 +1384,13 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
                          intra_mode,
                          intra_mode_chroma,
                          cur_cu->part_size);
-      intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
-      intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
+      intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]);
+      intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]);
     } else if (cur_cu->type == CU_INTER) {
       int cbf;
       inter_recon_lcu(encoder_state, encoder_state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);
-      quantize_lcu_luma_residual(encoder_state, x, y, depth, &work_tree[depth]);
-      quantize_lcu_chroma_residual(encoder_state, x, y, depth, &work_tree[depth]);
+      quantize_lcu_luma_residual(encoder_state, x, y, depth, NULL, &work_tree[depth]);
+      quantize_lcu_chroma_residual(encoder_state, x, y, depth, NULL, &work_tree[depth]);
 
       cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
 
diff --git a/src/transform.c b/src/transform.c
index daa6a11d..2b9042e2 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -523,12 +523,14 @@ int quantize_residual_trskip(
  * - lcu->cbf  coded block flags for the area
  * - lcu->cu.intra[].tr_skip  for the area
  */
-void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu)
+void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, cu_info *cur_cu, lcu_t* lcu)
 {
   // we have 64>>depth transform size
   const vector2d lcu_px = {x & 0x3f, y & 0x3f};
   const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
-  cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
+  if (cur_cu == NULL) {
+    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+  }
   const int8_t width = LCU_WIDTH>>depth;
   
   // Tell clang-analyzer what is up. For some reason it can't figure out from
@@ -538,10 +540,10 @@ void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x,
   // Split transform and increase depth
   if (depth == 0 || cur_cu->tr_depth > depth) {
     int offset = width / 2;
-    quantize_lcu_luma_residual(encoder_state, x,          y,          depth+1, lcu);
-    quantize_lcu_luma_residual(encoder_state, x + offset, y,          depth+1, lcu);
-    quantize_lcu_luma_residual(encoder_state, x,          y + offset, depth+1, lcu);
-    quantize_lcu_luma_residual(encoder_state, x + offset, y + offset, depth+1, lcu);
+    quantize_lcu_luma_residual(encoder_state, x,          y,          depth+1, NULL, lcu);
+    quantize_lcu_luma_residual(encoder_state, x + offset, y,          depth+1, NULL, lcu);
+    quantize_lcu_luma_residual(encoder_state, x,          y + offset, depth+1, NULL, lcu);
+    quantize_lcu_luma_residual(encoder_state, x + offset, y + offset, depth+1, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     if (depth < MAX_DEPTH) {
@@ -605,13 +607,15 @@ void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x,
 }
 
 
-void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu)
+void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, cu_info *cur_cu, lcu_t* lcu)
 {
   // we have 64>>depth transform size
   const vector2d lcu_px = {x & 0x3f, y & 0x3f};
   const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
-  cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH];
   const int8_t width = LCU_WIDTH>>depth;
+  if (cur_cu == NULL) {
+    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+  }
   
   // Tell clang-analyzer what is up. For some reason it can't figure out from
   // asserting just depth.
@@ -620,10 +624,10 @@ void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x
   // Split transform and increase depth
   if (depth == 0 || cur_cu->tr_depth > depth) {
     int offset = width / 2;
-    quantize_lcu_chroma_residual(encoder_state, x,          y,          depth+1, lcu);
-    quantize_lcu_chroma_residual(encoder_state, x + offset, y,          depth+1, lcu);
-    quantize_lcu_chroma_residual(encoder_state, x,          y + offset, depth+1, lcu);
-    quantize_lcu_chroma_residual(encoder_state, x + offset, y + offset, depth+1, lcu);
+    quantize_lcu_chroma_residual(encoder_state, x,          y,          depth+1, NULL, lcu);
+    quantize_lcu_chroma_residual(encoder_state, x + offset, y,          depth+1, NULL, lcu);
+    quantize_lcu_chroma_residual(encoder_state, x,          y + offset, depth+1, NULL, lcu);
+    quantize_lcu_chroma_residual(encoder_state, x + offset, y + offset, depth+1, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     if (depth < MAX_DEPTH) {
diff --git a/src/transform.h b/src/transform.h
index 1479e90e..e45b1fe6 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -46,7 +46,7 @@ void itransform2d(const encoder_control *encoder, int16_t *block,int16_t *coeff,
 
 int32_t get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset);
 
-void quantize_lcu_luma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, lcu_t* lcu);
-void quantize_lcu_chroma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, lcu_t* lcu);
+void quantize_lcu_luma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, cu_info *cur_cu, lcu_t* lcu);
+void quantize_lcu_chroma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, cu_info *cur_cu, lcu_t* lcu);
 
 #endif

From 51662e1081b69d206d953432681fd66e29d98091 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 23 Sep 2014 15:17:56 +0300
Subject: [PATCH 07/28] Fix differences between cu_rd_cost_luma and
 rdo_cost_intra.

---
 src/intra.c     |  2 +-
 src/rdo.c       |  4 +++-
 src/search.c    | 13 +++++++++++--
 src/transform.c |  4 +++-
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 49236100..3ccf0fb6 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -701,7 +701,7 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int
     intra_recon(encoder, rec_shift, width * 2 + 8,
                 width, recbase_y, rec_stride, intra_mode, 0);
 
-    quantize_lcu_luma_residual(encoder_state, x, y, depth, NULL, lcu);
+    quantize_lcu_luma_residual(encoder_state, x, y, depth, cur_cu, lcu);
   }
 }
 
diff --git a/src/rdo.c b/src/rdo.c
index ff493990..d3635f37 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -226,7 +226,9 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
     unsigned ssd = 0;
     // SSD between original and reconstructed
     for (i = 0; i < width*width; i++) {
-      int diff = temp_block[i]-block[i];
+      //int diff = temp_block[i]-block[i];
+      int diff = orig_block[i] - CLIP(0, 255, pred[i] + temp_block[i]);
+
       ssd += diff*diff;
     }
 
diff --git a/src/search.c b/src/search.c
index f993750e..d1eb5b2d 100644
--- a/src/search.c
+++ b/src/search.c
@@ -927,7 +927,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
 static double search_intra_trdepth(encoder_state * const encoder_state,
   int x_px, int y_px, int depth, int max_depth,
   int intra_mode, int cost_treshold,
-  const cu_info *const pred_cu,
+  cu_info *const pred_cu,
   lcu_t *const lcu)
 {
   const int width = LCU_WIDTH >> depth;
@@ -944,6 +944,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 
   if (depth > 0) {
     tr_cu->tr_depth = depth;
+    pred_cu->tr_depth = depth;
     intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
     nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
 
@@ -1195,6 +1196,10 @@ static void search_intra_rdo(encoder_state * const encoder_state,
       pred_cu.type = CU_INTRA;
       pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
       pred_cu.intra[0].mode = modes[rdo_mode];
+      pred_cu.intra[1].mode = modes[rdo_mode];
+      pred_cu.intra[2].mode = modes[rdo_mode];
+      pred_cu.intra[3].mode = modes[rdo_mode];
+      pred_cu.intra[0].mode_chroma = modes[rdo_mode];
 
       // Reset transform split data in lcu.cu for this area.
       lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
@@ -1212,6 +1217,10 @@ static void search_intra_rdo(encoder_state * const encoder_state,
     pred_cu.type = CU_INTRA;
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
     pred_cu.intra[0].mode = modes[0];
+    pred_cu.intra[1].mode = modes[0];
+    pred_cu.intra[2].mode = modes[0];
+    pred_cu.intra[3].mode = modes[0];
+    pred_cu.intra[0].mode_chroma = modes[0];
     search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu);
   }
 }
@@ -1294,7 +1303,7 @@ static int search_cu_intra(encoder_state * const encoder_state,
       }
       int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search);
       search_intra_rdo(encoder_state, 
-                       lcu_px.x, lcu_px.y, depth,
+                       x_px, y_px, depth,
                        ref_pixels, LCU_WIDTH,
                        cu_in_rec_buffer, cu_width * 2 + 8,
                        candidate_modes,
diff --git a/src/transform.c b/src/transform.c
index 2b9042e2..ee16d7b2 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -361,8 +361,10 @@ int quantize_residual(encoder_state *const encoder_state,
 
   // Quantize coeffs. (coeff -> quant_coeff)
   if (encoder_state->encoder_control->rdoq_enable) {
+    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
+    tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
     rdoq(encoder_state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
-         scan_order, cur_cu->type, cur_cu->tr_depth-cur_cu->depth);
+         scan_order, cur_cu->type, tr_depth);
   } else {
     quant(encoder_state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
           scan_order, cur_cu->type);

From 8a80845b914d89e60100240968c4c9abbd425f43 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 1 Oct 2014 12:31:10 +0300
Subject: [PATCH 08/28] Add chroma to transform split search.

---
 src/intra.c  |  2 +-
 src/search.c | 33 ++++++++++++++++++++++++++-------
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 3ccf0fb6..b50eb457 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -777,6 +777,6 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i
                   2);
     }
 
-    quantize_lcu_chroma_residual(encoder_state, x, y, depth, NULL, lcu);
+    quantize_lcu_chroma_residual(encoder_state, x, y, depth, cur_cu, lcu);
   }
 }
diff --git a/src/search.c b/src/search.c
index d1eb5b2d..69b7b603 100644
--- a/src/search.c
+++ b/src/search.c
@@ -931,11 +931,17 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
   lcu_t *const lcu)
 {
   const int width = LCU_WIDTH >> depth;
+  const int width_c = width > TR_MIN_WIDTH ? width / 2 : width;
+
   const int offset = width / 2;
   const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
 
-  pixel nosplit_pixels[TR_MAX_WIDTH*TR_MAX_WIDTH];
+  struct {
+    pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH];
+    pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH];
+    pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH];
+  } nosplit_pixels;
 
   double split_cost = INT32_MAX;
   double nosplit_cost = INT32_MAX;
@@ -945,11 +951,16 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
   if (depth > 0) {
     tr_cu->tr_depth = depth;
     pred_cu->tr_depth = depth;
-    intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
-    nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
 
-    // Clear cbf bits because they have been set by the reconstruction.
-    cbf_clear(&tr_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4));
+    nosplit_cost = 0.0;
+
+    intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
+    nosplit_cost += cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+
+    if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) {
+      intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
+      nosplit_cost += cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+    }
 
     // Early stop codition for the recursive search.
     // If the cost of any 1/4th of the transform is already larger than the
@@ -958,7 +969,11 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
       return nosplit_cost;
     }
 
-    pixels_blit(lcu->rec.y, nosplit_pixels, width, width, LCU_WIDTH, width);
+    pixels_blit(lcu->rec.y, nosplit_pixels.y, width, width, LCU_WIDTH, width);
+    if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) {
+      pixels_blit(lcu->rec.u, nosplit_pixels.u, width_c, width_c, LCU_WIDTH_C, width_c);
+      pixels_blit(lcu->rec.v, nosplit_pixels.v, width_c, width_c, LCU_WIDTH_C, width_c);
+    }
   }
 
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
@@ -985,7 +1000,11 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 
     // We only restore the pixel data and not coefficients or cbf data.
     // The only thing we really need are the border pixels.
-    pixels_blit(nosplit_pixels, lcu->rec.y, width, width, width, LCU_WIDTH);
+    pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH);
+    if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) {
+      pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C);
+      pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C);
+    }
 
     return nosplit_cost;
   }

From e1b801eb6ffe2f9b67ce79f5ab6915a0aa6693e4 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 1 Oct 2014 12:32:29 +0300
Subject: [PATCH 09/28] Add transform tree chroma cbf bits.

---
 src/search.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/src/search.c b/src/search.c
index 69b7b603..68f8d561 100644
--- a/src/search.c
+++ b/src/search.c
@@ -767,14 +767,37 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
 
+  bool split_transform_flag = tr_cu->tr_depth > depth;
+
+  // Add cost of intra split flag on transform tree.
+  bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3;
+  double tr_tree_bits = 0.0;
   if (width <= TR_MAX_WIDTH
       && width > TR_MIN_WIDTH
-      && pred_cu->part_size != SIZE_NxN)
+      && !intra_split_flag)
   {
-    //trtree_bits += 1; // split_transform_flag
+    const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
+    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, split_transform_flag);
   }
 
-  if (tr_cu->tr_depth > depth) {
+  // Add cost of cbf chroma bits on transform tree.
+  // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true
+  // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0
+  // if this and any previous transform block has no chroma coefficients.
+  // When searching the first block we don't actually know the real values,
+  // so this will code cbf as 0 and not code the cbf at all for descendants.
+  int tr_depth = depth - pred_cu->depth;
+  if (depth < MAX_PU_DEPTH) {  // log2TrafoSize > 2 
+    const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
+    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
+      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
+    }
+    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) {
+      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth));
+    }
+  }
+
+  if (split_transform_flag) {
     int offset = width / 2;
     double sum = 0;
 
@@ -989,6 +1012,35 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     if (split_cost < nosplit_cost) {
       split_cost += search_intra_trdepth(encoder_state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
     }
+
+    double tr_split_bit = 0.0;
+    double cbf_bits = 0.0;
+
+    bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3;
+    if (depth >= 1 && depth <= 3 && !intra_split_flag) {
+      const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
+      tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1);
+    }
+
+    // Add cost of cbf chroma bits on transform tree.
+    // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true
+    // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0
+    // if this and any previous transform block has no chroma coefficients.
+    // When searching the first block we don't actually know the real values,
+    // so this will code cbf as 0 and not code the cbf at all for descendants.
+    int tr_depth = depth - pred_cu->depth;
+    if (depth < MAX_PU_DEPTH) {  // log2TrafoSize > 2 
+      const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
+      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
+        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
+      }
+      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) {
+        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth));
+      }
+    }
+
+    double bits = tr_split_bit + cbf_bits;
+    split_cost += bits * encoder_state->global->cur_lambda_cost;
   } else {
     assert(width <= TR_MAX_WIDTH);
   }

From 85dea10f3f56a90f534f619e29296db1162874b1 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 1 Oct 2014 16:51:49 +0300
Subject: [PATCH 10/28] Clean up transform split search.

- Remove unnecessary checks and add explanatory comments.
---
 src/search.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/search.c b/src/search.c
index 68f8d561..10b3769a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -960,6 +960,8 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
   const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
 
+  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
+
   struct {
     pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH];
     pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH];
@@ -980,7 +982,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
     nosplit_cost += cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
 
-    if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) {
+    if (reconstruct_chroma) {
       intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
       nosplit_cost += cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }
@@ -993,12 +995,17 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     }
 
     pixels_blit(lcu->rec.y, nosplit_pixels.y, width, width, LCU_WIDTH, width);
-    if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) {
+    if (reconstruct_chroma) {
       pixels_blit(lcu->rec.u, nosplit_pixels.u, width_c, width_c, LCU_WIDTH_C, width_c);
       pixels_blit(lcu->rec.v, nosplit_pixels.v, width_c, width_c, LCU_WIDTH_C, width_c);
     }
   }
 
+  // Recurse further if all of the following:
+  // - Current depth is less than maximum depth of the search (max_depth).
+  //   - Maximum transform hierarchy depth is constrained by clipping
+  //     max_depth.
+  // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
     split_cost = 3 * encoder_state->global->cur_lambda_cost;
 
@@ -1016,8 +1023,9 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     double tr_split_bit = 0.0;
     double cbf_bits = 0.0;
 
-    bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3;
-    if (depth >= 1 && depth <= 3 && !intra_split_flag) {
+    // Add bits for split_transform_flag = 1, because transform depth search bypasses
+    // the normal recursion in the cost functions.
+    if (depth >= 1 && depth <= 3) {
       const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
       tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1);
     }
@@ -1028,8 +1036,9 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     // if this and any previous transform block has no chroma coefficients.
     // When searching the first block we don't actually know the real values,
     // so this will code cbf as 0 and not code the cbf at all for descendants.
-    int tr_depth = depth - pred_cu->depth;
-    if (depth < MAX_PU_DEPTH) {  // log2TrafoSize > 2 
+    {
+      const uint8_t tr_depth = depth - pred_cu->depth;
+
       const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
         cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
@@ -1053,7 +1062,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     // We only restore the pixel data and not coefficients or cbf data.
     // The only thing we really need are the border pixels.
     pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH);
-    if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) {
+    if (reconstruct_chroma) {
       pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C);
       pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C);
     }

From 296f142d9e258697ad14dff7bb7f0f22125325d0 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 1 Oct 2014 18:06:28 +0300
Subject: [PATCH 11/28] Retain coded block flag data during transform split
 search.

---
 src/search.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/search.c b/src/search.c
index 10b3769a..7e5e0d9c 100644
--- a/src/search.c
+++ b/src/search.c
@@ -967,6 +967,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH];
     pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH];
   } nosplit_pixels;
+  cu_cbf_t nosplit_cbf;
 
   double split_cost = INT32_MAX;
   double nosplit_cost = INT32_MAX;
@@ -979,10 +980,15 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 
     nosplit_cost = 0.0;
 
+    cbf_clear(&pred_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4));
+
     intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
     nosplit_cost += cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
 
     if (reconstruct_chroma) {
+      cbf_clear(&pred_cu->cbf.u, depth);
+      cbf_clear(&pred_cu->cbf.v, depth);
+
       intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
       nosplit_cost += cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }
@@ -994,6 +1000,8 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
       return nosplit_cost;
     }
 
+    nosplit_cbf = pred_cu->cbf;
+
     pixels_blit(lcu->rec.y, nosplit_pixels.y, width, width, LCU_WIDTH, width);
     if (reconstruct_chroma) {
       pixels_blit(lcu->rec.u, nosplit_pixels.u, width_c, width_c, LCU_WIDTH_C, width_c);
@@ -1059,6 +1067,8 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
   } else {
     lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
 
+    pred_cu->cbf = nosplit_cbf;
+
     // We only restore the pixel data and not coefficients or cbf data.
     // The only thing we really need are the border pixels.
     pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH);
@@ -1280,6 +1290,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
       pred_cu.intra[2].mode = modes[rdo_mode];
       pred_cu.intra[3].mode = modes[rdo_mode];
       pred_cu.intra[0].mode_chroma = modes[rdo_mode];
+      memset(&pred_cu.cbf, 0, sizeof(pred_cu.cbf));
 
       // Reset transform split data in lcu.cu for this area.
       lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
@@ -1301,6 +1312,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
     pred_cu.intra[2].mode = modes[0];
     pred_cu.intra[3].mode = modes[0];
     pred_cu.intra[0].mode_chroma = modes[0];
+    memset(&pred_cu.cbf, 0, sizeof(pred_cu.cbf));
     search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu);
   }
 }

From 3b04d39db40a1302df6986aa834c35e387e7dbf4 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 2 Oct 2014 11:51:34 +0300
Subject: [PATCH 12/28] Take cabac bits into account on transform tree.

---
 src/search.c | 53 ++++++++++++++++++----------------------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/src/search.c b/src/search.c
index 7e5e0d9c..f952a277 100644
--- a/src/search.c
+++ b/src/search.c
@@ -761,7 +761,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH];
 
   double coeff_bits = 0;
-  double trtree_bits = 0;
+  double tr_tree_bits = 0;
 
   // Check that lcu is not in 
   assert(x_px >= 0 && x_px < LCU_WIDTH);
@@ -771,7 +771,6 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
 
   // Add cost of intra split flag on transform tree.
   bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3;
-  double tr_tree_bits = 0.0;
   if (width <= TR_MAX_WIDTH
       && width > TR_MIN_WIDTH
       && !intra_split_flag)
@@ -780,23 +779,6 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
     tr_tree_bits += CTX_ENTROPY_FBITS(ctx, split_transform_flag);
   }
 
-  // Add cost of cbf chroma bits on transform tree.
-  // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true
-  // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0
-  // if this and any previous transform block has no chroma coefficients.
-  // When searching the first block we don't actually know the real values,
-  // so this will code cbf as 0 and not code the cbf at all for descendants.
-  int tr_depth = depth - pred_cu->depth;
-  if (depth < MAX_PU_DEPTH) {  // log2TrafoSize > 2 
-    const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
-    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
-    }
-    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth));
-    }
-  }
-
   if (split_transform_flag) {
     int offset = width / 2;
     double sum = 0;
@@ -806,7 +788,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
     sum += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
     sum += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
 
-    return sum + trtree_bits * encoder_state->global->cur_lambda_cost;
+    return sum + tr_tree_bits * encoder_state->global->cur_lambda_cost;
   }
 
   if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) {
@@ -841,7 +823,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
     coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode);
   }
 
-  double bits = trtree_bits + coeff_bits;
+  double bits = tr_tree_bits + coeff_bits;
   return ssd + bits * encoder_state->global->cur_lambda_cost;
 }
 
@@ -856,28 +838,29 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
   const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x / 4) + (lcu_px.y / 4)*LCU_T_CU_WIDTH];
 
-  double trtree_bits = 0;
+  double tr_tree_bits = 0;
   double coeff_bits = 0;
 
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
 
-  if (depth < MAX_PU_DEPTH) {
-    // cbf_c bits are present only when log2TrafoSize > 2
-    if (tr_cu->tr_depth == depth) {
-      // cbf_c bits are always present at transform depth 0.
-      //trtree_bits += 2;
-    } else {
-      // cbf_c bits are not present if cbf has already been set to 0.
-      //trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1);
-      //trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1);
-    }
-  } else if (PU_INDEX(x_px / 4, y_px / 4) != 0) {
+  if (PU_INDEX(x_px / 4, y_px / 4) != 0) {
     // For MAX_PU_DEPTH calculate chroma for previous depth for the first
     // block and return 0 cost for all others.
     return 0;
   }
 
+  if (depth < MAX_PU_DEPTH) {
+    const int tr_depth = depth - pred_cu->depth;
+    const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
+    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
+      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
+    }
+    if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) {
+      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth));
+    }
+  }
+
   if (tr_cu->tr_depth > depth) {
     int offset = LCU_WIDTH >> (depth + 1);
     int sum = 0;
@@ -887,7 +870,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
     sum += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
     sum += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
 
-    return sum + trtree_bits * encoder_state->global->cur_lambda_cost;
+    return sum + tr_tree_bits * encoder_state->global->cur_lambda_cost;
   }
 
   // Chroma SSD
@@ -930,7 +913,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
     coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
   }
 
-  double bits = trtree_bits + coeff_bits;
+  double bits = tr_tree_bits + coeff_bits;
   return ssd + bits * encoder_state->global->cur_lambda_cost;
 }
 

From 17473624d3c61c8aa29ff9836fc4f9018e60c483 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 2 Oct 2014 12:33:17 +0300
Subject: [PATCH 13/28] Add transform tree bit costs for cbf_luma.

---
 src/search.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/search.c b/src/search.c
index f952a277..7ffce895 100644
--- a/src/search.c
+++ b/src/search.c
@@ -756,6 +756,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
 {
   const int rdo = encoder_state->encoder_control->rdo;
   const int width = LCU_WIDTH >> depth;
+  const uint8_t pu_index = PU_INDEX(x_px / 4, y_px / 4);
 
   // cur_cu is used for TU parameters.
   cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH];
@@ -767,19 +768,19 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
 
-  bool split_transform_flag = tr_cu->tr_depth > depth;
+  const uint8_t tr_depth = tr_cu->tr_depth - depth;
 
-  // Add cost of intra split flag on transform tree.
+  // Add transform_tree split_transform_flag bit cost.
   bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3;
   if (width <= TR_MAX_WIDTH
       && width > TR_MIN_WIDTH
       && !intra_split_flag)
   {
     const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, split_transform_flag);
+    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0);
   }
 
-  if (split_transform_flag) {
+  if (tr_depth > 0) {
     int offset = width / 2;
     double sum = 0;
 
@@ -791,8 +792,14 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
     return sum + tr_tree_bits * encoder_state->global->cur_lambda_cost;
   }
 
-  if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) {
-    //trtree_bits += 1;  // cbf_luma
+  // Add transform_tree cbf_luma bit cost.
+  if (pred_cu->type == CU_INTRA ||
+      tr_depth > 0 ||
+      cbf_is_set(tr_cu->cbf.u, depth) ||
+      cbf_is_set(tr_cu->cbf.v, depth))
+  {
+    const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_luma[!tr_depth]);
+    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.y, depth + pu_index));
   }
 
   unsigned ssd = 0;

From 38b224cf693b68b026e1d6c8f098e67e34fb1dfb Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 6 Oct 2014 17:44:15 +0300
Subject: [PATCH 14/28] Change rest of cu split search costs to double.

---
 src/cu.h     |  4 ++--
 src/search.c | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index 3518743e..2acdf163 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -46,7 +46,7 @@ typedef struct {
  */
 typedef struct
 {
-  uint32_t cost;
+  double cost;
   uint32_t bitcost;
   int8_t mode;
   int8_t mode_chroma;
@@ -58,7 +58,7 @@ typedef struct
  */
 typedef struct
 {
-  uint32_t cost;
+  double cost;
   uint32_t bitcost;
   int16_t mv[2];
   int16_t mvd[2];
diff --git a/src/search.c b/src/search.c
index 7ffce895..13b051dc 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1072,13 +1072,13 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 }
 
 
-static void sort_modes(int8_t *modes, uint32_t *costs, int length)
+static void sort_modes(int8_t *modes, double *costs, int length)
 {
   int i, j;
   for (i = 0; i < length; ++i) {
     j = i;
     while (j > 0 && costs[j] < costs[j - 1]) {
-      SWAP(costs[j], costs[j - 1], uint32_t);
+      SWAP(costs[j], costs[j - 1], double);
       SWAP(modes[j], modes[j - 1], int8_t);
       --j;
     }
@@ -1089,7 +1089,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
                                  pixel *orig, int32_t origstride,
                                  pixel *rec, int16_t recstride,
                                  int width, int8_t *intra_preds,
-                                 int8_t modes[35], uint32_t costs[35])
+                                 int8_t modes[35], double costs[35])
 {
   cost_pixel_nxn_func *cost_func = pixels_get_sad_func(width);
 
@@ -1217,7 +1217,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
                              pixel *rec, int16_t recstride,
                              int8_t *intra_preds,
                              int modes_to_check,
-                             int8_t modes[35], uint32_t costs[35],
+                             int8_t modes[35], double costs[35],
                              lcu_t *lcu)
 {
   const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + encoder_state->encoder_control->tr_depth_intra);
@@ -1286,7 +1286,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
       lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
 
       double mode_cost = search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
-      costs[rdo_mode] += (uint32_t)(0.5 + mode_cost);
+      costs[rdo_mode] += mode_cost;
     }
   }
 
@@ -1312,7 +1312,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
  * Update lcu to have best modes at this depth.
  * \return Cost of best mode.
  */
-static int search_cu_intra(encoder_state * const encoder_state,
+static double search_cu_intra(encoder_state * const encoder_state,
                            const int x_px, const int y_px,
                            const int depth, lcu_t *lcu)
 {
@@ -1358,7 +1358,7 @@ static int search_cu_intra(encoder_state * const encoder_state,
     unsigned pu_index = PU_INDEX(x_px >> 2, y_px >> 2);
 
     int8_t modes[35];
-    uint32_t costs[35];
+    double costs[35];
     int8_t number_of_modes;
     bool skip_rough_search = (depth == 0 || encoder_state->encoder_control->rdo >= 3);
     if (!skip_rough_search) {
@@ -1415,11 +1415,11 @@ static int search_cu_intra(encoder_state * const encoder_state,
  * - All the final data for the LCU gets eventually copied to depth 0, which
  *   will be the final output of the recursion.
  */
-static int search_cu(encoder_state * const encoder_state, int x, int y, int depth, lcu_t work_tree[MAX_PU_DEPTH])
+static double search_cu(encoder_state * const encoder_state, int x, int y, int depth, lcu_t work_tree[MAX_PU_DEPTH])
 {
   const videoframe * const frame = encoder_state->tile->frame;
   int cu_width = LCU_WIDTH >> depth;
-  int cost = MAX_INT;
+  double cost = MAX_INT;
   cu_info *cur_cu;
   int x_local = (x&0x3f), y_local = (y&0x3f);
 #ifdef _DEBUG
@@ -1459,7 +1459,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
     if (depth >= MIN_INTRA_SEARCH_DEPTH &&
         depth <= MAX_INTRA_SEARCH_DEPTH)
     {
-      int mode_cost = search_cu_intra(encoder_state, x, y, depth, &work_tree[depth]);
+      double mode_cost = search_cu_intra(encoder_state, x, y, depth, &work_tree[depth]);
       if (mode_cost < cost) {
         cost = mode_cost;
         cur_cu->type = CU_INTRA;
@@ -1507,7 +1507,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
   if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) {
     int half_cu = cu_width / 2;
     // Using Cost = lambda * 9 to compensate on the price of the split
-    int split_cost = (int)(encoder_state->global->cur_lambda_cost + 0.5) * CU_SPLIT_COST;
+    double split_cost = encoder_state->global->cur_lambda_cost * CU_SPLIT_COST;
     int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
 
     // If skip mode was selected for the block, skip further search.

From b6710e78934b6d8349514456c69f84c3c7583fc1 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 6 Oct 2014 16:48:50 +0300
Subject: [PATCH 15/28] Add cabac bits for cu split flag.

---
 src/search.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/search.c b/src/search.c
index 13b051dc..b0a6ba1a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1509,6 +1509,18 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
     // Using Cost = lambda * 9 to compensate on the price of the split
     double split_cost = encoder_state->global->cur_lambda_cost * CU_SPLIT_COST;
     int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
+        
+    if (depth < MAX_DEPTH) {
+      vector2d lcu_cu = { x_local / 8, y_local / 8 };
+      cu_info *cu_array = &(&work_tree[depth])->cu[LCU_CU_OFFSET];
+      // Neighbour CUs live at column + row * LCU_T_CU_WIDTH; pick the context by how many are split deeper.
+      bool condA = x >= 8 && cu_array[(lcu_cu.x - 1) + lcu_cu.y * LCU_T_CU_WIDTH].depth > depth;
+      bool condL = y >= 8 && cu_array[lcu_cu.x + (lcu_cu.y - 1) * LCU_T_CU_WIDTH].depth > depth;
+      uint8_t split_model = condA + condL;
+      const cabac_ctx *ctx = &(encoder_state->cabac.ctx.split_flag_model[split_model]);
+      cost += CTX_ENTROPY_FBITS(ctx, 0);
+      split_cost += CTX_ENTROPY_FBITS(ctx, 1);
+    }
 
     // If skip mode was selected for the block, skip further search.
     // Skip mode means there's no coefficients in the block, so splitting

From 49ad845c3356e6f96010f81359b9dc191cc92246 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 6 Oct 2014 19:19:51 +0300
Subject: [PATCH 16/28] Add cabac bits for part_mode.

---
 src/search.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/search.c b/src/search.c
index b0a6ba1a..822b1563 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1522,6 +1522,12 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
       split_cost += CTX_ENTROPY_FBITS(ctx, 1);
     }
 
+    if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) {
+      const cabac_ctx *ctx = &(encoder_state->cabac.ctx.part_size_model[0]);
+      cost += CTX_ENTROPY_FBITS(ctx, 1);  // 2Nx2N
+      split_cost += CTX_ENTROPY_FBITS(ctx, 0);  // NxN
+    }
+
     // If skip mode was selected for the block, skip further search.
     // Skip mode means there's no coefficients in the block, so splitting
     // might not give any better results but takes more time to do.

From cbb2aa75b74c910e170283234a072f9a1dce8872 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 6 Oct 2014 21:46:12 +0300
Subject: [PATCH 17/28] Add macros for adjusting weight of distortion between
 luma and chroma.

- Everything needs to have a short name because windows has a maximum path
  length limitation that is breaking my testing framework.
---
 src/encoderstate.c |  5 +++++
 src/search.c       | 17 ++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/encoderstate.c b/src/encoderstate.c
index 2faf81ea..0a348bd6 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -43,6 +43,9 @@
 #include "sao.h"
 #include "rdo.h"
 
+#ifndef LMBD
+# define LMBD 1.0
+#endif
 
 /*!
   \brief Initializes lambda-value for current QP
@@ -70,6 +73,8 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
     lambda *= 0.95;
   }
 
+  lambda *= LMBD;
+
   encoder_state->global->cur_lambda_cost = lambda;
   encoder_state->global->cur_lambda_cost_sqrt = sqrt(lambda);
 }
diff --git a/src/search.c b/src/search.c
index 822b1563..4ee39d43 100644
--- a/src/search.c
+++ b/src/search.c
@@ -47,13 +47,20 @@
   && (x) + (block_width) <= (width) \
   && (y) + (block_height) <= (height))
 
-#ifndef CU_SPLIT_COST
-#  define CU_SPLIT_COST 9
+#ifndef CUSPL
+#  define CUSPL 9
 #endif
 #ifndef FULL_CU_SPLIT_SEARCH
 #  define FULL_CU_SPLIT_SEARCH false
 #endif
 
+#ifndef LMUL
+# define LMUL 1.0
+#endif
+#ifndef CMUL
+# define CMUL 1.0
+#endif
+
 /**
  * This is used in the hexagon_search to select 3 points to search.
  *
@@ -831,7 +838,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return ssd + bits * encoder_state->global->cur_lambda_cost;
+  return (double)ssd * LMUL + bits * encoder_state->global->cur_lambda_cost;
 }
 
 
@@ -921,7 +928,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return ssd + bits * encoder_state->global->cur_lambda_cost;
+  return (double)ssd * CMUL + bits * encoder_state->global->cur_lambda_cost;
 }
 
 
@@ -1507,7 +1514,7 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
   if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) {
     int half_cu = cu_width / 2;
     // Using Cost = lambda * 9 to compensate on the price of the split
-    double split_cost = encoder_state->global->cur_lambda_cost * CU_SPLIT_COST;
+    double split_cost = encoder_state->global->cur_lambda_cost * CUSPL;
     int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
         
     if (depth < MAX_DEPTH) {

From 28d1532578796e62b4cb0e43a5de1363fd76a1e4 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 8 Oct 2014 12:50:03 +0300
Subject: [PATCH 18/28] Make rd=1 use cabac for coeff cost estimation.

---
 src/search.c | 26 ++------------------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/src/search.c b/src/search.c
index 4ee39d43..47112206 100644
--- a/src/search.c
+++ b/src/search.c
@@ -818,17 +818,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
     }
   }
 
-  if (rdo == 1) {
-    int coeff_abs = 0;
-
-    // Estimate coding cost to be 1.5 * summ of abs coeffs.
-    for (int y = y_px; y < y_px + width; ++y) {
-      for (int x = x_px; x < x_px + width; ++x) {
-        coeff_abs += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]);
-      }
-    }
-    coeff_bits += 1.5 * coeff_abs;
-  } else if (rdo >= 2) {
+  if (rdo >= 1) {
     coefficient coeff_temp[32 * 32];
     int8_t luma_scan_mode = get_scan_order(pred_cu->type, pred_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth);
 
@@ -902,19 +892,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
     }
   }
 
-  if (rdo == 1) {
-    int coeff_abs = 0;
-
-    // Estimate coding cost to be 1.5 * summ of abs coeffs.
-    for (int y = lcu_px.y; y < lcu_px.y + width; ++y) {
-      for (int x = lcu_px.x; x < lcu_px.x + width; ++x) {
-        coeff_abs += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]);
-        coeff_abs += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]);
-      }
-    }
-
-    coeff_bits = 1.5 * coeff_abs;
-  } else if (rdo >= 2) {
+  if (rdo >= 1) {
     coefficient coeff_temp[16 * 16];
     int8_t scan_order = get_scan_order(pred_cu->type, pred_cu->intra[0].mode_chroma, depth);
     

From f164a5ba794ffa262082529a7b0ee86b852fe6eb Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 8 Oct 2014 17:39:55 +0300
Subject: [PATCH 19/28] Add fast transform skip estimation to rough intra
 search.

---
 src/search.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/search.c b/src/search.c
index 47112206..0fe9a505 100644
--- a/src/search.c
+++ b/src/search.c
@@ -60,6 +60,9 @@
 #ifndef CMUL
 # define CMUL 1.0
 #endif
+#ifndef MN // fast tr_skip Magic Number
+# define MN 0.0
+#endif
 
 /**
  * This is used in the hexagon_search to select 3 points to search.
@@ -1070,13 +1073,31 @@ static void sort_modes(int8_t *modes, double *costs, int length)
   }
 }
 
+
+static unsigned get_cost(pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width)
+{
+  unsigned cost = satd_func(pred, orig_block);
+  if (MN != 0 && width == 4) {
+    // If the mode looks better with SAD than SATD it might be a good
+    // candidate for transform skip. How much better SAD has to be is
+    // controlled by MN.
+    unsigned sad_cost = MN * sad_func(pred, orig_block);
+    if (sad_cost < cost) {
+      cost = sad_cost;
+    }
+  }
+  return cost;
+}
+
+
 static int8_t search_intra_rough(encoder_state * const encoder_state, 
                                  pixel *orig, int32_t origstride,
                                  pixel *rec, int16_t recstride,
                                  int width, int8_t *intra_preds,
                                  int8_t modes[35], double costs[35])
 {
-  cost_pixel_nxn_func *cost_func = pixels_get_sad_func(width);
+  cost_pixel_nxn_func *satd_func = pixels_get_satd_func(width);
+  cost_pixel_nxn_func *sad_func = pixels_get_sad_func(width);
 
   // Temporary block arrays
   pixel _pred[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
@@ -1099,7 +1120,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
     int16_t x, y;
     for (y = -1; y < recstride; y++) {
       ref[1][y*recstride - 1] = rec[y*recstride - 1];
-                                                }
+    }
     for (x = 0; x < recstride; x++) {
       ref[1][x - recstride] = rec[x - recstride];
     }
@@ -1127,7 +1148,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
   // the recursive search.
   for (int mode = 2; mode <= 34; mode += offset) {
     intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-    costs[modes_selected] = cost_func(pred, orig_block);
+    costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
     modes[modes_selected] = mode;
 
     min_cost = MIN(min_cost, costs[modes_selected]);
@@ -1147,7 +1168,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
       int8_t mode = modes[0] - offset;
       if (mode >= 2) {
         intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-        costs[modes_selected] = cost_func(pred, orig_block);
+        costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
         modes[modes_selected] = mode;
         ++modes_selected;
       }
@@ -1155,7 +1176,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
       mode = modes[0] + offset;
       if (mode <= 34) {
         intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-        costs[modes_selected] = cost_func(pred, orig_block);
+        costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
         modes[modes_selected] = mode;
         ++modes_selected;
       }
@@ -1178,7 +1199,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
 
     if (!has_mode) {
       intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-      costs[modes_selected] = cost_func(pred, orig_block);
+      costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
       modes[modes_selected] = mode;
       ++modes_selected;
     }

From 7a5cf5d8651986c7f692677b2618b74c78d389e4 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 9 Oct 2014 19:08:47 +0300
Subject: [PATCH 20/28] Add trskip mode cost to fast trskip mode decision.

---
 src/search.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/search.c b/src/search.c
index 0fe9a505..88e1ca9d 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1074,19 +1074,21 @@ static void sort_modes(int8_t *modes, double *costs, int length)
 }
 
 
-static unsigned get_cost(pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width)
+static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width)
 {
-  unsigned cost = satd_func(pred, orig_block);
+  double satd_cost = satd_func(pred, orig_block);
   if (MN != 0 && width == 4) {
     // If the mode looks better with SAD than SATD it might be a good
     // candidate for transform skip. How much better SAD has to be is
     // controlled by MN.
-    unsigned sad_cost = MN * sad_func(pred, orig_block);
-    if (sad_cost < cost) {
-      cost = sad_cost;
+    const cabac_ctx *ctx = &encoder_state->cabac.ctx.transform_skip_model_luma;
+    double trskip_cost = encoder_state->global->cur_lambda_cost_sqrt * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+    double sad_cost = MN * sad_func(pred, orig_block) + trskip_cost;
+    if (sad_cost < satd_cost) {
+      return sad_cost;
     }
   }
-  return cost;
+  return satd_cost;
 }
 
 
@@ -1148,7 +1150,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
   // the recursive search.
   for (int mode = 2; mode <= 34; mode += offset) {
     intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-    costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
+    costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     modes[modes_selected] = mode;
 
     min_cost = MIN(min_cost, costs[modes_selected]);
@@ -1168,7 +1170,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
       int8_t mode = modes[0] - offset;
       if (mode >= 2) {
         intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-        costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
+        costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
         modes[modes_selected] = mode;
         ++modes_selected;
       }
@@ -1176,7 +1178,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
       mode = modes[0] + offset;
       if (mode <= 34) {
         intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-        costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
+        costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
         modes[modes_selected] = mode;
         ++modes_selected;
       }
@@ -1199,7 +1201,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
 
     if (!has_mode) {
       intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
-      costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width);
+      costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
       modes[modes_selected] = mode;
       ++modes_selected;
     }

From a469c059a59b3674d1532fd7bf716d35e9b1671a Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 13 Oct 2014 10:48:39 +0300
Subject: [PATCH 21/28] Take chroma tr-skip bits into account.

---
 src/search.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/search.c b/src/search.c
index 88e1ca9d..e459bad8 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1082,8 +1082,10 @@ static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel *
     // candidate for transform skip. How much better SAD has to be is
     // controlled by MN.
     const cabac_ctx *ctx = &encoder_state->cabac.ctx.transform_skip_model_luma;
-    double trskip_cost = encoder_state->global->cur_lambda_cost_sqrt * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
-    double sad_cost = MN * sad_func(pred, orig_block) + trskip_cost;
+    double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
+    ctx = &encoder_state->cabac.ctx.transform_skip_model_chroma;
+    trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+    double sad_cost = MN * sad_func(pred, orig_block) + encoder_state->global->cur_lambda_cost_sqrt * trskip_bits;
     if (sad_cost < satd_cost) {
       return sad_cost;
     }

From c9e212ba92d0c79f2de5d3d60ba040d7562c41b4 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 15 Oct 2014 16:01:58 +0300
Subject: [PATCH 22/28] Add intra chroma mode search.

- Based on full chroma reconstruction so enabled only for --rd=2.
---
 src/intra.c     |  4 +--
 src/search.c    | 90 +++++++++++++++++++++++++++++++++++++++++++++----
 src/transform.c |  3 ++
 3 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index b50eb457..ce4b0f1d 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -775,8 +775,8 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i
                   rec_stride >> 1,
                   intra_mode,
                   2);
-    }
 
-    quantize_lcu_chroma_residual(encoder_state, x, y, depth, cur_cu, lcu);
+      quantize_lcu_chroma_residual(encoder_state, x, y, depth, cur_cu, lcu);
+    }
   }
 }
diff --git a/src/search.c b/src/search.c
index e459bad8..b02db394 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1060,6 +1060,68 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 }
 
 
+static double chroma_mode_bits(const encoder_state *encoder_state, int8_t chroma_mode, int8_t luma_mode)
+{
+  const cabac_ctx *ctx = &(encoder_state->cabac.ctx.chroma_pred_model[0]);
+  double mode_bits;
+  if (chroma_mode == luma_mode) {
+    mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
+  } else {
+    mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1);
+  }
+
+  return mode_bits;
+}
+
+
+static int8_t search_intra_chroma(encoder_state * const encoder_state,
+                                int x_px, int y_px, int depth,
+                                int8_t intra_mode,
+                                lcu_t *const lcu)
+{
+  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
+
+  if (reconstruct_chroma) {
+    const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
+    cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+
+    int8_t chroma_modes[5] = { 0, 26, 10, 1, intra_mode };
+    const int8_t num_chroma_modes = 5;
+
+    if (intra_mode == 0 || intra_mode == 26 || intra_mode == 10 || intra_mode == 1) {
+      chroma_modes[4] = 34;
+    }
+
+    struct {
+      double cost;
+      int8_t mode;
+    } chroma, best_chroma;
+
+    best_chroma.mode = 0;
+    best_chroma.cost = MAX_INT;
+
+    for (int8_t chroma_mode_i = 0; chroma_mode_i < num_chroma_modes; ++chroma_mode_i) {
+      chroma.mode = chroma_modes[chroma_mode_i];
+
+      intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, chroma.mode, NULL, lcu);
+      chroma.cost = cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
+
+      const cabac_ctx *ctx = &(encoder_state->cabac.ctx.chroma_pred_model[0]);
+      double mode_bits = chroma_mode_bits(encoder_state, chroma.mode, intra_mode);
+      chroma.cost += mode_bits * encoder_state->global->cur_lambda_cost;
+
+      if (chroma.cost < best_chroma.cost) {
+        best_chroma = chroma;
+      }
+    }
+
+    return best_chroma.mode;
+  }
+
+  return 100;
+}
+
+
 static void sort_modes(int8_t *modes, double *costs, int length)
 {
   int i, j;
@@ -1406,8 +1468,6 @@ static double search_cu_intra(encoder_state * const encoder_state,
     cur_cu->intra[pu_index].mode = modes[0];
     cur_cu->intra[pu_index].cost = costs[0];
     cur_cu->intra[pu_index].bitcost = intra_pred_ratecost(modes[0], candidate_modes);
-
-    cur_cu->intra[0].mode_chroma = cur_cu->intra[0].mode;
   }
 
   return cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].cost;
@@ -1480,13 +1540,22 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {
       int8_t intra_mode = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode;
-      int8_t intra_mode_chroma = cur_cu->intra[0].mode_chroma;
       lcu_set_intra_mode(&work_tree[depth], x, y, depth,
                          intra_mode,
-                         intra_mode_chroma,
+                         100,
                          cur_cu->part_size);
       intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]);
-      intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]);
+
+      if (PU_INDEX(x >> 2, y >> 2) == 0) {
+        int8_t intra_mode_chroma = intra_mode;
+        if (encoder_state->encoder_control->rdo >= 2) {
+          intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
+        }
+        lcu_set_intra_mode(&work_tree[depth], x, y, depth,
+                           intra_mode, intra_mode_chroma,
+                           cur_cu->part_size);
+        intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode_chroma, NULL, &work_tree[depth]);
+      }
     } else if (cur_cu->type == CU_INTER) {
       int cbf;
       inter_recon_lcu(encoder_state, encoder_state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);
@@ -1509,8 +1578,17 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
     cost = cu_rd_cost_luma(encoder_state, x_local, y_local, depth, cur_cu, &work_tree[depth]);
     cost += cu_rd_cost_chroma(encoder_state, x_local, y_local, depth, cur_cu, &work_tree[depth]);
     
+    double mode_bits;
     // Bitcost
-    cost += (cur_cu->type == CU_INTER ? cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+    if (cur_cu->type == CU_INTER) {
+      mode_bits = cur_cu->inter.bitcost;
+    } else {
+      mode_bits = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost;
+      if (PU_INDEX(x >> 2, y >> 2) == 0) {
+        mode_bits += chroma_mode_bits(encoder_state, cur_cu->intra[0].mode_chroma, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode);
+      }
+    }
+    cost += mode_bits * encoder_state->global->cur_lambda_cost;
   }
   
   // Recursively split all the way to max search depth.
diff --git a/src/transform.c b/src/transform.c
index ee16d7b2..5c10e401 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -650,6 +650,9 @@ void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x
   // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
   // left PU because the coordinates are correct.
   if (depth <= MAX_DEPTH || pu_index == 0) {
+    cbf_clear(&cur_cu->cbf.u, depth);
+    cbf_clear(&cur_cu->cbf.v, depth);
+
     const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH_C;
     pixel *recbase_u = &lcu->rec.u[chroma_offset];
     pixel *recbase_v = &lcu->rec.v[chroma_offset];

From 8a407b0313ee6d2b1475d4e17458478d29608a4d Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 15 Oct 2014 14:15:10 +0300
Subject: [PATCH 23/28] Estimate luma and chroma intra mode bits separately.

- Remove cu_info.intra[].cost and bitcost as unnecessary.
- Add luma_mode_bits to complement chroma_mode_bits and remove
  intra_pred_ratecost as unnecessary. The difference is that
  intra_pred_ratecost was coarser and included the chroma mode bits under
  the assumption that the chroma mode would be the same as the luma mode.
---
 src/cu.h     |  2 --
 src/rdo.c    | 20 --------------------
 src/rdo.h    |  1 -
 src/search.c | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index 2acdf163..77eff46a 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -46,8 +46,6 @@ typedef struct {
  */
 typedef struct
 {
-  double cost;
-  uint32_t bitcost;
   int8_t mode;
   int8_t mode_chroma;
   int8_t tr_skip;    //!< \brief transform skip flag
diff --git a/src/rdo.c b/src/rdo.c
index d3635f37..ef54d22c 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -127,26 +127,6 @@ const float f_entropy_bits[128] =
 };
 
 
-/**
- * \brief Helper function to find intra merge costs
- * \returns intra mode coding cost in bits
- */
-uint32_t intra_pred_ratecost(int16_t mode, int8_t *intra_preds)
-{
-   // merge mode -1 means they are not used -> cost 0
-   if(intra_preds[0] == -1) return 0;
-
-   // First candidate needs only one bit and two other need two
-   if(intra_preds[0] == mode) {
-     return 1;
-   } else if(intra_preds[1] == mode || intra_preds[2] == mode) {
-     return 2;
-   }
-   // Without merging the cost is 5 bits
-   return 5;
-}
-
-
 /**
  * \brief Function to compare RDO costs
  * \param rdo_costs array of current costs
diff --git a/src/rdo.h b/src/rdo.h
index 889f10d7..33954cf1 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -42,7 +42,6 @@ typedef struct
 extern const uint32_t g_go_rice_range[5];
 extern const uint32_t g_go_rice_prefix_len[5];
 
-uint32_t intra_pred_ratecost(int16_t mode, int8_t *intra_preds);
 int intra_rdo_cost_compare(uint32_t *rdo_costs,int8_t rdo_modes_to_check, uint32_t cost);
 
 void  rdoq(encoder_state *encoder_state, coefficient *coef, coefficient *dest_coeff, int32_t width,
diff --git a/src/search.c b/src/search.c
index b02db394..8941ec2c 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1048,7 +1048,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
     pred_cu->cbf = nosplit_cbf;
 
     // We only restore the pixel data and not coefficients or cbf data.
-    // The only thing we really need are the border pixels.
+    // The only thing we really need are the border pixels.
     pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH);
     if (reconstruct_chroma) {
       pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C);
@@ -1060,6 +1060,30 @@ static double search_intra_trdepth(encoder_state * const encoder_state,
 }
 
 
+static double luma_mode_bits(const encoder_state *encoder_state, int8_t luma_mode, const int8_t *intra_preds)
+{
+  double mode_bits;
+
+  bool mode_in_preds = false;
+  for (int i = 0; i < 3; ++i) {
+    if (luma_mode == intra_preds[i]) {
+      mode_in_preds = true;
+    }
+  }
+
+  const cabac_ctx *ctx = &(encoder_state->cabac.ctx.intra_mode_model);
+  mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds);
+
+  if (mode_in_preds) {
+    mode_bits += ((luma_mode == intra_preds[0]) ? 1 : 2);
+  } else {
+    mode_bits += 5;
+  }
+
+  return mode_bits;
+}
+
+
 static double chroma_mode_bits(const encoder_state *encoder_state, int8_t chroma_mode, int8_t luma_mode)
 {
   const cabac_ctx *ctx = &(encoder_state->cabac.ctx.chroma_pred_model[0]);
@@ -1275,7 +1299,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
   // affecting the halving search.
   int lambda_cost = (int)(encoder_state->global->cur_lambda_cost_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * intra_pred_ratecost(modes[mode_i], intra_preds);
+    costs[mode_i] += lambda_cost * luma_mode_bits(encoder_state, modes[mode_i], intra_preds);
   }
 
   sort_modes(modes, costs, modes_selected);
@@ -1334,7 +1358,7 @@ static void search_intra_rdo(encoder_state * const encoder_state,
   }
 
   for(rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
-    int rdo_bitcost = intra_pred_ratecost(modes[rdo_mode], intra_preds);
+    int rdo_bitcost = luma_mode_bits(encoder_state, modes[rdo_mode], intra_preds);
     costs[rdo_mode] = rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5);
 
     if (0 && tr_depth == depth) {
@@ -1424,13 +1448,14 @@ static double search_cu_intra(encoder_state * const encoder_state,
                                lcu);
   }
 
+  int8_t modes[35];
+  double costs[35];
+
   // Find best intra mode for 2Nx2N.
   {
     pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
     unsigned pu_index = PU_INDEX(x_px >> 2, y_px >> 2);
 
-    int8_t modes[35];
-    double costs[35];
     int8_t number_of_modes;
     bool skip_rough_search = (depth == 0 || encoder_state->encoder_control->rdo >= 3);
     if (!skip_rough_search) {
@@ -1466,11 +1491,9 @@ static double search_cu_intra(encoder_state * const encoder_state,
     }
 
     cur_cu->intra[pu_index].mode = modes[0];
-    cur_cu->intra[pu_index].cost = costs[0];
-    cur_cu->intra[pu_index].bitcost = intra_pred_ratecost(modes[0], candidate_modes);
   }
 
-  return cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].cost;
+  return costs[0];
 }
 
 
@@ -1583,7 +1606,17 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
     if (cur_cu->type == CU_INTER) {
       mode_bits = cur_cu->inter.bitcost;
     } else {
-      mode_bits = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost;
+      int8_t candidate_modes[3];
+      {
+        lcu_t *lcu = &work_tree[depth];
+        const vector2d lcu_px = { x & 0x3f, y & 0x3f };
+        const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
+        const cu_info *left_cu = ((x >> 3) ? &cur_cu[-1] : NULL);
+        const cu_info *above_cu = ((lcu_cu.y) ? &cur_cu[-LCU_T_CU_WIDTH] : NULL);
+        intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);
+      }
+
+      mode_bits = luma_mode_bits(encoder_state, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode, candidate_modes);
       if (PU_INDEX(x >> 2, y >> 2) == 0) {
         mode_bits += chroma_mode_bits(encoder_state, cur_cu->intra[0].mode_chroma, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode);
       }

From 3e6023dfb54dc377997e6366283bd26a6ea11960 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 15 Oct 2014 20:17:38 +0300
Subject: [PATCH 24/28] Rename search constants and set sane defaults.

---
 src/global.h |  2 +-
 src/search.c | 37 ++++++++++++++++++++++---------------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/global.h b/src/global.h
index df74c657..22a2ecb6 100644
--- a/src/global.h
+++ b/src/global.h
@@ -71,7 +71,7 @@ typedef int16_t coefficient;
 #   define MAX_INTRA_SEARCH_DEPTH 4
 #endif
 #ifndef MIN_INTRA_SEARCH_DEPTH
-#   define MIN_INTRA_SEARCH_DEPTH 0
+#   define MIN_INTRA_SEARCH_DEPTH 1
 #endif
 
 // Maximum CU depth when descending form LCU level.
diff --git a/src/search.c b/src/search.c
index 8941ec2c..70b13d76 100644
--- a/src/search.c
+++ b/src/search.c
@@ -47,21 +47,28 @@
   && (x) + (block_width) <= (width) \
   && (y) + (block_height) <= (height))
 
-#ifndef CUSPL
-#  define CUSPL 9
+// Extra cost for CU split.
+// Compensates for missing or incorrect bit costs. Must be recalculated if
+// bits are added or removed from cu-tree search.
+#ifndef CU_COST
+#  define CU_COST 3
 #endif
+// Disable early cu-split pruning.
 #ifndef FULL_CU_SPLIT_SEARCH
 #  define FULL_CU_SPLIT_SEARCH false
 #endif
-
-#ifndef LMUL
-# define LMUL 1.0
+// Modify weight of luma SSD.
+#ifndef LUMA_MULT
+# define LUMA_MULT 0.8
 #endif
-#ifndef CMUL
-# define CMUL 1.0
+// Modify weight of chroma SSD.
+#ifndef CHROMA_MULT
+# define CHROMA_MULT 1.5
 #endif
-#ifndef MN // fast tr_skip Magic Number
-# define MN 0.0
+// Normalize SAD for comparison against SATD to estimate transform skip
+// for 4x4 blocks.
+#ifndef TRSKIP_RATIO
+# define TRSKIP_RATIO 1.7
 #endif
 
 /**
@@ -831,7 +838,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LMUL + bits * encoder_state->global->cur_lambda_cost;
+  return (double)ssd * LUMA_MULT + bits * encoder_state->global->cur_lambda_cost;
 }
 
 
@@ -909,7 +916,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * CMUL + bits * encoder_state->global->cur_lambda_cost;
+  return (double)ssd * CHROMA_MULT + bits * encoder_state->global->cur_lambda_cost;
 }
 
 
@@ -1163,15 +1170,15 @@ static void sort_modes(int8_t *modes, double *costs, int length)
 static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width)
 {
   double satd_cost = satd_func(pred, orig_block);
-  if (MN != 0 && width == 4) {
+  if (TRSKIP_RATIO != 0 && width == 4) {
     // If the mode looks better with SAD than SATD it might be a good
     // candidate for transform skip. How much better SAD has to be is
-    // controlled by MN.
+    // controlled by TRSKIP_RATIO.
     const cabac_ctx *ctx = &encoder_state->cabac.ctx.transform_skip_model_luma;
     double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
     ctx = &encoder_state->cabac.ctx.transform_skip_model_chroma;
     trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
-    double sad_cost = MN * sad_func(pred, orig_block) + encoder_state->global->cur_lambda_cost_sqrt * trskip_bits;
+    double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + encoder_state->global->cur_lambda_cost_sqrt * trskip_bits;
     if (sad_cost < satd_cost) {
       return sad_cost;
     }
@@ -1628,7 +1635,7 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
   if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) {
     int half_cu = cu_width / 2;
     // Using Cost = lambda * 9 to compensate on the price of the split
-    double split_cost = encoder_state->global->cur_lambda_cost * CUSPL;
+    double split_cost = encoder_state->global->cur_lambda_cost * CU_COST;
     int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
         
     if (depth < MAX_DEPTH) {

From d12dbd4aa0db85ae663325fa7a725df1e04afd48 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 15 Oct 2014 22:11:45 +0300
Subject: [PATCH 25/28] Add fast intra chroma mode search.

---
 src/search.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/search.c b/src/search.c
index 70b13d76..3dade9d3 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1187,6 +1187,64 @@ static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel *
 }
 
 
+
+static void search_intra_chroma_rough(encoder_state * const encoder_state,
+                                      int x_px, int y_px, int depth,
+                                      const pixel *orig_u, const pixel *orig_v, int16_t origstride,
+                                      const pixel *rec_u, const pixel *rec_v, int16_t recstride,
+                                      int8_t luma_mode,
+                                      int8_t modes[5], double costs[5])
+{
+  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
+  if (!reconstruct_chroma) return;
+
+  const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
+  const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
+
+  modes[0] = 0;
+  modes[1] = 26;
+  modes[2] = 10;
+  modes[3] = 1;
+  if (luma_mode == 0 || luma_mode == 26 || luma_mode == 10 || luma_mode == 1) {
+    modes[4] = 34;
+  } else {
+    modes[4] = luma_mode;
+  }
+
+  cost_pixel_nxn_func *const satd_func = pixels_get_satd_func(width);
+  //cost_pixel_nxn_func *const sad_func = pixels_get_sad_func(width);
+
+  pixel _pred[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
+  pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT);
+
+  pixel _orig_block[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
+  pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT);
+
+  for (int i = 0; i < 5; ++i) {
+    costs[i] = encoder_state->global->cur_lambda_cost_sqrt * chroma_mode_bits(encoder_state, modes[i], luma_mode);
+  }
+  
+  // Chroma doesn't use filtered pixels, so filtered pixels pointer is NULL.
+  const pixel *ref[2] = { rec_u, NULL };
+  pixels_blit(orig_u, orig_block, width, width, origstride, width);
+  for (int i = 0; i < 5; ++i) {
+    intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 1);
+    //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
+    costs[i] += satd_func(pred, orig_block);
+  }
+
+  ref[0] = rec_v;
+  pixels_blit(orig_v, orig_block, width, width, origstride, width);
+  for (int i = 0; i < 5; ++i) {
+    intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 2);
+    //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
+    costs[i] += satd_func(pred, orig_block);
+  }
+
+  sort_modes(modes, costs, 5);
+}
+
+
 static int8_t search_intra_rough(encoder_state * const encoder_state, 
                                  pixel *orig, int32_t origstride,
                                  pixel *rec, int16_t recstride,
@@ -1521,6 +1579,11 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
   int cu_width = LCU_WIDTH >> depth;
   double cost = MAX_INT;
   cu_info *cur_cu;
+
+  const vector2d lcu_px = { x & 0x3f, y & 0x3f };
+  const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
+  lcu_t *const lcu = &work_tree[depth];
+
   int x_local = (x&0x3f), y_local = (y&0x3f);
 #ifdef _DEBUG
   int debug_split = 0;
@@ -1578,6 +1641,42 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
 
       if (PU_INDEX(x >> 2, y >> 2) == 0) {
         int8_t intra_mode_chroma = intra_mode;
+        
+        if (encoder_state->encoder_control->rdo >= 1) {
+          const videoframe * const frame = encoder_state->tile->frame;
+
+          int8_t modes[5];
+          double costs[5];
+
+          pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
+          pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
+
+          const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
+          const int16_t rec_stride = width_c * 2 + 8;
+          const int16_t out_stride = rec_stride;
+
+          intra_build_reference_border(encoder_state->encoder_control,
+                                       x, y, out_stride,
+                                       rec_u, rec_stride, COLOR_U,
+                                       frame->width / 2, frame->height / 2,
+                                       lcu);
+          intra_build_reference_border(encoder_state->encoder_control,
+                                       x, y, out_stride,
+                                       rec_v, rec_stride, COLOR_V,
+                                       frame->width / 2, frame->height / 2,
+                                       lcu);
+
+          vector2d lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 };
+          pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
+          pixel *ref_v = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
+
+          search_intra_chroma_rough(encoder_state, x, y, depth,
+                                    ref_u, ref_v, LCU_WIDTH_C, 
+                                    &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride,
+                                    intra_mode, modes, costs);
+          intra_mode_chroma = modes[0];
+        }
+
         if (encoder_state->encoder_control->rdo >= 2) {
           intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
         }
@@ -1615,9 +1714,6 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
     } else {
       int8_t candidate_modes[3];
       {
-        lcu_t *lcu = &work_tree[depth];
-        const vector2d lcu_px = { x & 0x3f, y & 0x3f };
-        const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
         const cu_info *left_cu = ((x >> 3) ? &cur_cu[-1] : NULL);
         const cu_info *above_cu = ((lcu_cu.y) ? &cur_cu[-LCU_T_CU_WIDTH] : NULL);
         intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);

From 3cf5e422e8c67bccbdd999592fc1706c0399cf6a Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 15 Oct 2014 23:07:28 +0300
Subject: [PATCH 26/28] Make fast chroma mode search select modes for slower
 chroma search.

---
 src/search.c | 51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/src/search.c b/src/search.c
index 3dade9d3..26bd65f4 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1108,6 +1108,7 @@ static double chroma_mode_bits(const encoder_state *encoder_state, int8_t chroma
 static int8_t search_intra_chroma(encoder_state * const encoder_state,
                                 int x_px, int y_px, int depth,
                                 int8_t intra_mode,
+                                int8_t modes[5], int8_t num_modes,
                                 lcu_t *const lcu)
 {
   const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
@@ -1116,13 +1117,6 @@ static int8_t search_intra_chroma(encoder_state * const encoder_state,
     const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
     cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
 
-    int8_t chroma_modes[5] = { 0, 26, 10, 1, intra_mode };
-    const int8_t num_chroma_modes = 5;
-
-    if (intra_mode == 0 || intra_mode == 26 || intra_mode == 10 || intra_mode == 1) {
-      chroma_modes[4] = 34;
-    }
-
     struct {
       double cost;
       int8_t mode;
@@ -1131,8 +1125,8 @@ static int8_t search_intra_chroma(encoder_state * const encoder_state,
     best_chroma.mode = 0;
     best_chroma.cost = MAX_INT;
 
-    for (int8_t chroma_mode_i = 0; chroma_mode_i < num_chroma_modes; ++chroma_mode_i) {
-      chroma.mode = chroma_modes[chroma_mode_i];
+    for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
+      chroma.mode = modes[chroma_mode_i];
 
       intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, chroma.mode, NULL, lcu);
       chroma.cost = cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
@@ -1193,7 +1187,7 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state,
                                       const pixel *orig_u, const pixel *orig_v, int16_t origstride,
                                       const pixel *rec_u, const pixel *rec_v, int16_t recstride,
                                       int8_t luma_mode,
-                                      int8_t modes[5], double costs[5])
+                                      int8_t modes[5], double costs[5], int num_modes)
 {
   const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
   if (!reconstruct_chroma) return;
@@ -1211,6 +1205,14 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state,
     modes[4] = luma_mode;
   }
 
+  for (int i = 0; i < 5; ++i) {
+    costs[i] = 0;
+  }
+
+  // If the number of modes is all of them, skip ordering them.
+  if (num_modes == 5) return;
+
+
   cost_pixel_nxn_func *const satd_func = pixels_get_satd_func(width);
   //cost_pixel_nxn_func *const sad_func = pixels_get_sad_func(width);
 
@@ -1220,14 +1222,12 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state,
   pixel _orig_block[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
   pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT);
 
-  for (int i = 0; i < 5; ++i) {
-    costs[i] = encoder_state->global->cur_lambda_cost_sqrt * chroma_mode_bits(encoder_state, modes[i], luma_mode);
-  }
   
   // Chroma doesn't use filtered pixels, so filtered pixels pointer is NULL.
   const pixel *ref[2] = { rec_u, NULL };
   pixels_blit(orig_u, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
+    if (modes[i] == luma_mode) continue;
     intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 1);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
@@ -1236,6 +1236,7 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state,
   ref[0] = rec_v;
   pixels_blit(orig_v, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
+    if (modes[i] == luma_mode) continue;
     intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 2);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
@@ -1641,8 +1642,12 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
 
       if (PU_INDEX(x >> 2, y >> 2) == 0) {
         int8_t intra_mode_chroma = intra_mode;
-        
-        if (encoder_state->encoder_control->rdo >= 1) {
+
+        // There is almost no benefit to doing the chroma mode search for
+        // rd2. Possibly because the luma mode search already takes chroma
+        // into account, so there is less of a chance of luma mode being
+        // really bad for chroma.
+        if (encoder_state->encoder_control->rdo < 2) {
           const videoframe * const frame = encoder_state->tile->frame;
 
           int8_t modes[5];
@@ -1670,16 +1675,22 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
           pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
           pixel *ref_v = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
 
+          // The number of modes to select for slower chroma search. Luma mode
+          // is always one of the modes, so 2 means the final decision is made
+          // between luma mode and one other mode that looks the best
+          // according to search_intra_chroma_rough.
+          // When tested 2 modes is around -0.5% bdrate compared to 0 and 5 modes
+          // is around -0.8.
+          int num_modes = 2;
+
           search_intra_chroma_rough(encoder_state, x, y, depth,
                                     ref_u, ref_v, LCU_WIDTH_C, 
                                     &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride,
-                                    intra_mode, modes, costs);
-          intra_mode_chroma = modes[0];
+                                    intra_mode, modes, costs, num_modes);
+
+          intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, modes, num_modes, &work_tree[depth]);
         }
 
-        if (encoder_state->encoder_control->rdo >= 2) {
-          intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
-        }
         lcu_set_intra_mode(&work_tree[depth], x, y, depth,
                            intra_mode, intra_mode_chroma,
                            cur_cu->part_size);

From 02ec26fceaac09093fc9b5c0e54b5646ba52c93f Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 16 Oct 2014 00:42:22 +0300
Subject: [PATCH 27/28] Try different number of chroma intra modes for
 different depths.

- And avoid doing extra work if no extra modes are tested for certain depths.
---
 src/search.c | 92 ++++++++++++++++++++++++----------------------------
 1 file changed, 42 insertions(+), 50 deletions(-)

diff --git a/src/search.c b/src/search.c
index 26bd65f4..552b5bd1 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1187,7 +1187,7 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state,
                                       const pixel *orig_u, const pixel *orig_v, int16_t origstride,
                                       const pixel *rec_u, const pixel *rec_v, int16_t recstride,
                                       int8_t luma_mode,
-                                      int8_t modes[5], double costs[5], int num_modes)
+                                      int8_t modes[5], double costs[5])
 {
   const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
   if (!reconstruct_chroma) return;
@@ -1195,24 +1195,10 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state,
   const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
   const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
 
-  modes[0] = 0;
-  modes[1] = 26;
-  modes[2] = 10;
-  modes[3] = 1;
-  if (luma_mode == 0 || luma_mode == 26 || luma_mode == 10 || luma_mode == 1) {
-    modes[4] = 34;
-  } else {
-    modes[4] = luma_mode;
-  }
-
   for (int i = 0; i < 5; ++i) {
     costs[i] = 0;
   }
 
-  // If the number of modes is all of them, skip ordering them.
-  if (num_modes == 5) return;
-
-
   cost_pixel_nxn_func *const satd_func = pixels_get_satd_func(width);
   //cost_pixel_nxn_func *const sad_func = pixels_get_sad_func(width);
 
@@ -1636,7 +1622,7 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
       int8_t intra_mode = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode;
       lcu_set_intra_mode(&work_tree[depth], x, y, depth,
                          intra_mode,
-                         100,
+                         intra_mode,
                          cur_cu->part_size);
       intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]);
 
@@ -1650,50 +1636,56 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d
         if (encoder_state->encoder_control->rdo < 2) {
           const videoframe * const frame = encoder_state->tile->frame;
 
-          int8_t modes[5];
           double costs[5];
-
-          pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
-          pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
-
-          const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
-          const int16_t rec_stride = width_c * 2 + 8;
-          const int16_t out_stride = rec_stride;
-
-          intra_build_reference_border(encoder_state->encoder_control,
-                                       x, y, out_stride,
-                                       rec_u, rec_stride, COLOR_U,
-                                       frame->width / 2, frame->height / 2,
-                                       lcu);
-          intra_build_reference_border(encoder_state->encoder_control,
-                                       x, y, out_stride,
-                                       rec_v, rec_stride, COLOR_V,
-                                       frame->width / 2, frame->height / 2,
-                                       lcu);
-
-          vector2d lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 };
-          pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
-          pixel *ref_v = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
+          int8_t modes[5] = { 0, 26, 10, 1, 34 };
+          if (intra_mode != 0 && intra_mode != 26 && intra_mode != 10 && intra_mode != 1) {
+            modes[4] = intra_mode;
+          }
 
           // The number of modes to select for slower chroma search. Luma mode
           // is always one of the modes, so 2 means the final decision is made
           // between luma mode and one other mode that looks the best
           // according to search_intra_chroma_rough.
-          // When tested 2 modes is around -0.5% bdrate compared to 0 and 5 modes
-          // is around -0.8.
-          int num_modes = 2;
+          const int8_t modes_in_depth[5] = { 1, 1, 1, 1, 2 };
+          int num_modes = modes_in_depth[depth];
 
-          search_intra_chroma_rough(encoder_state, x, y, depth,
-                                    ref_u, ref_v, LCU_WIDTH_C, 
-                                    &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride,
-                                    intra_mode, modes, costs, num_modes);
+          if (num_modes != 1 && num_modes != 5) {
+            pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
+            pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
 
-          intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, modes, num_modes, &work_tree[depth]);
+            const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
+            const int16_t rec_stride = width_c * 2 + 8;
+            const int16_t out_stride = rec_stride;
+
+            intra_build_reference_border(encoder_state->encoder_control,
+                                         x, y, out_stride,
+                                         rec_u, rec_stride, COLOR_U,
+                                         frame->width / 2, frame->height / 2,
+                                         lcu);
+            intra_build_reference_border(encoder_state->encoder_control,
+                                         x, y, out_stride,
+                                         rec_v, rec_stride, COLOR_V,
+                                         frame->width / 2, frame->height / 2,
+                                         lcu);
+
+            vector2d lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 };
+            pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
+            pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];  // V plane, passed as orig_v
+
+            search_intra_chroma_rough(encoder_state, x, y, depth,
+                                      ref_u, ref_v, LCU_WIDTH_C,
+                                      &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride,
+                                      intra_mode, modes, costs);
+          }
+
+          if (num_modes > 1) {
+            intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, modes, num_modes, &work_tree[depth]);
+            lcu_set_intra_mode(&work_tree[depth], x, y, depth,
+                               intra_mode, intra_mode_chroma,
+                               cur_cu->part_size);
+          }
         }
 
-        lcu_set_intra_mode(&work_tree[depth], x, y, depth,
-                           intra_mode, intra_mode_chroma,
-                           cur_cu->part_size);
         intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode_chroma, NULL, &work_tree[depth]);
       }
     } else if (cur_cu->type == CU_INTER) {

From afb9e8c3f45297a430c8eae03e735349892a004e Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 16 Oct 2014 03:23:33 +0300
Subject: [PATCH 28/28] Remove extra parameter sets.

---
 src/encoder_state-bitstream.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 0157ce08..3f2d2383 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -689,24 +689,26 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
   }
 
   if (main_state->global->is_radl_frame) {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      encoder_state_write_bitstream_aud(main_state);
+    if (main_state->global->frame == 0) {
+      // Access Unit Delimiter (AUD)
+      if (encoder->aud_enable)
+        encoder_state_write_bitstream_aud(main_state);
 
-    // Video Parameter Set (VPS)
-    nal_write(stream, NAL_VPS_NUT, 0, 1);
-    encoder_state_write_bitstream_vid_parameter_set(main_state);
-    bitstream_align(stream);
+      // Video Parameter Set (VPS)
+      nal_write(stream, NAL_VPS_NUT, 0, 1);
+      encoder_state_write_bitstream_vid_parameter_set(main_state);
+      bitstream_align(stream);
 
-    // Sequence Parameter Set (SPS)
-    nal_write(stream, NAL_SPS_NUT, 0, 1);
-    encoder_state_write_bitstream_seq_parameter_set(main_state);
-    bitstream_align(stream);
+      // Sequence Parameter Set (SPS)
+      nal_write(stream, NAL_SPS_NUT, 0, 1);
+      encoder_state_write_bitstream_seq_parameter_set(main_state);
+      bitstream_align(stream);
 
-    // Picture Parameter Set (PPS)
-    nal_write(stream, NAL_PPS_NUT, 0, 1);
-    encoder_state_write_bitstream_pic_parameter_set(main_state);
-    bitstream_align(stream);
+      // Picture Parameter Set (PPS)
+      nal_write(stream, NAL_PPS_NUT, 0, 1);
+      encoder_state_write_bitstream_pic_parameter_set(main_state);
+      bitstream_align(stream);
+    }
 
     if (main_state->global->frame == 0) {
       // Prefix SEI