Rebase transform split search.

Ari Koivula 2014-09-05 11:29:30 +03:00
parent 79b86ce6e1
commit c5fa824347
7 changed files with 291 additions and 126 deletions

View file

@@ -16,7 +16,7 @@
<AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4244;4204;4206;4028;4152</DisableSpecificWarnings>
<OpenMPSupport>false</OpenMPSupport>
-<TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700</TreatSpecificWarningsAsErrors>
+<TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700;4020;4021</TreatSpecificWarningsAsErrors>
</ClCompile>
<Link>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>

View file

@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
-VisualStudioVersion = 12.0.30110.0
+VisualStudioVersion = 12.0.30723.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}"
EndProject

View file

@@ -90,7 +90,9 @@ typedef int16_t coefficient;
#define LCU_WIDTH_C (LCU_WIDTH / 2) /*!< spec: CtbWidthC and CtbHeightC */
#define TR_MAX_LOG2_SIZE 5 /*!< spec: Log2MaxTrafoSize <= Min(CtbLog2SizeY, 5) */
-#define TR_MAX_WIDTH (1 << 5) /*!< spec: Log2MaxTrafoSize */
+#define TR_MAX_WIDTH (1 << TR_MAX_LOG2_SIZE)
+#define TR_MIN_LOG2_SIZE 2 /*!< spec: Log2MinTrafoSize */
+#define TR_MIN_WIDTH (1 << TR_MIN_LOG2_SIZE)
#if LCU_WIDTH != 64
#error "Kvazaar only supports LCU_WIDTH == 64"
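These derived macros pin down the legal transform sizes. A minimal standalone sketch (not part of the commit) of how transform depth maps to width under them, assuming the usual LCU_WIDTH of 64:

#include <assert.h>

#define LCU_WIDTH 64
#define TR_MAX_LOG2_SIZE 5
#define TR_MAX_WIDTH (1 << TR_MAX_LOG2_SIZE) /* 32 */
#define TR_MIN_LOG2_SIZE 2
#define TR_MIN_WIDTH (1 << TR_MIN_LOG2_SIZE) /* 4 */

int main(void)
{
  int depth;
  /* Width halves at each depth: 64, 32, 16, 8, 4. Depth 0 (width 64) is
   * wider than TR_MAX_WIDTH, which is why 64x64 blocks must always be
   * transform-split before coding. */
  for (depth = 1; depth <= 4; ++depth) {
    int width = LCU_WIDTH >> depth;
    assert(width <= TR_MAX_WIDTH && width >= TR_MIN_WIDTH);
  }
  return 0;
}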

View file

@@ -658,7 +658,7 @@ void intra_get_planar_pred(pixel* src, int32_t srcstride, uint32_t width, pixel*
}
}
-void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int depth, lcu_t *lcu)
+void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu)
{
const encoder_control * const encoder = encoder_state->encoder_control;
const vector2d lcu_px = { x & 0x3f, y & 0x3f };
@@ -669,10 +669,10 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int
if (depth == 0 || cur_cu->tr_depth > depth) {
int offset = width / 2;
-intra_recon_lcu_luma(encoder_state, x, y, depth+1, lcu);
-intra_recon_lcu_luma(encoder_state, x + offset, y, depth+1, lcu);
-intra_recon_lcu_luma(encoder_state, x, y + offset, depth+1, lcu);
-intra_recon_lcu_luma(encoder_state, x + offset, y + offset, depth+1, lcu);
+intra_recon_lcu_luma(encoder_state, x, y, depth+1, intra_mode, lcu);
+intra_recon_lcu_luma(encoder_state, x + offset, y, depth+1, intra_mode, lcu);
+intra_recon_lcu_luma(encoder_state, x, y + offset, depth+1, intra_mode, lcu);
+intra_recon_lcu_luma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu);
if (depth < MAX_DEPTH) {
cu_info *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) + (lcu_px.y>>3) *LCU_T_CU_WIDTH];
@@ -700,7 +700,7 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int
intra_build_reference_border(encoder, x, y,(int16_t)width * 2 + 8, rec, (int16_t)width * 2 + 8, 0,
pic_width, pic_height, lcu);
intra_recon(encoder, rec_shift, width * 2 + 8,
-width, recbase_y, rec_stride, cur_cu->intra[pu_index].mode, 0);
+width, recbase_y, rec_stride, intra_mode, 0);
quantize_lcu_luma_residual(encoder_state, x, y, depth, lcu);
}

View file

@@ -48,7 +48,7 @@ void intra_get_angular_pred(const encoder_control *encoder, pixel* src, int32_t
void intra_recon(const encoder_control *encoder, pixel* rec, int32_t rec_stride, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
-void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, lcu_t *lcu);
+void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu);
void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, lcu_t *lcu);
#endif

View file

@@ -626,7 +626,26 @@ static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_tree[M
}
-static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth, int pred_mode, int chroma_mode, int part_mode)
+static void lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth)
+{
+const int width_cu = LCU_CU_WIDTH >> depth;
+const vector2d lcu_cu = { (x_px & (LCU_WIDTH - 1)) / 8, (y_px & (LCU_WIDTH - 1)) / 8 };
+cu_info *const cur_cu = &lcu->cu[lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH + LCU_CU_OFFSET];
+int x, y;
+// Depth 4 doesn't go inside the loop. Set the top-left CU.
+cur_cu->tr_depth = tr_depth;
+for (y = 0; y < width_cu; ++y) {
+for (x = 0; x < width_cu; ++x) {
+cu_info *cu = &cur_cu[x + y * LCU_T_CU_WIDTH];
+cu->tr_depth = tr_depth;
+}
+}
+}
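A typical call site (mirroring how this helper is used later in the commit) resets an area to "no transform split" before a transform-depth search:

/* Setting tr_depth equal to the CU's own depth marks the whole area
 * as having no transform splits. */
lcu_set_trdepth(lcu, x_px, y_px, depth, depth);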
+static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pred_mode, int chroma_mode, int part_mode)
{
const int width_cu = LCU_CU_WIDTH >> depth;
const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
@@ -639,9 +658,9 @@ static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int tr
cu_info *cu = &lcu_cu[x_cu + y_cu * LCU_T_CU_WIDTH];
cu->depth = MAX_DEPTH;
cu->type = CU_INTRA;
// It is assumed that cu->intra[].mode's are already set.
cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode = pred_mode;
cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode_chroma = chroma_mode;
cu->part_size = part_mode;
-cu->tr_depth = tr_depth;
return;
}
@@ -657,7 +676,6 @@ static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int tr
cu->intra[3].mode = pred_mode;
cu->intra[0].mode_chroma = chroma_mode;
cu->part_size = part_mode;
-cu->tr_depth = tr_depth;
cu->coded = 1;
}
}
@@ -716,144 +734,263 @@ static void lcu_set_coeff(lcu_t *lcu, int x_px, int y_px, int depth, cu_info *cu
/**
-* Calculate "final cost" for the block
-* \return Cost of block
-*
-* Take SSD between reconstruction and original and add cost from
-* coding (bitcost * lambda) and cost for coding coefficients (estimated
-* here as (coefficient_sum * 1.5) * lambda)
-*/
-static int lcu_get_final_cost_luma(const encoder_state * const encoder_state,
-const int x_px, const int y_px,
-const int depth, lcu_t *lcu)
+* Calculate RD cost for a Coding Unit.
+* \return Cost of block
+* \param pred_cu CU used for prediction parameters.
+*
+* Calculates the RDO cost of a single CU that will not be split further.
+* Takes into account SSD of reconstruction and the cost of encoding whatever
+* prediction unit data needs to be coded.
+*/
+static int cu_rd_cost_luma(const encoder_state *const encoder_state,
+const int x_px, const int y_px, const int depth,
+const cu_info *const pred_cu,
+lcu_t *const lcu)
{
-cu_info *cur_cu;
-int x_local = (x_px&0x3f), y_local = (y_px&0x3f);
-int cost = 0;
-int coeff_cost = 0;
const int rdo = encoder_state->encoder_control->rdo;
+const int width = LCU_WIDTH >> depth;
-int width = LCU_WIDTH>>depth;
-int x,y;
-cur_cu = &lcu->cu[LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH];
+// tr_cu is used for TU parameters.
+cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH];
+int x, y;
+int cost = 0;
+// Check that the coordinates are inside the LCU.
+assert(x_px >= 0 && x_px < LCU_WIDTH);
+assert(y_px >= 0 && y_px < LCU_WIDTH);
+{
+int trtree_bits = 0;
+if (width <= TR_MAX_WIDTH
+&& width > TR_MIN_WIDTH
+&& pred_cu->part_size != SIZE_NxN) {
+trtree_bits += 1; // split_transform_flag
+}
+cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+}
+if (tr_cu->tr_depth > depth) {
+int offset = width / 2;
+cost += cu_rd_cost_luma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu);
+cost += cu_rd_cost_luma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+cost += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+cost += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+return cost;
+}
+if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) {
+int trtree_bits = 1; // cbf_luma
+cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+}
// SSD between reconstruction and original
-for (y = y_local; y < y_local+width; ++y) {
-for (x = x_local; x < x_local+width; ++x) {
+for (y = y_px; y < y_px + width; ++y) {
+for (x = x_px; x < x_px + width; ++x) {
int diff = (int)lcu->rec.y[y * LCU_WIDTH + x] - (int)lcu->ref.y[y * LCU_WIDTH + x];
cost += diff*diff;
}
}
-if(rdo == 1) {
-// sum of coeffs
-for (y = y_local; y < y_local+width; ++y) {
-for (x = x_local; x < x_local+width; ++x) {
+if (rdo == 1) {
+int coeff_cost = 0;
+// Estimate coding cost to be 1.5 * sum of abs coeffs.
+for (y = y_px; y < y_px + width; ++y) {
+for (x = x_px; x < x_px + width; ++x) {
coeff_cost += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]);
}
}
-// Coefficient costs
-cost += (coeff_cost + (coeff_cost>>1)) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+cost += (coeff_cost + (coeff_cost >> 1)) * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
-// Calculate actual bit costs for coding the coeffs
-// RDO
} else if (rdo == 2) {
-coefficient coeff_temp[32*32];
-int i;
-int blocks = (width == 64)?4:1;
-int8_t luma_scan_mode = get_scan_order(cur_cu->type, cur_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth);
+int coeff_cost = 0;
-for(i = 0; i < blocks; i++) {
-// For 64x64 blocks we need to do transform split to 32x32
-int blk_y = i&2 ? 32:0 + y_local;
-int blk_x = i&1 ? 32:0 + x_local;
-int blockwidth = (width == 64)?32:width;
+coefficient coeff_temp[32 * 32];
+int8_t luma_scan_mode = get_scan_order(pred_cu->type, pred_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth);
+// Code coeffs using cabac to get a better estimate of real coding costs.
+coefficients_blit(&lcu->coeff.y[(y_px*LCU_WIDTH) + x_px], coeff_temp, width, width, LCU_WIDTH, width);
+coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode);
-// Calculate luma coeff bit count
-coefficients_blit(&lcu->coeff.y[(blk_y*LCU_WIDTH)+blk_x],coeff_temp,blockwidth,blockwidth,LCU_WIDTH,blockwidth);
-coeff_cost += get_coeff_cost(encoder_state, coeff_temp, blockwidth, 0, luma_scan_mode);
-}
// Multiply bit count with lambda to get RD-cost
-cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
}
return cost;
}
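The rdo == 1 path above approximates the coefficient rate as 1.5 times the sum of absolute coefficient values, computed with integer math as sum + (sum >> 1). A standalone sketch of that estimate (hypothetical helper, not part of the commit):

#include <stdlib.h>
#include <stdint.h>

/* Approximate coefficient bits as 1.5 * sum(|coeff|) without floats:
 * sum + (sum >> 1). E.g. a sum of 10 yields an estimate of 15. The final
 * cost then adds estimate * lambda, as in the function above. */
static int estimate_coeff_bits(const int16_t *coeff, int width, int stride)
{
  int sum = 0;
  int x, y;
  for (y = 0; y < width; ++y) {
    for (x = 0; x < width; ++x) {
      sum += abs(coeff[y * stride + x]);
    }
  }
  return sum + (sum >> 1);
}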
-static int lcu_get_final_cost_chroma(const encoder_state * const encoder_state,
-const int x_px, const int y_px,
-const int depth, lcu_t *lcu)
-{
-cu_info *cur_cu;
-int x_local = (x_px&0x3f), y_local = (y_px&0x3f);
-int cost = 0;
-int coeff_cost = 0;
-const int rdo = encoder_state->encoder_control->rdo;
-int width = LCU_WIDTH>>depth;
-int x,y;
-cur_cu = &lcu->cu[LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH];
+static int cu_rd_cost_chroma(const encoder_state *const encoder_state,
+const int x_px, const int y_px, const int depth,
+const cu_info *const pred_cu,
+lcu_t *const lcu)
+{
+const vector2d lcu_px = { x_px / 2, y_px / 2 };
+const int rdo = encoder_state->encoder_control->rdo;
+const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
+cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x / 4) + (lcu_px.y / 4)*LCU_T_CU_WIDTH];
+int x, y;
+int cost = 0;
+assert(x_px >= 0 && x_px < LCU_WIDTH);
+assert(y_px >= 0 && y_px < LCU_WIDTH);
+if (depth < MAX_PU_DEPTH) {
+int trtree_bits = 0;
+// cbf_c bits are present only when log2TrafoSize > 2.
+if (tr_cu->tr_depth == depth) {
+// cbf_c bits are always present at transform depth 0.
+trtree_bits += 2;
+} else {
+// cbf_c bits are not present if cbf has already been set to 0.
+trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1);
+trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1);
+}
+cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+} else if (PU_INDEX(x_px / 4, y_px / 4) != 0) {
+// For MAX_PU_DEPTH, chroma is calculated at the previous depth for the
+// first block; return 0 cost for all others.
+return 0;
+}
+if (tr_cu->tr_depth > depth) {
+int offset = LCU_WIDTH >> (depth + 1);
+cost += cu_rd_cost_chroma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu);
+cost += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
+cost += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
+cost += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+return cost;
+}
// Chroma SSD
-for (y = y_local>>1; y < (y_local+width)>>1; ++y) {
-for (x = x_local>>1; x < (x_local+width)>>1; ++x) {
-int diff = (int)lcu->rec.u[y * (LCU_WIDTH>>1) + x] - (int)lcu->ref.u[y * (LCU_WIDTH>>1) + x];
-cost += diff*diff;
-diff = (int)lcu->rec.v[y * (LCU_WIDTH>>1) + x] - (int)lcu->ref.v[y * (LCU_WIDTH>>1) + x];
-cost += diff*diff;
+for (y = lcu_px.y; y < lcu_px.y + width; ++y) {
+for (x = lcu_px.x; x < lcu_px.x + width; ++x) {
+int diff = (int)lcu->rec.u[y * LCU_WIDTH_C + x] - (int)lcu->ref.u[y * LCU_WIDTH_C + x];
+cost += diff * diff;
+diff = (int)lcu->rec.v[y * LCU_WIDTH_C + x] - (int)lcu->ref.v[y * LCU_WIDTH_C + x];
+cost += diff * diff;
}
}
-if(rdo == 1) {
-// Chroma sum of coeffs
-for (y = y_local>>1; y < (y_local+width)>>1; ++y) {
-for (x = x_local>>1; x < (x_local+width)>>1; ++x) {
-coeff_cost += abs((int)lcu->coeff.u[y * (LCU_WIDTH>>1) + x]);
-coeff_cost += abs((int)lcu->coeff.v[y * (LCU_WIDTH>>1) + x]);
-}
-}
-// Coefficient costs
-cost += (coeff_cost + (coeff_cost>>1)) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+if (rdo == 1) {
+int coeff_cost = 0;
-// Calculate actual bit costs for coding the coeffs
-// RDO
+// Estimate coding cost to be 1.5 * sum of abs coeffs.
+for (y = lcu_px.y; y < lcu_px.y + width; ++y) {
+for (x = lcu_px.x; x < lcu_px.x + width; ++x) {
+coeff_cost += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]);
+coeff_cost += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]);
+}
+}
+cost += (coeff_cost + (coeff_cost >> 1)) * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
} else if (rdo == 2) {
-coefficient coeff_temp_u[16*16];
-coefficient coeff_temp_v[16*16];
-int i;
-int blocks = (width == 64)?4:1;
-int8_t chroma_scan_mode = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth);
+coefficient coeff_temp[16 * 16];
+int8_t scan_order = get_scan_order(pred_cu->type, pred_cu->intra[0].mode_chroma, depth);
-for(i = 0; i < blocks; i++) {
-// For 64x64 blocks we need to do transform split to 32x32
-int blk_y = i&2 ? 32:0 + y_local;
-int blk_x = i&1 ? 32:0 + x_local;
-int blockwidth = (width == 64)?32:width;
+int coeff_cost = 0;
-blk_y >>= 1;
-blk_x >>= 1;
-if (blockwidth > 4) {
-// Chroma is 1/4th of luma unless luma is 4x4.
-blockwidth >>= 1;
-} else if (x_px % 8 != 0 || y_px % 8 != 0) {
-// Only add chroma cost for 4x4 blocks for the one on the 8x8 grid.
-break;
-}
+coefficients_blit(&lcu->coeff.u[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x],
+coeff_temp, width, width, LCU_WIDTH_C, width);
+coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
-coefficients_blit(&lcu->coeff.u[(blk_y*(LCU_WIDTH>>1))+blk_x],coeff_temp_u,blockwidth,blockwidth,LCU_WIDTH>>1,blockwidth);
-coefficients_blit(&lcu->coeff.v[(blk_y*(LCU_WIDTH>>1))+blk_x],coeff_temp_v,blockwidth,blockwidth,LCU_WIDTH>>1,blockwidth);
-coeff_cost += get_coeff_cost(encoder_state, coeff_temp_u, blockwidth, 2, chroma_scan_mode);
-coeff_cost += get_coeff_cost(encoder_state, coeff_temp_v, blockwidth, 2, chroma_scan_mode);
-}
+coefficients_blit(&lcu->coeff.v[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x],
+coeff_temp, width, width, LCU_WIDTH_C, width);
+coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order);
// Multiply bit count with lambda to get RD-cost
-cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
+cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
}
return cost;
}
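The chroma cbf bit estimate above reduces to a small decision: at the transform-tree root both cbf_cb and cbf_cr are always coded, while deeper levels only code a flag that was still set at the parent depth. A standalone sketch of that logic (hypothetical helper; the one-bit-per-depth cbf layout is an assumption, stand-in for the source's cbf_is_set):

#include <stdint.h>

/* Stand-in for the source's cbf_is_set: is the flag set at this depth?
 * (Assumed bit layout, for illustration only.) */
static int cbf_is_set_example(uint16_t cbf, int depth)
{
  return (cbf >> depth) & 1;
}

/* At the transform-tree root both chroma cbf bits are coded; deeper
 * levels only code a flag still set at the parent depth. */
static int chroma_cbf_bits(uint16_t cbf_u, uint16_t cbf_v, int tr_depth, int depth)
{
  if (tr_depth == depth) {
    return 2;
  }
  return cbf_is_set_example(cbf_u, depth - 1)
       + cbf_is_set_example(cbf_v, depth - 1);
}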
+/**
+* \brief Perform search for best intra transform split configuration.
+*
+* This function does a recursive search for the best intra transform split
+* configuration for a given intra prediction mode.
+*
+* \return RD cost of the best transform split configuration. The resulting
+* splits are stored in lcu->cu.
+* \param depth Current transform depth.
+* \param max_depth Depth to which TR split will be tried.
+* \param intra_mode Intra prediction mode.
+* \param cost_treshold RD cost at which search can be stopped.
+*/
+static int32_t search_intra_trdepth(encoder_state * const encoder_state,
+int x_px, int y_px, int depth, int max_depth,
+int intra_mode, int cost_treshold,
+const cu_info *const pred_cu,
+lcu_t *const lcu)
+{
+const int width = LCU_WIDTH >> depth;
+const int offset = width / 2;
+const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
+cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+pixel nosplit_pixels[TR_MAX_WIDTH*TR_MAX_WIDTH];
+int32_t split_cost = INT32_MAX;
+int32_t nosplit_cost = INT32_MAX;
+assert(width >= TR_MIN_WIDTH);
+if (depth > 0) {
+tr_cu->tr_depth = depth;
+intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, lcu);
+nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+// Clear cbf bits because they have been set by the reconstruction.
+cbf_clear(&tr_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4));
+// Early stop condition for the recursive search.
+// If the cost of any 1/4th of the transform is already larger than the
+// whole transform, assume that splitting further is a bad idea.
+if (nosplit_cost >= cost_treshold) {
+return nosplit_cost;
+}
+pixels_blit(lcu->rec.y, nosplit_pixels, width, width, LCU_WIDTH, width);
+}
+if (depth < max_depth && depth < MAX_PU_DEPTH) {
+split_cost = 3 * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5);
+split_cost += search_intra_trdepth(encoder_state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
+if (split_cost < nosplit_cost) {
+split_cost += search_intra_trdepth(encoder_state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
+}
+if (split_cost < nosplit_cost) {
+split_cost += search_intra_trdepth(encoder_state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
+}
+if (split_cost < nosplit_cost) {
+split_cost += search_intra_trdepth(encoder_state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
+}
+} else {
+assert(width <= TR_MAX_WIDTH);
+}
+if (depth == 0 || split_cost < nosplit_cost) {
+return split_cost;
+} else {
+lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
+// We only restore the pixel data and not coefficients or cbf data.
+// The only thing we really need is the border pixels.
+pixels_blit(nosplit_pixels, lcu->rec.y, width, width, width, LCU_WIDTH);
+return nosplit_cost;
+}
+}
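The recursion above prunes aggressively: each quadrant is only searched while the accumulated split cost still beats the unsplit cost. The same pattern, reduced to a sketch with hypothetical names (lambda, nosplit_cost, and search_quadrant are placeholders, not source identifiers):

/* Sketch of the early-stop pattern: accumulate the four quadrant costs,
 * but stop as soon as splitting can no longer beat the unsplit cost. */
int32_t split_cost = 3 * lambda; /* rough cost of the extra split flags */
int i;
for (i = 0; i < 4 && split_cost < nosplit_cost; ++i) {
  /* Each sub-search also receives nosplit_cost as its cost_treshold,
   * so it can bail out early on its own. */
  split_cost += search_quadrant(i);
}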
static void sort_modes(int8_t *modes, uint32_t *costs, int length)
{
@@ -995,12 +1132,17 @@ static int8_t search_intra_rough(encoder_state * const encoder_state,
static void search_intra_rdo(encoder_state * const encoder_state,
int x_px, int y_px, int depth,
pixel *orig, int32_t origstride,
-pixel *rec, int16_t recstride,
-int width, int8_t *intra_preds,
+int8_t *intra_preds,
+int modes_to_check,
int8_t modes[35], uint32_t costs[35],
int modes_to_check)
lcu_t *lcu)
{
+const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + encoder_state->encoder_control->tr_depth_intra);
+const int width = LCU_WIDTH >> depth;
pixel pred[LCU_WIDTH * LCU_WIDTH + 1];
pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1];
int rdo_mode;
@@ -1040,15 +1182,26 @@ static void search_intra_rdo(encoder_state * const encoder_state,
}
for(rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
-int rdo_bitcost;
+int rdo_bitcost = intra_pred_ratecost(modes[rdo_mode], intra_preds);
+costs[rdo_mode] = rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5);
+if (tr_depth == depth) {
+// The reconstruction is calculated again here; it could be saved from before.
+intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[rdo_mode], 0);
+costs[rdo_mode] += rdo_cost_intra(encoder_state, pred, orig_block, width, modes[rdo_mode], width == 4 ? 1 : 0);
+} else {
+// Perform transform split search and save mode RD cost for the best one.
+cu_info pred_cu;
+pred_cu.depth = depth;
+pred_cu.type = CU_INTRA;
+pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
+pred_cu.intra[0].mode = modes[rdo_mode];
-costs[rdo_mode] = rdo_cost_intra(encoder_state,pred,orig_block,width,modes[rdo_mode], width == 4 ? 1 : 0);
-// Bitcost also calculated again for this mode
-rdo_bitcost = intra_pred_ratecost(modes[rdo_mode],intra_preds);
-// Add bitcost * lambda
-costs[rdo_mode] += rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5);
+// Reset transform split data in lcu.cu for this area.
+lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
+costs[rdo_mode] += search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
}
}
sort_modes(modes, costs, modes_to_check);
@@ -1109,13 +1262,18 @@ static int search_cu_intra(encoder_state * const encoder_state,
cu_width, candidate_modes,
modes, costs);
+// Set transform depth to current depth, meaning no transform splits.
+lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
if (encoder_state->encoder_control->rdo == 2) {
int num_modes_to_check = MIN(number_of_modes, (cu_width <= 8) ? 8 : 3);
search_intra_rdo(encoder_state,
lcu_px.x, lcu_px.y, depth,
ref_pixels, LCU_WIDTH,
-cu_in_rec_buffer, cu_width * 2 + 8,
-cu_width, candidate_modes,
-modes, costs, num_modes_to_check);
+candidate_modes,
+num_modes_to_check,
+modes, costs, lcu);
}
cur_cu->intra[pu_index].mode = modes[0];
@@ -1194,11 +1352,13 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
// Reconstruct best mode because we need the reconstructed pixels for
// mode search of adjacent CUs.
if (cur_cu->type == CU_INTRA) {
-lcu_set_intra_mode(&work_tree[depth], x, y, depth, cur_cu->tr_depth,
-cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode,
-cur_cu->intra[0].mode_chroma,
+int8_t intra_mode = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode;
+int8_t intra_mode_chroma = cur_cu->intra[0].mode_chroma;
+lcu_set_intra_mode(&work_tree[depth], x, y, depth,
+intra_mode,
+intra_mode_chroma,
cur_cu->part_size);
-intra_recon_lcu_luma(encoder_state, x, y, depth, &work_tree[depth]);
+intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]);
intra_recon_lcu_chroma(encoder_state, x, y, depth, &work_tree[depth]);
} else if (cur_cu->type == CU_INTER) {
int cbf;
@@ -1219,8 +1379,8 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
}
}
if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
-cost = lcu_get_final_cost_luma(encoder_state, x, y, depth, &work_tree[depth]);
-cost += lcu_get_final_cost_chroma(encoder_state, x, y, depth, &work_tree[depth]);
+cost = cu_rd_cost_luma(encoder_state, x_local, y_local, depth, cur_cu, &work_tree[depth]);
+cost += cu_rd_cost_chroma(encoder_state, x_local, y_local, depth, cur_cu, &work_tree[depth]);
// Bitcost
cost += (cur_cu->type == CU_INTER ? cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);

View file

@@ -417,6 +417,9 @@ int quantize_residual(encoder_state *const encoder_state,
int has_coeffs = 0;
+assert(width <= TR_MAX_WIDTH);
+assert(width >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
{
int y, x;
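The block that follows in the source computes that per-pixel difference. A minimal sketch of the subtraction (ref_in and pred_in come from the comment above; in_stride and the residual buffer are assumed for illustration):

/* residual = ref_in - pred_in, per pixel. */
for (y = 0; y < width; ++y) {
  for (x = 0; x < width; ++x) {
    residual[x + y * width] = (int16_t)(ref_in[x + y * in_stride]
                                      - pred_in[x + y * in_stride]);
  }
}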