diff --git a/src/encoder.c b/src/encoder.c
index d971c824..fe16f596 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1871,7 +1871,7 @@ void encode_transform_tree(encoder_control* encoder, int32_t x, int32_t y, uint8
 
     if(cur_cu->type == CU_INTRA)
     {
-      int pu_index = x_pu&1 + 2 * (y_pu&1);
+      int pu_index = PU_INDEX(x_pu, y_pu);
       int luma_mode = cur_cu->intra[pu_index].mode;
       scan_idx_luma = SCAN_DIAG;
 
@@ -1944,7 +1944,6 @@ void encode_transform_tree(encoder_control* encoder, int32_t x, int32_t y, uint8
 
     // Check for non-zero coeffs
     cbf_y = 0;
-    memset(cur_cu->coeff_top_y, 0, MAX_PU_DEPTH + 4);
     for (i = 0; i < width * width; i++) {
       if (coeff_y[i] != 0) {
         // Found one, we can break here
@@ -1955,8 +1954,12 @@ void encode_transform_tree(encoder_control* encoder, int32_t x, int32_t y, uint8
             cur_cu->coeff_top_y[d] = 1;
           }
         } else {
-          int pu_index = x_pu&1 + 2 * (y_pu&1);
+          int pu_index = (x_pu & 1) + 2 * (y_pu & 1);
+          int d;
           cur_cu->coeff_top_y[depth + pu_index] = 1;
+          for (d = 0; d < depth; ++d) {
+            cur_cu->coeff_top_y[d] = 1;
+          }
         }
         break;
       }
@@ -2014,7 +2017,6 @@ void encode_transform_tree(encoder_control* encoder, int32_t x, int32_t y, uint8
       }
 
       transform_chroma(encoder, cur_cu, chroma_depth, base_u, pred_u, coeff_u, scan_idx_chroma, pre_quant_coeff, block);
-      memset(cur_cu->coeff_top_u, 0, MAX_PU_DEPTH + 4);
       for (i = 0; i < chroma_size; i++) {
         if (coeff_u[i] != 0) {
           int d;
@@ -2025,7 +2027,6 @@ void encode_transform_tree(encoder_control* encoder, int32_t x, int32_t y, uint8
         }
       }
       transform_chroma(encoder, cur_cu, chroma_depth, base_v, pred_v, coeff_v, scan_idx_chroma, pre_quant_coeff, block);
-      memset(cur_cu->coeff_top_v, 0, MAX_PU_DEPTH + 4);
       for (i = 0; i < chroma_size; i++) {
         if (coeff_v[i] != 0) {
           int d;
diff --git a/src/global.h b/src/global.h
index 4675ade1..a9a47d58 100644
--- a/src/global.h
+++ b/src/global.h
@@ -59,7 +59,7 @@ typedef int16_t coefficient;
 #define MAX_INTER_SEARCH_DEPTH 3
 #define MIN_INTER_SEARCH_DEPTH 0
 
-#define MAX_INTRA_SEARCH_DEPTH 3 /*!< Max search depth -> min block size (3 == 8x8) */
+#define MAX_INTRA_SEARCH_DEPTH 4 /*!< Max search depth -> min block size (3 == 8x8, 4 == 4x4) */
 #define MIN_INTRA_SEARCH_DEPTH 1 /*!< Min search depth -> max block size (0 == 64x64) */
 
 
@@ -99,6 +99,7 @@ typedef int16_t coefficient;
 #define NO_SCU_IN_LCU(no_lcu) ((no_lcu) << MAX_DEPTH)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
 #define UNREFERENCED_PARAMETER(p) (p)
+#define PU_INDEX(x_pu, y_pu) (((x_pu) % 2)  + 2 * ((y_pu) % 2)) /*!< Index into a CU's intra[] array from PU coordinates: x parity + 2 * y parity (0..3 for non-negative inputs) */
 
 #define LOG2_LCU_WIDTH 6
 // CU_TO_PIXEL = y * lcu_width * pic_width + x * lcu_width
diff --git a/src/intra.c b/src/intra.c
index 8b68cd07..42f589ad 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -100,8 +100,6 @@ pixel intra_get_dc_pred(pixel *pic, uint16_t picwidth, uint8_t width)
   return (pixel)((sum + width) / (width + width));
 }
 
-#define PU_INDEX(x_pu, y_pu) (((x_pu) % 2)  + 2 * ((y_pu) % 2))
-
 /**
  * \brief Function for deriving intra luma predictions
  * \param pic picture to use
@@ -120,24 +118,24 @@ int8_t intra_get_dir_luma_predictor(uint32_t x, uint32_t y, int8_t* preds,
   int8_t left_intra_dir  = 1;
   int8_t above_intra_dir = 1;
 
-  if (cur_cu->part_size == SIZE_NxN && (x & 7) == 1) {
+  if (x & 4) {
     // If current CU is NxN and PU is on the right half, take mode from the
     // left half of the same CU.
-    left_intra_dir = cur_cu->intra[PU_INDEX(0, y_cu<<1)].mode;
+    left_intra_dir = cur_cu->intra[PU_INDEX(0, y >> 2)].mode;
   } else if (left_cu && left_cu->type == CU_INTRA) {
     // Otherwise take the mode from the right side of the CU on the left.
-    left_intra_dir = left_cu->intra[PU_INDEX(1, y_cu<<1)].mode;
+    left_intra_dir = left_cu->intra[PU_INDEX(1, y >> 2)].mode;
   }
 
-  if (cur_cu->part_size == SIZE_NxN && (y & 7) == 1) {
+  if (y & 4) {
     // If current CU is NxN and PU is on the bottom half, take mode from the
     // top half of the same CU.
-    above_intra_dir = cur_cu->intra[PU_INDEX(x_cu<<1, 0)].mode;
+    above_intra_dir = cur_cu->intra[PU_INDEX(x >> 2, 0)].mode;
   } else if (above_cu && above_cu->type == CU_INTRA &&
              (y_cu * (LCU_WIDTH>>MAX_DEPTH)) % LCU_WIDTH != 0)
   {
     // Otherwise take the mode from the bottom half of the CU above.
-    above_intra_dir = above_cu->intra[PU_INDEX(x_cu<<1, 1)].mode;
+    above_intra_dir = above_cu->intra[PU_INDEX(x >> 2, 1)].mode;
   }
 
   // If the predictions are the same, add new predictions
@@ -158,7 +156,7 @@ int8_t intra_get_dir_luma_predictor(uint32_t x, uint32_t y, int8_t* preds,
     // add planar mode if it's not yet present
     if (left_intra_dir && above_intra_dir ) {
       preds[2] = 0; // PLANAR_IDX;
-    } else { // else we add 26 or 1
+    } else {  // Add DC mode if it's not present, otherwise 26.
       preds[2] =  (left_intra_dir+above_intra_dir)<2? 26 : 1;
     }
   }
@@ -778,69 +776,50 @@ void intra_recon_lcu(encoder_control* encoder, int x, int y, int depth, lcu_t *l
   pixel *rec_shift  = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
 
   int8_t width = LCU_WIDTH >> depth;
-  int8_t width_c = LCU_WIDTH >> (depth + 1);
+  int8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
   static vector2d offsets[4] = {{0,0},{1,0},{0,1},{1,1}};
   int num_pu = (cur_cu->part_size == SIZE_2Nx2N ? 1 : 4);
-  int i;
-
-  if (cur_cu->part_size == SIZE_NxN) {
-    width = width_c;
-  }
+  int i = PU_INDEX(x >> 2, y >> 2);
 
   cur_cu->intra[0].mode_chroma = 36; // TODO: Chroma intra prediction
 
-  // Reconstruct chroma
-  rec_shift  = &rec[width_c * 2 + 8 + 1];
-  intra_build_reference_border(x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 1,
-                                   pic_width/2, pic_height/2, lcu);
-  intra_recon(rec_shift,
-              width_c * 2 + 8,
-              width_c,
-              recbase_u,
-              rec_stride >> 1,
-              cur_cu->intra[0].mode_chroma != 36 ? cur_cu->intra[0].mode_chroma : cur_cu->intra[0].mode,
-              1);
+  // Reconstruct chroma.
+  if (!(x & 4 || y & 4)) {
+    rec_shift  = &rec[width_c * 2 + 8 + 1];
+    intra_build_reference_border(x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 1,
+                                     pic_width/2, pic_height/2, lcu);
+    intra_recon(rec_shift,
+                width_c * 2 + 8,
+                width_c,
+                recbase_u,
+                rec_stride >> 1,
+                cur_cu->intra[0].mode_chroma != 36 ? cur_cu->intra[0].mode_chroma : cur_cu->intra[0].mode,
+                1);
 
-  intra_build_reference_border(x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 2,
-                                   pic_width/2, pic_height/2, lcu);
-  intra_recon(rec_shift,
-              width_c * 2 + 8,
-              width_c,
-              recbase_v,
-              rec_stride >> 1,
-              cur_cu->intra[0].mode_chroma != 36 ? cur_cu->intra[0].mode_chroma : cur_cu->intra[0].mode,
-              2);
-
-  for (i = 0; i < num_pu; ++i) {
-    // Build reconstructed block to use in prediction with extrapolated borders
-    int x_off = offsets[i].x * width;
-    int y_off = offsets[i].y * width;
-    recbase_y = &lcu->rec.y[x_local + x_off + (y_local+y_off) * LCU_WIDTH];
-
-    rec_shift  = &rec[width * 2 + 8 + 1];
-    intra_build_reference_border(x+x_off, y+y_off,(int16_t)width * 2 + 8, rec, (int16_t)width * 2 + 8, 0,
-                                 pic_width, pic_height, lcu);
-    intra_recon(rec_shift, width * 2 + 8,
-                width, recbase_y, rec_stride, cur_cu->intra[i].mode, 0);
-
-    // Filter DC-prediction
-    if (cur_cu->intra[i].mode == 1 && width < 32) {
-      intra_dc_pred_filtering(rec_shift, width * 2 + 8, recbase_y,
-                              rec_stride, width, width);
-    }
-
-    // Handle NxN mode by doing quant/transform and inverses for the next NxN block
-    if (cur_cu->part_size == SIZE_NxN) {
-      encode_transform_tree(encoder, x + x_off, y + y_off, depth+1, lcu);
-    }
+    intra_build_reference_border(x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 2,
+                                     pic_width/2, pic_height/2, lcu);
+    intra_recon(rec_shift,
+                width_c * 2 + 8,
+                width_c,
+                recbase_v,
+                rec_stride >> 1,
+                cur_cu->intra[0].mode_chroma != 36 ? cur_cu->intra[0].mode_chroma : cur_cu->intra[0].mode,
+                2);
   }
 
-  // If we coded NxN block, fetch the coded block flags to this level
-  if (cur_cu->part_size == SIZE_NxN) {
-    cur_cu->coeff_top_y[depth] = cur_cu->coeff_top_y[depth+1] | cur_cu->coeff_top_y[depth+2] | cur_cu->coeff_top_y[depth+3] | cur_cu->coeff_top_y[depth+4];
-    cur_cu->coeff_top_u[depth] = cur_cu->coeff_top_u[depth+1];
-    cur_cu->coeff_top_v[depth] = cur_cu->coeff_top_v[depth+1];
-    return;
+  // Build reconstructed block to use in prediction with extrapolated borders
+  recbase_y = &lcu->rec.y[x_local + y_local * LCU_WIDTH];
+
+  rec_shift  = &rec[width * 2 + 8 + 1];
+  intra_build_reference_border(x, y,(int16_t)width * 2 + 8, rec, (int16_t)width * 2 + 8, 0,
+                                pic_width, pic_height, lcu);
+  intra_recon(rec_shift, width * 2 + 8,
+              width, recbase_y, rec_stride, cur_cu->intra[i].mode, 0);
+
+  // Filter DC-prediction
+  if (cur_cu->intra[i].mode == 1 && width < 32) {
+    intra_dc_pred_filtering(rec_shift, width * 2 + 8, recbase_y,
+                            rec_stride, width, width);
   }
 
   encode_transform_tree(encoder, x, y, depth, lcu);
diff --git a/src/search.c b/src/search.c
index 01f5db02..d25b0566 100644
--- a/src/search.c
+++ b/src/search.c
@@ -477,7 +477,7 @@ static int search_cu_inter(encoder_control *encoder, int x, int y, int depth, lc
 /**
  * Copy all non-reference CU data from depth+1 to depth.
  */
-static void work_tree_copy_up(int x_px, int y_px, int depth, lcu_t work_tree[MAX_PU_DEPTH])
+static void work_tree_copy_up(int x_px, int y_px, int depth, lcu_t work_tree[MAX_PU_DEPTH + 1])
 {
   // Copy non-reference CUs.
   {
@@ -530,14 +530,14 @@ static void work_tree_copy_up(int x_px, int y_px, int depth, lcu_t work_tree[MAX
 /**
  * Copy all non-reference CU data from depth to depth+1..MAX_PU_DEPTH.
  */
-static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_tree[MAX_PU_DEPTH])
+static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_tree[MAX_PU_DEPTH + 1])
 {
   // TODO: clean up to remove the copy pasta
   const int width_px = LCU_WIDTH >> depth;
 
   int d;
 
-  for (d = depth + 1; d < MAX_PU_DEPTH; ++d) {
+  for (d = depth + 1; d < MAX_PU_DEPTH + 1; ++d) {
     const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
     const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
     const int width_cu = width_px >> MAX_DEPTH;
@@ -553,7 +553,7 @@ static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_tree[M
   }
 
   // Copy reconstructed pixels.
-  for (d = depth + 1; d < MAX_PU_DEPTH; ++d) {
+  for (d = depth + 1; d < MAX_PU_DEPTH + 1; ++d) {
     const int x = SUB_SCU(x_px);
     const int y = SUB_SCU(y_px);
 
@@ -584,11 +584,11 @@ static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pr
   // NxN can only be applied to a single CU at a time.
   if (part_mode == SIZE_NxN) {
     cu_info *cu = &lcu_cu[x_cu + y_cu * LCU_T_CU_WIDTH];
-    cu->depth = depth;
+    cu->depth = MAX_DEPTH;
     cu->type = CU_INTRA;
     // It is assumed that cu->intra[].mode's are already set.
     cu->part_size = part_mode;
-    cu->tr_depth = depth + 1;
+    cu->tr_depth = depth;
     return;
   }
 
@@ -704,69 +704,18 @@ static int search_cu_intra(encoder_control *encoder,
     uint32_t cost = -1;
     int16_t mode = -1;
     pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
+    unsigned pu_index = PU_INDEX(x_px >> 2, y_px >> 2);
     mode = intra_prediction(ref_pixels, LCU_WIDTH,
                             cu_in_rec_buffer, cu_width * 2 + 8, cu_width,
                             pred_buffer, cu_width,
                             &cost, candidate_modes, &bitcost);
-    cur_cu->intra[0].mode = (int8_t)mode;
-    cur_cu->intra[0].cost = cost;
-    cur_cu->part_size = SIZE_2Nx2N;
+    cur_cu->intra[pu_index].mode = (int8_t)mode;
+    cur_cu->intra[pu_index].cost = cost;
   }
 
-  // Do search for NxN split.
-  if (0 && depth == MAX_DEPTH) { //TODO: reactivate NxN when _something_ is done to make it better
-    static const vector2d offsets[4] = {{0,0},{4,0},{0,4},{4,4}};
-    const int nxn_width = 4;
+  cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].bitcost = bitcost;
+  return cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].cost;
 
-    // Save 2Nx2N information to compare with NxN.
-    int nn_cost = cur_cu->intra[0].cost;
-    int8_t nn_mode = cur_cu->intra[0].mode;
-    int cost = (int)(g_cur_lambda_cost * 4.5);  // +0.5 to round to nearest
-
-    int nxn_i;
-
-    cu_in_rec_buffer = &rec_buffer[nxn_width * 2 + 8 + 1];
-
-    bitcost_nxn = 0;
-
-    for (nxn_i = 0; nxn_i < 4; ++nxn_i) {
-      const vector2d nxn_px = { x_px + offsets[nxn_i].x,
-                                y_px + offsets[nxn_i].y };
-      intra_get_dir_luma_predictor(nxn_px.x, nxn_px.y, candidate_modes,
-                                   cur_cu, left_cu, above_cu);
-      intra_build_reference_border(nxn_px.x, nxn_px.y, nxn_width * 2 + 8,
-                                   rec_buffer, nxn_width * 2 + 8, 0,
-                                   encoder->in.cur_pic->width, encoder->in.cur_pic->height,
-                                   lcu);
-      {
-        uint32_t nxn_cost = -1;
-        int16_t nxn_mode = -1;
-        uint32_t bitcost_temp = 0;
-        pixel *ref_pixels = &lcu->ref.y[nxn_px.x + nxn_px.y * LCU_WIDTH];
-        nxn_mode = intra_prediction(ref_pixels, encoder->in.width,
-                                    cu_in_rec_buffer, nxn_width * 2 + 8, nxn_width,
-                                    pred_buffer, nxn_width,
-                                    &nxn_cost, candidate_modes, &bitcost_temp);
-        cur_cu->intra[nxn_i].mode = (int8_t)nxn_mode;
-        cost += nxn_cost;
-        bitcost_nxn += bitcost_temp;
-      }
-    }
-
-    // Choose between 2Nx2N and NxN.
-    if (nn_cost <= cost) {
-      cur_cu->intra[0].cost = nn_cost;
-      cur_cu->intra[0].mode = nn_mode;
-    } else {
-      cur_cu->intra[0].cost = cost;
-      cur_cu->part_size = SIZE_NxN;
-      bitcost = bitcost_nxn;
-    }
-  }
-
-  cur_cu->intra[0].bitcost = bitcost;
-
-  return cur_cu->intra[0].cost;
 }
 
 /**
@@ -841,8 +790,10 @@ static int search_cu(encoder_control *encoder, int x, int y, int depth, lcu_t wo
 
   cur_cu = &(&work_tree[depth])->cu[LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH];
   // Assign correct depth
-  cur_cu->depth = depth; cur_cu->tr_depth = depth ? depth : 1;
-  cur_cu->type = CU_NOTSET; cur_cu->part_size = SIZE_2Nx2N;
+  cur_cu->depth = depth > MAX_DEPTH ? MAX_DEPTH : depth;
+  cur_cu->tr_depth = depth > 0 ? depth : 1;
+  cur_cu->type = CU_NOTSET;
+  cur_cu->part_size = depth > MAX_DEPTH ? SIZE_NxN : SIZE_2Nx2N;
   // If the CU is completely inside the frame at this depth, search for
   // prediction modes at this depth.
   if (x + cu_width <= encoder->in.width &&
@@ -874,7 +825,7 @@ static int search_cu(encoder_control *encoder, int x, int y, int depth, lcu_t wo
     // Reconstruct best mode because we need the reconstructed pixels for
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {
-      lcu_set_intra_mode(&work_tree[depth], x, y, depth, cur_cu->intra[0].mode, cur_cu->part_size);
+      lcu_set_intra_mode(&work_tree[depth], x, y, depth, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode, cur_cu->part_size);
       intra_recon_lcu(encoder, x, y, depth,&work_tree[depth],encoder->in.cur_pic->width,encoder->in.cur_pic->height);
     } else if (cur_cu->type == CU_INTER) {
       inter_recon_lcu(encoder->ref->pics[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);
@@ -1117,11 +1068,11 @@ static void copy_lcu_to_cu_data(encoder_control *encoder, int x_px, int y_px, co
  */
 static void search_lcu(encoder_control *encoder, int x, int y)
 {
-  lcu_t work_tree[MAX_PU_DEPTH];
+  lcu_t work_tree[MAX_PU_DEPTH + 1];
   int depth;
-  memset(work_tree, 0, sizeof(lcu_t)*MAX_PU_DEPTH);
   // Initialize work tree.
-  for (depth = 0; depth < MAX_PU_DEPTH; ++depth) {
+  for (depth = 0; depth <= MAX_PU_DEPTH; ++depth) {
+    memset(&work_tree[depth], 0, sizeof(work_tree[depth]));
     init_lcu_t(encoder, x, y, &work_tree[depth]);
   }