diff --git a/src/global.h b/src/global.h
index ffc279b6..b39264ea 100644
--- a/src/global.h
+++ b/src/global.h
@@ -124,6 +124,7 @@ typedef int16_t coeff_t;
 #define MAX(a,b) (((a)>(b))?(a):(b))
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define CLIP(low,high,value) MAX((low),MIN((high),(value)))
+#define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
 #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
 #define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
diff --git a/src/intra.c b/src/intra.c
index edf0b7c7..18c3667d 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -28,91 +28,18 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
 
-#include "config.h"
 #include "encoder.h"
 #include "transform.h"
-#include "rdo.h"
 
 
-const uint8_t kvz_intra_hor_ver_dist_thres[5] = {0,7,1,0,0};
-
-
-/**
- * \brief Set intrablock mode (and init typedata)
- * \param pic picture to use
- * \param xCtb x CU position (smallest CU)
- * \param yCtb y CU position (smallest CU)
- * \param depth current CU depth
- * \param mode mode to set
- * \returns Void
- */
-void kvz_intra_set_block_mode(videoframe_t *frame,uint32_t x_cu, uint32_t y_cu, uint8_t depth, uint8_t mode, uint8_t part_mode)
-{
-  uint32_t x, y;
-  int block_scu_width = (LCU_WIDTH>>depth)/(LCU_WIDTH>>MAX_DEPTH);
-
-  if (part_mode == SIZE_NxN) {
-    cu_info_t *cur_cu = kvz_videoframe_get_cu(frame, x_cu, y_cu);
-    // Modes are already set.
-    cur_cu->depth = depth;
-    cur_cu->type = CU_INTRA;
-    cur_cu->tr_depth = depth + 1;
-    return;
-  }
-
-  // Loop through all the blocks in the area of cur_cu
-  for (y = y_cu; y < y_cu + block_scu_width; y++) {
-    for (x = x_cu; x < x_cu + block_scu_width; x++) {
-      cu_info_t *cur_cu = kvz_videoframe_get_cu(frame, x_cu, y_cu);
-      cur_cu->depth = depth;
-      cur_cu->type = CU_INTRA;
-      cur_cu->intra[0].mode = mode;
-      cur_cu->intra[1].mode = mode;
-      cur_cu->intra[2].mode = mode;
-      cur_cu->intra[3].mode = mode;
-      cur_cu->part_size = part_mode;
-      cur_cu->tr_depth = depth;
-    }
-  }
-}
-
-/**
- * \brief get intrablock mode
- * \param pic picture data to use
- * \param picwidth width of the picture data
- * \param xpos x-position
- * \param ypos y-position
- * \param width block width
- * \returns DC prediction
-*/
-kvz_pixel kvz_intra_get_dc_pred(const kvz_pixel *pic, uint16_t picwidth, uint8_t width)
-{
-  int32_t i, sum = 0;
-
-  // pixels on top and left
-  for (i = -picwidth; i < width - picwidth; i++) {
-    sum += pic[i];
-  }
-  for (i = -1; i < width * picwidth - 1; i += picwidth) {
-    sum += pic[i];
-  }
-
-  // return the average
-  return (kvz_pixel)((sum + width) / (width + width));
-}
-
-/**
- * \brief Function for deriving intra luma predictions
- * \param pic picture to use
- * \param x_cu x CU position (smallest CU)
- * \param y_cu y CU position (smallest CU)
- * \param preds output buffer for 3 predictions
- * \returns (predictions are found)?1:0
- */
-int8_t kvz_intra_get_dir_luma_predictor(const uint32_t x, const uint32_t y, int8_t* preds,
-                                    const cu_info_t * const cur_cu, const cu_info_t * const left_cu, const cu_info_t * const above_cu)
+int8_t kvz_intra_get_dir_luma_predictor(
+  const uint32_t x,
+  const uint32_t y,
+  int8_t *preds,
+  const cu_info_t *const cur_cu,
+  const cu_info_t *const left_cu,
+  const cu_info_t *const above_cu)
 {
   int y_cu = y>>3;
 
@@ -166,158 +93,349 @@ int8_t kvz_intra_get_dir_luma_predictor(const uint32_t x, const uint32_t y, int8
   return 1;
 }
 
-/**
- * \brief Intra filtering of the border samples
- * \param ref reference picture data
- * \param x_cu x CU position (smallest CU)
- * \param y_cu y CU position (smallest CU)
- * \param depth current CU depth
- * \param preds output buffer for 3 predictions
- * \returns (predictions are found)?1:0
- */
-void kvz_intra_filter(kvz_pixel *ref, int32_t stride,int32_t width, int8_t mode)
+
+static void intra_filter_reference(
+  int_fast8_t log2_width,
+  kvz_intra_references *refs)
 {
-  #define FWIDTH (LCU_WIDTH*2+1)
-  kvz_pixel filtered[FWIDTH * FWIDTH]; //!< temporary buffer for filtered samples
-  kvz_pixel *filteredShift = &filtered[FWIDTH+1]; //!< pointer to temporary buffer with offset (1,1)
-  int x,y;
-
-  if (!mode) {
-    // pF[ -1 ][ -1 ] = ( p[ -1 ][ 0 ] + 2*p[ -1 ][ -1 ] + p[ 0 ][ -1 ] + 2 )  >>  2	(8 35)
-    filteredShift[-FWIDTH-1] = (ref[-1] + 2*ref[-(int32_t)stride-1] + ref[-(int32_t)stride] + 2) >> 2;
-
-    // pF[ -1 ][ y ] = ( p[ -1 ][ y + 1 ] + 2*p[ -1 ][ y ] + p[ -1 ][ y - 1 ] + 2 )  >>  2 for y = 0..nTbS * 2 - 2	(8 36)
-    for (y = 0; y < (int32_t)width * 2 - 1; y++) {
-      filteredShift[y*FWIDTH-1] = (ref[(y + 1) * stride - 1] + 2*ref[y * stride - 1] + ref[(y - 1) * stride - 1] + 2) >> 2;
-    }
-
-    // pF[ -1 ][ nTbS * 2 - 1 ] = p[ -1 ][ nTbS * 2 - 1 ]		(8 37)
-    filteredShift[(width * 2 - 1) * FWIDTH - 1] = ref[(width * 2 - 1) * stride - 1];
-
-    // pF[ x ][ -1 ] = ( p[ x - 1 ][ -1 ] + 2*p[ x ][ -1 ] + p[ x + 1 ][ -1 ] + 2 )  >>  2 for x = 0..nTbS * 2 - 2	(8 38)
-    for(x = 0; x < (int32_t)width*2-1; x++) {
-      filteredShift[x - FWIDTH] = (ref[x - 1 - stride] + 2*ref[x - stride] + ref[x + 1 - stride] + 2) >> 2;
-    }
-
-    // pF[ nTbS * 2 - 1 ][ -1 ] = p[ nTbS * 2 - 1 ][ -1 ]
-    filteredShift[(width * 2 - 1) - FWIDTH] = ref[(width * 2 - 1) - stride];
-
-    // Copy filtered samples to the input array
-    for (x = -1; x < (int32_t)width * 2; x++) {
-      ref[x - stride] = filtered[x + 1];
-    }
-    for(y = 0; y < (int32_t)width * 2; y++)  {
-      ref[y * stride - 1] = filtered[(y + 1) * FWIDTH];
-    }
-  } else  {
-    printf("UNHANDLED: %s: %d\r\n", __FILE__, __LINE__);
-    exit(1);
+  if (refs->filtered_initialized) {
+    return;
+  } else {
+    refs->filtered_initialized = true;
+  }
+
+  const int_fast8_t ref_width = 2 * (1 << log2_width) + 1;
+  kvz_intra_ref *ref = &refs->ref;
+  kvz_intra_ref *filtered_ref = &refs->filtered_ref;
+
+  filtered_ref->left[0] = (ref->left[1] + 2 * ref->left[0] + ref->top[1] + 2) / 4;
+  filtered_ref->top[0] = filtered_ref->left[0];
+
+  for (int_fast8_t y = 1; y < ref_width - 1; ++y) {
+    kvz_pixel *p = &ref->left[y];
+    filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) / 4;
+  }
+  filtered_ref->left[ref_width - 1] = ref->left[ref_width - 1];
+
+  for (int_fast8_t x = 1; x < ref_width - 1; ++x) {
+    kvz_pixel *p = &ref->top[x];
+    filtered_ref->top[x] = (p[-1] + 2 * p[0] + p[1] + 2) / 4;
+  }
+  filtered_ref->top[ref_width - 1] = ref->top[ref_width - 1];
+}
+
+
+static void intra_post_process_angular(
+  unsigned width,
+  unsigned stride,
+  const kvz_pixel *ref,
+  kvz_pixel *block)
+{
+  kvz_pixel ref2 = ref[0];
+  for (unsigned i = 0; i < width; i++) {
+    kvz_pixel val = block[i * stride];
+    kvz_pixel ref1 = ref[i + 1];
+    block[i * stride] = CLIP_TO_PIXEL(val + ((ref1 - ref2) >> 1));
   }
-  #undef FWIDTH
 }
 
 
 /**
- * \param rec  Reference pixel. 0 points to unfiltered and 1 to filtered.
- * \param recstride  Stride for rec pixel arrays.
- * \param dst
+ * \brief Generate angular predictions.
+ * \param log2_width    Log2 of width, range 2..5.
+ * \param intra_mode    Angular mode in range 2..34.
+ * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
+ * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
+ * \param dst           Buffer of size width*width.
  */
-void kvz_intra_get_pred(const encoder_control_t * const encoder, const kvz_pixel *rec, const kvz_pixel *rec_filtered, int recstride, kvz_pixel *dst, int width, int mode, int is_chroma)
+static void intra_pred_angular(
+  const int_fast8_t log2_width,
+  const int_fast8_t intra_mode,
+  const kvz_pixel *const in_ref_above,
+  const kvz_pixel *const in_ref_left,
+  kvz_pixel *const dst)
 {
-  const kvz_pixel *ref_pixels = rec;
-  if (is_chroma || mode == 1 || width == 4) {
+  assert(log2_width >= 2 && log2_width <= 5);
+  assert(intra_mode >= 2 && intra_mode <= 34);
+
+  static const int8_t modedisp2sampledisp[9] = {0, 2, 5, 9, 13, 17, 21, 26, 32};
+  static const int16_t modedisp2invsampledisp[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 256}; // (256 * 32) / sampledisp
+
+  // Temporary buffer for modes 11-25.
+  // It only needs to be big enough to hold indices from -width to width-1.
+  kvz_pixel tmp_ref[2 * 32];
+  const int_fast8_t width = 1 << log2_width;
+
+  // Whether to swap references to always project on the left reference row.
+  const bool vertical_mode = intra_mode >= 18;
+  // Modes distance to horizontal or vertical mode.
+  const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
+  // Sample displacement per column in fractions of 32.
+  const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
+
+  // Pointer for the reference we are interpolating from.
+  const kvz_pixel *ref_main;
+  // Pointer for the other reference.
+  const kvz_pixel *ref_side;
+
+  // Set ref_main and ref_side such that, when indexed with 0, they point to
+  // index 0 in block coordinates.
+  if (sample_disp < 0) {
+    // Negative sample_disp means, we need to use both references.
+
+    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
+    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
+
+    // Move the reference pixels to start from the middle to the later half of
+    // the tmp_ref, so there is room for negative indices.
+    for (int_fast8_t x = -1; x < width; ++x) {
+      tmp_ref[x + width] = ref_main[x];
+    }
+    // Get a pointer to block index 0 in tmp_ref.
+    ref_main = &tmp_ref[width];
+
+    // Extend the side reference to the negative indices of main reference.
+    int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
+    int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
+    int_fast8_t most_negative_index = (width * sample_disp) >> 5;
+    for (int_fast8_t x = -2; x >= most_negative_index; --x) {
+      col_sample_disp += inv_abs_sample_disp;
+      int_fast8_t side_index = col_sample_disp >> 8;
+      tmp_ref[x + width] = ref_side[side_index - 1];
+    }
+  } else {
+    // sample_disp >= 0 means we don't need to refer to negative indices,
+    // which means we can just use the references as is.
+    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
+    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
+  }
+
+  if (sample_disp != 0) {
+    // The mode is not horizontal or vertical, we have to do interpolation.
+
+    int_fast16_t delta_pos = 0;
+    for (int_fast8_t y = 0; y < width; ++y) {
+      delta_pos += sample_disp;
+      int_fast8_t delta_int = delta_pos >> 5;
+      int_fast8_t delta_fract = delta_pos & (32 - 1);
+
+      if (delta_fract) {
+        // Do linear filtering
+        for (int_fast8_t x = 0; x < width; ++x) {
+          kvz_pixel ref1 = ref_main[x + delta_int];
+          kvz_pixel ref2 = ref_main[x + delta_int + 1];
+          dst[y * width + x] = ((32 - delta_fract) * ref1 + delta_fract * ref2 + 16) >> 5;
+        }
+      } else {
+        // Just copy the integer samples
+        for (int_fast8_t x = 0; x < width; x++) {
+          dst[y * width + x] = ref_main[x + delta_int];
+        }
+      }
+    }
+  } else {
+    // Mode is horizontal or vertical, just copy the pixels.
+
+    for (int_fast8_t y = 0; y < width; ++y) {
+      for (int_fast8_t x = 0; x < width; ++x) {
+        dst[y * width + x] = ref_main[x];
+      }
+    }
+  }
+
+  // Transpose the block if this was a horizontal mode.
+  if (!vertical_mode) {
+    for (int_fast8_t y = 0; y < width - 1; ++y) {
+      for (int_fast8_t x = y + 1; x < width; ++x) {
+        SWAP(dst[y * width + x], dst[x * width + y], kvz_pixel);
+      }
+    }
+  }
+}
+
+
+/**
+ * \brief Generate planar prediction.
+ * \param log2_width    Log2 of width, range 2..5.
+ * \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
+ * \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
+ * \param dst           Buffer of size width*width.
+ */
+static void intra_pred_planar(
+  const int_fast8_t log2_width,
+  const kvz_pixel *const ref_top,
+  const kvz_pixel *const ref_left,
+  kvz_pixel *const dst)
+{
+  assert(log2_width >= 2 && log2_width <= 5);
+
+  const int_fast8_t width = 1 << log2_width;
+  const kvz_pixel top_right = ref_top[width + 1];
+  const kvz_pixel bottom_left = ref_left[width + 1];
+
+#if 0
+  // Unoptimized version for reference.
+  for (int y = 0; y < width; ++y) {
+    for (int x = 0; x < width; ++x) {
+      int_fast16_t hor = (width - 1 - x) * ref_left[y + 1] + (x + 1) * top_right;
+      int_fast16_t ver = (width - 1 - y) * ref_top[x + 1] + (y + 1) * bottom_left;
+      dst[y * width + x] = (ver + hor + width) >> (log2_width + 1);
+    }
+  }
+#else
+  int_fast16_t top[32];
+  for (int i = 0; i < width; ++i) {
+    top[i] = ref_top[i + 1] << log2_width;
+  }
+
+  for (int y = 0; y < width; ++y) {
+    int_fast16_t hor = (ref_left[y + 1] << log2_width) + width;
+    for (int x = 0; x < width; ++x) {
+      hor += top_right - ref_left[y + 1];
+      top[x] += bottom_left - ref_top[x + 1];
+      dst[y * width + x] = (hor + top[x]) >> (log2_width + 1);
+    }
+  }
+#endif
+}
+
+
+/**
+* \brief Generate intra DC prediction.
+* \param log2_width    Log2 of width, range 2..5.
+* \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
+* \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
+* \param out_block     Buffer of size width*width.
+*/
+static void intra_pred_dc(
+  const int_fast8_t log2_width,
+  const kvz_pixel *const ref_top,
+  const kvz_pixel *const ref_left,
+  kvz_pixel *const out_block)
+{
+  int_fast8_t width = 1 << log2_width;
+
+  int_fast16_t sum = 0;
+  for (int_fast8_t i = 0; i < width; ++i) {
+    sum += ref_top[i + 1];
+    sum += ref_left[i + 1];
+  }
+
+  const kvz_pixel dc_val = (sum + width) >> (log2_width + 1);
+  const int_fast16_t block_size = 1 << (log2_width * 2);
+
+  for (int_fast16_t i = 0; i < block_size; ++i) {
+    out_block[i] = dc_val;
+  }
+}
+
+
+/**
+* \brief Generate intra DC prediction with post filtering applied.
+* \param log2_width    Log2 of width, range 2..5.
+* \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
+* \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
+* \param out_block     Buffer of size width*width.
+*/
+static void intra_pred_filtered_dc(
+  const int_fast8_t log2_width,
+  const kvz_pixel *const ref_top,
+  const kvz_pixel *const ref_left,
+  kvz_pixel *const out_block)
+{
+  assert(log2_width >= 2 && log2_width <= 5);
+
+  const int_fast8_t width = 1 << log2_width;
+
+  int_fast16_t sum = 0;
+  for (int_fast8_t i = 0; i < width; ++i) {
+    sum += ref_top[i + 1];
+    sum += ref_left[i + 1];
+  }
+
+  const kvz_pixel dc_val = (sum + width) >> (log2_width + 1);
+
+  // Filter top-left with ([1 2 1] / 4)
+  out_block[0] = (ref_left[1] + 2 * dc_val + ref_top[1] + 2) / 4;
+
+  // Filter rest of the boundary with ([1 3] / 4)
+  for (int_fast8_t x = 1; x < width; ++x) {
+    out_block[x] = (ref_top[x + 1] + 3 * dc_val + 2) / 4;
+  }
+  for (int_fast8_t y = 1; y < width; ++y) {
+    out_block[y * width] = (ref_left[y + 1] + 3 * dc_val + 2) / 4;
+    for (int_fast8_t x = 1; x < width; ++x) {
+      out_block[y * width + x] = dc_val;
+    }
+  }
+}
+
+
+void kvz_intra_predict(
+  kvz_intra_references *refs,
+  int_fast8_t log2_width,
+  int_fast8_t mode,
+  color_t color,
+  kvz_pixel *dst)
+{
+  const int_fast8_t width = 1 << log2_width;
+
+  const kvz_intra_ref *used_ref = &refs->ref;
+  if (color != COLOR_Y || mode == 1 || width == 4) {
     // For chroma, DC and 4x4 blocks, always use unfiltered reference.
   } else if (mode == 0) {
     // Otherwise, use filtered for planar.
-    ref_pixels = rec_filtered;
+    used_ref = &refs->filtered_ref;
   } else {
     // Angular modes use smoothed reference pixels, unless the mode is close
     // to being either vertical or horizontal.
+    static const int kvz_intra_hor_ver_dist_thres[5] = { 0, 7, 1, 0, 0 };
     int filter_threshold = kvz_intra_hor_ver_dist_thres[g_to_bits[width]];
     int dist_from_vert_or_hor = MIN(abs(mode - 26), abs(mode - 10));
     if (dist_from_vert_or_hor > filter_threshold) {
-      ref_pixels = rec_filtered;
+      used_ref = &refs->filtered_ref;
     }
   }
 
+  if (used_ref == &refs->filtered_ref && !refs->filtered_initialized) {
+    intra_filter_reference(log2_width, refs);
+  }
+
   if (mode == 0) {
-    kvz_intra_get_planar_pred(ref_pixels, recstride, width, dst, width);
+    intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst);
   } else if (mode == 1) {
-    int i;
-    kvz_pixel val = kvz_intra_get_dc_pred(ref_pixels, recstride, width);
-    for (i = 0; i < width * width; i++) {
-      dst[i] = val;
-    }
     // Do extra post filtering for edge pixels of luma DC mode.
-    if (!is_chroma && width < 32) {
-      kvz_intra_dc_pred_filtering(ref_pixels, recstride, dst, width, width, width);
+    if (color == COLOR_Y && width < 32) {
+      intra_pred_filtered_dc(log2_width, used_ref->top, used_ref->left, dst);
+    } else {
+      intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst);
     }
   } else {
-    int filter = !is_chroma && width < 32;
-    kvz_intra_get_angular_pred(encoder, ref_pixels, recstride, dst, width, width, mode, filter);
+    intra_pred_angular(log2_width, mode, used_ref->top, used_ref->left, dst);
+    if (color == COLOR_Y && width < 32) {
+      if (mode == 10) {
+        intra_post_process_angular(width, 1, used_ref->top, dst);
+      } else if (mode == 26) {
+        intra_post_process_angular(width, width, used_ref->left, dst);
+      }
+    }
   }
 }
 
 
-
-/**
- * \brief Reconstruct intra block according to prediction
- * \param rec reconstructed picture data
- * \param recstride reconstructed picture stride
- * \param width block size to predict
- * \param dst destination buffer for best prediction
- * \param dststride destination width
- * \param mode intra mode to use
- * \param chroma chroma-block flag
-
-*/
-void kvz_intra_recon(const encoder_control_t * const encoder, kvz_pixel* rec, int32_t recstride, uint32_t width, kvz_pixel* dst, int32_t dststride, int8_t mode, int8_t chroma)
+void kvz_intra_build_reference(
+  const int_fast8_t log2_width,
+  const color_t color,
+  const vector2d_t *const luma_px,
+  const vector2d_t *const pic_px,
+  const lcu_t *const lcu,
+  kvz_intra_references *const refs)
 {
-  kvz_pixel pred[LCU_WIDTH * LCU_WIDTH];
-  kvz_pixel rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
-  kvz_pixel *recf = &rec_filtered_temp[recstride + 1];
+  assert(log2_width >= 2 && log2_width <= 5);
 
-  // Generate filtered reference pixels.
-  {
-    int x, y;
-    for (y = -1; y < recstride; y++) {
-      recf[y*recstride - 1] = rec[y*recstride - 1];
-    }
-    for (x = 0; x < recstride; x++) {
-      recf[x - recstride] = rec[x - recstride];
-    }
-    kvz_intra_filter(recf, recstride, width, 0);
-  }
-
-  kvz_intra_get_pred(encoder, rec, recf, recstride, pred, width, mode, chroma);
-
-  kvz_pixels_blit(pred, dst, width, width, width, dststride);
-}
-
-/**
- * \brief Build top and left borders for a reference block.
- * \param pic picture to use as a source
- * \param outwidth width of the prediction block
- * \param chroma signaling if chroma is used, 0 = luma, 1 = U and 2 = V
- *
- * The end result is 2*width+8 x 2*width+8 array, with only the top and left
- * edge pixels filled with the reconstructed pixels.
- */
-void kvz_intra_build_reference_border(const encoder_control_t * const encoder, int32_t x_luma, int32_t y_luma, int16_t out_width,
-                                      kvz_pixel *dst, int32_t dst_stride, int8_t chroma,
-                                      int32_t pic_width, int32_t pic_height,
-                                      lcu_t *lcu)
-{
-  // Some other function might make use of the arrays num_ref_pixels_top and
-  // num_ref_pixels_left in the future, but until that happens lets leave
-  // them here.
-
-  /**
-   * \brief Table for looking up the number of intra reference pixels based on
-   *        prediction units coordinate within an LCU.
-   *
-   * This table was generated by "tools/generate_ref_pixel_tables.py".
-   */
+  // Tables for looking up the number of intra reference pixels based on
+  // prediction units coordinate within an LCU.
+  // generated by "tools/generate_ref_pixel_tables.py".
   static const uint8_t num_ref_pixels_top[16][16] = {
     { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
     {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
@@ -336,329 +454,149 @@ void kvz_intra_build_reference_border(const encoder_control_t * const encoder, i
     { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
     {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 }
   };
-
-  /**
-   * \brief Table for looking up the number of intra reference pixels based on
-   *        prediction units coordinate within an LCU.
-   *
-   * This table was generated by "tools/generate_ref_pixel_tables.py".
-   */
   static const uint8_t num_ref_pixels_left[16][16] = {
     { 64,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
-    { 64,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
-    { 64,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
-    { 64,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
-    { 64,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
-    { 64,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
-    { 64,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
-    { 64,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 },
-    { 64,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
-    { 64,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
-    { 64,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
-    { 64,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
-    { 64,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
-    { 64,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
-    { 64,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
-    { 64,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
+    { 60,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
+    { 56,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
+    { 52,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
+    { 48,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
+    { 44,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
+    { 40,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 36,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 },
+    { 32,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
+    { 28,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
+    { 24,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
+    { 20,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
+    { 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
+    { 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
+    { 8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
   };
 
-  const kvz_pixel dc_val = 1 << (encoder->bitdepth - 1);
-  const int is_chroma = chroma ? 1 : 0;
+  refs->filtered_initialized = false;
+  kvz_pixel *out_left_ref = &refs->ref.left[0];
+  kvz_pixel *out_top_ref = &refs->ref.top[0];
 
-  // input picture pointer
-  //const pixel * const src = (!chroma) ? pic->y_recdata : ((chroma == 1) ? pic->u_recdata : pic->v_recdata);
+  const kvz_pixel dc_val = 1 << (KVZ_BIT_DEPTH - 1);
+  const int is_chroma = color != COLOR_Y ? 1 : 0;
+  const int_fast8_t width = 1 << log2_width;
 
   // Convert luma coordinates to chroma coordinates for chroma.
-  const int x = chroma ? x_luma / 2 : x_luma;
-  const int y = chroma ? y_luma / 2 : y_luma;
+  const vector2d_t lcu_px = {
+    luma_px->x % LCU_WIDTH,
+    luma_px->y % LCU_WIDTH
+  };
+  const vector2d_t px = {
+    lcu_px.x >> is_chroma,
+    lcu_px.y >> is_chroma,
+  };
 
-  const int y_in_lcu = y_luma % LCU_WIDTH;
-  const int x_in_lcu = x_luma % LCU_WIDTH;
+  // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0.
+  const kvz_pixel *left_ref = !color ? &lcu->left_ref.y[1] : (color == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
+  const kvz_pixel *top_ref = !color ? &lcu->top_ref.y[1] : (color == 1) ? &lcu->top_ref.u[1] : &lcu->top_ref.v[1];
+  const kvz_pixel *rec_ref = !color ? lcu->rec.y : (color == 1) ? lcu->rec.u : lcu->rec.v;
 
-  int x_local = (x_luma&0x3f)>>is_chroma, y_local = (y_luma&0x3f)>>is_chroma;
-
-  kvz_pixel *left_ref = !chroma ? &lcu->left_ref.y[1] : (chroma == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
-  kvz_pixel *top_ref  = !chroma ? &lcu->top_ref.y[1]  : (chroma == 1) ? &lcu->top_ref.u[1]  : &lcu->top_ref.v[1];
-  kvz_pixel *rec_ref  = !chroma ? lcu->rec.y : (chroma == 1) ? lcu->rec.u : lcu->rec.v;
-
-  kvz_pixel *left_border = &left_ref[y_local];
-  kvz_pixel *top_border = &top_ref[x_local];
-  uint32_t left_stride = 1;
-
-  if(x_local) {
-    left_border = &rec_ref[x_local - 1 + y_local * (LCU_WIDTH>>is_chroma)];
-    left_stride = LCU_WIDTH>>is_chroma;
+  // Init top borders pointer to point to the correct place in the correct reference array.
+  const kvz_pixel *top_border;
+  if (px.y) {
+    top_border = &rec_ref[px.x + (px.y - 1) * (LCU_WIDTH >> is_chroma)];
+  } else {
+    top_border = &top_ref[px.x];
   }
 
-  if(y_local) {
-    top_border = &rec_ref[x_local + (y_local - 1) * (LCU_WIDTH>>is_chroma)];
+  // Init left borders pointer to point to the correct place in the correct reference array.
+  const kvz_pixel *left_border;
+  int left_stride; // Distance between reference samples.
+  if (px.x) {
+    left_border = &rec_ref[px.x - 1 + px.y * (LCU_WIDTH >> is_chroma)];
+    left_stride = LCU_WIDTH >> is_chroma;
+  } else {
+    left_border = &left_ref[px.y];
+    left_stride = 1;
   }
 
-  // Copy pixels for left edge.
-  if (x > 0) {
+  // Generate left reference.
+  if (luma_px->x > 0) {
     // Get the number of reference pixels based on the PU coordinate within the LCU.
-    int num_ref_pixels = num_ref_pixels_left[y_in_lcu / 4][x_in_lcu / 4] >> is_chroma;
-    int i;
-    kvz_pixel nearest_pixel;
+    int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;
 
-    // Max pixel we can copy from src is yy + outwidth - 1 because the dst
-    // extends one pixel to the left.
-    num_ref_pixels = MIN(num_ref_pixels, out_width - 1);
-    // There are no coded pixels below the frame.
-    num_ref_pixels = MIN(num_ref_pixels, pic_height - y);
-    // There are no coded pixels below the bottom of the LCU due to raster
-    // scan order.
-    num_ref_pixels = MIN(num_ref_pixels, (LCU_WIDTH - y_in_lcu) >> is_chroma);
+    // Limit the number of available pixels based on block size and dimensions
+    // of the picture.
+    px_available_left = MIN(px_available_left, width * 2);
+    px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma);
 
     // Copy pixels from coded CUs.
-    for (i = 0; i < num_ref_pixels; ++i) {
-      dst[(i + 1) * dst_stride] = left_border[i*left_stride];
+    for (int i = 0; i < px_available_left; ++i) {
+      out_left_ref[i + 1] = left_border[i * left_stride];
     }
     // Extend the last pixel for the rest of the reference values.
-    nearest_pixel = dst[i * dst_stride];
-    for (i = num_ref_pixels; i < out_width - 1; ++i) {
-      dst[i * dst_stride] = nearest_pixel;
+    kvz_pixel nearest_pixel = out_left_ref[px_available_left];
+    for (int i = px_available_left; i < width * 2; ++i) {
+      out_left_ref[i + 1] = nearest_pixel;
     }
   } else {
     // If we are on the left edge, extend the first pixel of the top row.
-    kvz_pixel nearest_pixel = y > 0 ? top_border[0] : dc_val;
-    int i;
-    for (i = 1; i < out_width - 1; i++) {
-      dst[i * dst_stride] = nearest_pixel;
+    kvz_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val;
+    for (int i = 0; i < width * 2; i++) {
+      out_left_ref[i + 1] = nearest_pixel;
     }
   }
 
-  // Copy pixels for top edge.
-  if (y > 0) {
+  // Generate top-left reference.
+  if (luma_px->x > 0 && luma_px->y > 0) {
+    // If the block is at an LCU border, the top-left must be copied from
+    // the border that points to the LCUs 1D reference buffer.
+    if (px.x == 0) {
+      out_left_ref[0] = left_border[-1 * left_stride];
+      out_top_ref[0] = left_border[-1 * left_stride];
+    } else {
+      out_left_ref[0] = top_border[-1];
+      out_top_ref[0] = top_border[-1];
+    }
+  } else {
+    // Copy reference clockwise.
+    out_left_ref[0] = out_left_ref[1];
+    out_top_ref[0] = out_left_ref[1];
+  }
+
+  // Generate top reference.
+  if (luma_px->y > 0) {
     // Get the number of reference pixels based on the PU coordinate within the LCU.
-    int num_ref_pixels = num_ref_pixels_top[y_in_lcu / 4][x_in_lcu / 4] >> is_chroma;
-    int i;
-    kvz_pixel nearest_pixel;
+    int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;
 
-    // Max pixel we can copy from src is yy + outwidth - 1 because the dst
-    // extends one pixel to the left.
-    num_ref_pixels = MIN(num_ref_pixels, out_width - 1);
-    // All LCUs in the row above have been coded.
-    num_ref_pixels = MIN(num_ref_pixels, pic_width - x);
+    // Limit the number of available pixels based on block size and dimensions
+    // of the picture.
+    px_available_top = MIN(px_available_top, width * 2);
+    px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma);
 
-    // Copy pixels from coded CUs.
-    for (i = 0; i < num_ref_pixels; ++i) {
-      dst[i + 1] = top_border[i];
+    // Copy all the pixels we can.
+    for (int i = 0; i < px_available_top; ++i) {
+      out_top_ref[i + 1] = top_border[i];
     }
     // Extend the last pixel for the rest of the reference values.
-    nearest_pixel = top_border[num_ref_pixels - 1];
-    for (; i < out_width - 1; ++i) {
-      dst[i + 1] = nearest_pixel;
+    kvz_pixel nearest_pixel = top_border[px_available_top - 1];
+    for (int i = px_available_top; i < width * 2; ++i) {
+      out_top_ref[i + 1] = nearest_pixel;
     }
   } else {
     // Extend nearest pixel.
-    kvz_pixel nearest_pixel = x > 0 ? left_border[0] : dc_val;
-    int i;
-    for(i = 1; i < out_width; i++)
-    {
-      dst[i] = nearest_pixel;
-    }
-  }
-
-  // If top-left corner sample doesn't exist, use the sample from below.
-  // Unavailable samples on the left boundary are copied from below if
-  // available. This is the only place they are available because we don't
-  // support constrained intra prediction.
-  if (x > 0 && y > 0) {
-    // Make sure we always take the top-left pixel from the LCU reference
-    // pixel arrays if they are available.
-    if (x_local == 0) {
-      dst[0] = left_border[-1];
-    } else {
-      dst[0] = top_border[-1];
-    }
-  } else {
-    dst[0] = dst[dst_stride];
-  }
-}
-
-const int32_t kvz_ang_table[9]     = {0,    2,    5,   9,  13,  17,  21,  26,  32};
-const int32_t kvz_inv_ang_table[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 256}; // (256 * 32) / Angle
-
-/**
- * \brief this functions constructs the angular intra prediction from border samples
- *
- */
-void kvz_intra_get_angular_pred(const encoder_control_t * const encoder, const kvz_pixel* src, int32_t src_stride, kvz_pixel* dst, int32_t dst_stride, int32_t width, int32_t dir_mode, int8_t filter)
-{
-  int32_t k,l;
-  int32_t blk_size        = width;
-
-  // Map the mode index to main prediction direction and angle
-  bool mode_ver       = dir_mode >= 18;
-  int32_t intra_pred_angle = mode_ver ? dir_mode - 26 : 10 - dir_mode;
-  int32_t abs_ang       = abs(intra_pred_angle);
-  int32_t sign_ang      = intra_pred_angle < 0 ? -1 : 1;
-
-  // Set bitshifts and scale the angle parameter to block size
-  int32_t inv_angle       = kvz_inv_ang_table[abs_ang];
-
-  // Do angular predictions
-  kvz_pixel *ref_main;
-  kvz_pixel *ref_side;
-  kvz_pixel  ref_above[2 * LCU_WIDTH + 1];
-  kvz_pixel  ref_left[2 * LCU_WIDTH + 1];
-
-  // Tell clang-analyzer that everything is ok.
-  assert(width == 4 || width == 8 || width == 16 || width == 32);
-
-  abs_ang           = kvz_ang_table[abs_ang];
-  intra_pred_angle  = sign_ang * abs_ang;
-
-  // Initialise the Main and Left reference array.
-  if (intra_pred_angle < 0) {
-    int32_t invAngleSum = 128; // rounding for (shift by 8)
-    for (k = 0; k < blk_size + 1; k++) {
-      ref_above[k + blk_size - 1] = src[k - src_stride - 1];
-      ref_left[k + blk_size - 1]  = src[(k - 1) * src_stride - 1];
-    }
-
-    ref_main = (mode_ver ? ref_above : ref_left) + (blk_size - 1);
-    ref_side = (mode_ver ? ref_left : ref_above) + (blk_size - 1);
-
-    // Extend the Main reference to the left.
-    for (k = -1; k > blk_size * intra_pred_angle>>5; k--) {
-      invAngleSum += inv_angle;
-      ref_main[k] = ref_side[invAngleSum>>8];
-    }
-  } else {
-    for (k = 0; k < 2 * blk_size + 1; k++) {
-      ref_above[k] = src[k - src_stride - 1];
-      ref_left[k]  = src[(k - 1) * src_stride - 1];
-    }
-    ref_main = mode_ver ? ref_above : ref_left;
-    ref_side = mode_ver ? ref_left  : ref_above;
-  }
-
-  if (intra_pred_angle == 0) {
-    for (k = 0; k < blk_size; k++) {
-      for (l = 0; l < blk_size; l++) {
-        dst[k * dst_stride + l] = ref_main[l + 1];
-      }
-    }
-
-    if (filter) {
-      for (k=0;k<blk_size;k++) {
-        dst[k * dst_stride] = CLIP(0, (1<<encoder->bitdepth) - 1, dst[k * dst_stride] + (( ref_side[k + 1] - ref_side[0]) >> 1));
-      }
-    }
-  } else {
-    int32_t delta_pos=0;
-    int32_t delta_int;
-    int32_t delta_fract;
-    int32_t minus_delta_fract;
-    int32_t ref_main_index;
-    for (k = 0; k < blk_size; k++) {
-      delta_pos += intra_pred_angle;
-      delta_int   = delta_pos >> 5;
-      delta_fract = delta_pos & (32 - 1);
-
-
-      if (delta_fract) {
-        minus_delta_fract = (32 - delta_fract);
-        // Do linear filtering
-        for (l = 0; l < blk_size; l++) {
-          ref_main_index        = l + delta_int + 1;
-          dst[k * dst_stride + l] = (kvz_pixel) ( (minus_delta_fract * ref_main[ref_main_index]
-                                                 + delta_fract * ref_main[ref_main_index + 1] + 16) >> 5);
-        }
-      } else {
-        // Just copy the integer samples
-        for (l = 0; l < blk_size; l++) {
-          dst[k * dst_stride + l] = ref_main[l + delta_int + 1];
-        }
-      }
-    }
-  }
-
-  // Flip the block if this is the horizontal mode
-  if (!mode_ver) {
-    kvz_pixel tmp;
-    for (k=0;k<blk_size-1;k++) {
-      for (l=k+1;l<blk_size;l++) {
-        tmp                 = dst[k * dst_stride + l];
-        dst[k * dst_stride + l] = dst[l * dst_stride + k];
-        dst[l * dst_stride + k] = tmp;
-      }
+    kvz_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val;
+    for (int i = 0; i < width * 2; i++) {
+      out_top_ref[i + 1] = nearest_pixel;
     }
   }
 }
 
 
-
-
-void kvz_intra_dc_pred_filtering(const kvz_pixel *src, int32_t src_stride, kvz_pixel *dst, int32_t dst_stride, int32_t width, int32_t height )
+void kvz_intra_recon_lcu_luma(
+  encoder_state_t *const state,
+  int x,
+  int y,
+  int depth,
+  int8_t intra_mode,
+  cu_info_t *cur_cu,
+  lcu_t *lcu)
 {
-  int32_t x, y, dst_stride2, src_stride2;
-
-  // boundary pixels processing
-  dst[0] = ((src[-src_stride] + src[-1] + 2 * dst[0] + 2) >> 2);
-
-  for (x = 1; x < width; x++) {
-    dst[x] = ((src[x - src_stride] +  3 * dst[x] + 2) >> 2);
-  }
-  for ( y = 1, dst_stride2 = dst_stride, src_stride2 = src_stride-1;
-        y < height; y++, dst_stride2+=dst_stride, src_stride2+=src_stride ) {
-    dst[dst_stride2] = ((src[src_stride2] + 3 * dst[dst_stride2] + 2) >> 2);
-  }
-  return;
-}
-
-/**
- * \brief Function for deriving planar intra prediction.
- * \param src source pixel array
- * \param srcstride source width
- * \param width block size to predict
- * \param dst destination buffer for prediction
- * \param dststride destination width
-
-  This function derives the prediction samples for planar mode (intra coding).
-*/
-void kvz_intra_get_planar_pred(const kvz_pixel* src, int32_t srcstride, uint32_t width, kvz_pixel* dst, int32_t dststride)
-{
-  int32_t k, l, bottom_left, top_right;
-  int32_t hor_pred;
-  int32_t left_column[LCU_WIDTH+1], top_row[LCU_WIDTH+1], bottom_row[LCU_WIDTH+1], right_column[LCU_WIDTH+1];
-  uint32_t blk_size = width;
-  uint32_t offset_2d = width;
-  uint32_t shift_1d = kvz_g_convert_to_bit[ width ] + 2;
-  uint32_t shift_2d = shift_1d + 1;
-
-  // Get left and above reference column and row
-  for (k = 0; k < (int32_t)blk_size + 1; k++) {
-    top_row[k] = src[k - srcstride];
-    left_column[k] = src[k * srcstride - 1];
-  }
-
-  // Prepare intermediate variables used in interpolation
-  bottom_left = left_column[blk_size];
-  top_right   = top_row[blk_size];
-  for (k = 0; k < (int32_t)blk_size; k++) {
-    bottom_row[k]   = bottom_left - top_row[k];
-    right_column[k] = top_right   - left_column[k];
-    top_row[k]      <<= shift_1d;
-    left_column[k]  <<= shift_1d;
-  }
-
-  // Generate prediction signal
-  for (k = 0; k < (int32_t)blk_size; k++) {
-    hor_pred = left_column[k] + offset_2d;
-    for (l = 0; l < (int32_t)blk_size; l++) {
-      hor_pred += right_column[k];
-      top_row[l] += bottom_row[l];
-      dst[k * dststride + l] = (kvz_pixel)((hor_pred + top_row[l]) >> shift_2d);
-    }
-  }
-}
-
-void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu)
-{
-  const encoder_control_t * const encoder = state->encoder_control;
   const vector2d_t lcu_px = { x & 0x3f, y & 0x3f };
   if (cur_cu == NULL) {
     cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
@@ -684,30 +622,33 @@ void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int d
 
     return;
   }
-  {
-    const uint32_t pic_width = state->tile->frame->width;
-    const uint32_t pic_height = state->tile->frame->height;
 
-    // Pointers to reconstruction arrays
-    kvz_pixel *recbase_y = &lcu->rec.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
+  // Perform intra prediction and put the result in the correct place in the lcu.
+  vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
+  vector2d_t luma_px = { x, y };
+  kvz_intra_references refs;
+  const int_fast8_t log2_width = kvz_g_convert_to_bit[width] + 2;
+  kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs);
 
-    kvz_pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
-    kvz_pixel *rec_shift  = &rec[width * 2 + 8 + 1];
+  kvz_pixel pred[32 * 32];
+  kvz_intra_predict(&refs, log2_width, intra_mode, COLOR_Y, pred);
+  
+  kvz_pixel *block_in_lcu = &lcu->rec.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
+  kvz_pixels_blit(pred, block_in_lcu, width, width, width, LCU_WIDTH);
 
-    int32_t rec_stride = LCU_WIDTH;
-
-    kvz_intra_build_reference_border(encoder, x, y,(int16_t)width * 2 + 8, rec, (int16_t)width * 2 + 8, 0,
-                                 pic_width, pic_height, lcu);
-    kvz_intra_recon(encoder, rec_shift, width * 2 + 8,
-                width, recbase_y, rec_stride, intra_mode, 0);
-
-    kvz_quantize_lcu_luma_residual(state, x, y, depth, cur_cu, lcu);
-  }
+  kvz_quantize_lcu_luma_residual(state, x, y, depth, cur_cu, lcu);
 }
 
-void kvz_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu)
+
+void kvz_intra_recon_lcu_chroma(
+  encoder_state_t *const state,
+  int x,
+  int y,
+  int depth,
+  int8_t intra_mode,
+  cu_info_t *cur_cu,
+  lcu_t *lcu)
 {
-  const encoder_control_t * const encoder = state->encoder_control;
   const vector2d_t lcu_px = { x & 0x3f, y & 0x3f };
   const int8_t width = LCU_WIDTH >> depth;
   const int8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
@@ -739,44 +680,35 @@ void kvz_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int
     return;
   }
 
-  {
-    const uint32_t pic_width = state->tile->frame->width;
-    const uint32_t pic_height = state->tile->frame->height;
+  if (!(x & 4 || y & 4)) {
+    const int_fast8_t log2_width_c = kvz_g_convert_to_bit[width_c] + 2;
+    const vector2d_t luma_px = { x, y };
+    const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
 
-    // Pointers to reconstruction arrays
-    kvz_pixel *recbase_u = &lcu->rec.u[lcu_px.x/2 + (lcu_px.y * LCU_WIDTH)/4];
-    kvz_pixel *recbase_v = &lcu->rec.v[lcu_px.x/2 + (lcu_px.y * LCU_WIDTH)/4];
+    // Intra predict U-plane and put the result in lcu buffer.
+    {
+      kvz_intra_references refs;
+      kvz_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs);
 
-    kvz_pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
+      kvz_pixel pred[32 * 32];
+      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_U, pred);
 
-    int32_t rec_stride = LCU_WIDTH;
-
-    // Reconstruct chroma.
-    if (!(x & 4 || y & 4)) {
-      kvz_pixel *rec_shift_c  = &rec[width_c * 2 + 8 + 1];
-      kvz_intra_build_reference_border(encoder, x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 1,
-                                   pic_width/2, pic_height/2, lcu);
-      kvz_intra_recon(encoder,
-                  rec_shift_c,
-                  width_c * 2 + 8,
-                  width_c,
-                  recbase_u,
-                  rec_stride >> 1,
-                  intra_mode,
-                  1);
-
-      kvz_intra_build_reference_border(encoder, x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 2,
-                                   pic_width/2, pic_height/2, lcu);
-      kvz_intra_recon(encoder,
-                  rec_shift_c,
-                  width_c * 2 + 8,
-                  width_c,
-                  recbase_v,
-                  rec_stride >> 1,
-                  intra_mode,
-                  2);
-
-      kvz_quantize_lcu_chroma_residual(state, x, y, depth, cur_cu, lcu);
+      kvz_pixel *pu_in_lcu = &lcu->rec.u[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4];
+      kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);
     }
+
+    // Intra predict V-plane and put the result in lcu buffer.
+    {
+      kvz_intra_references refs;
+      kvz_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs);
+      
+      kvz_pixel pred[32 * 32];
+      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_V, pred);
+
+      kvz_pixel *pu_in_lcu = &lcu->rec.v[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4];
+      kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);
+    }
+
+    kvz_quantize_lcu_chroma_residual(state, x, y, depth, cur_cu, lcu);
   }
 }
diff --git a/src/intra.h b/src/intra.h
index 845136e0..02750619 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -27,29 +27,91 @@
 
 #include "global.h"
 
-#include "image.h"
-#include "encoder.h"
 #include "encoderstate.h"
 
-//void kvz_intra_set_block_mode(image* im,uint32_t x_ctb, uint32_t y_ctb, uint8_t depth, uint8_t mode, uint8_t part_mode);
+typedef struct {
+  kvz_pixel left[2 * 32 + 1];
+  kvz_pixel top[2 * 32 + 1];
+} kvz_intra_ref;
+typedef struct
+{
+  kvz_intra_ref ref;
+  kvz_intra_ref filtered_ref;
+  bool filtered_initialized;
+} kvz_intra_references;
 
-int8_t kvz_intra_get_dir_luma_predictor(uint32_t x, uint32_t y, int8_t* preds,
-                                    const cu_info_t* cur_cu, const cu_info_t* left_cu, const cu_info_t* above_cu);
-void kvz_intra_dc_pred_filtering(const kvz_pixel* src, int32_t src_stride, kvz_pixel* dst, int32_t dst_stride, int32_t width, int32_t height );
 
-void kvz_intra_build_reference_border(const encoder_control_t *encoder, int32_t x_luma, int32_t y_luma, int16_t out_width, kvz_pixel *dst, int32_t dst_stride, int8_t chroma, int32_t pic_width, int32_t pic_height, lcu_t *lcu);
-void kvz_intra_filter(kvz_pixel* ref, int32_t stride, int32_t width, int8_t mode);
+/**
+* \brief Function for deriving intra luma predictions
+* \param pic picture to use
+* \param x_cu x CU position (smallest CU)
+* \param y_cu y CU position (smallest CU)
+* \param preds output buffer for 3 predictions
+* \returns (predictions are found)?1:0
+*/
+int8_t kvz_intra_get_dir_luma_predictor(
+  const uint32_t x,
+  const uint32_t y,
+  int8_t *preds,
+  const cu_info_t *const cur_cu,
+  const cu_info_t *const left_cu,
+  const cu_info_t *const above_cu);
 
-/* Predictions */
-void kvz_intra_get_pred(const encoder_control_t * const encoder, const kvz_pixel *rec, const kvz_pixel *rec_filtered, int recstride, kvz_pixel *dst, int width, int mode, int is_chroma);
+/**
+* \brief Build the reference pixels used for intra prediction.
+* \param log2_width  Log2 of the block width, range 2..5 (4..32 pixels).
+* \param color    What color pixels to use.
+* \param luma_px  Luma coordinates of the prediction block.
+* \param pic_px   Picture dimensions in luma pixels.
+* \param lcu      LCU struct.
+* \param refs     Output reference pixels; index 0 of both the left and
+*                 the top array is the top-left pixel.
+*/
+void kvz_intra_build_reference(
+  const int_fast8_t log2_width,
+  const color_t color,
+  const vector2d_t *const luma_px,
+  const vector2d_t *const pic_px,
+  const lcu_t *const lcu,
+  kvz_intra_references *const refs);
 
-kvz_pixel kvz_intra_get_dc_pred(const kvz_pixel* pic, uint16_t pic_width, uint8_t width);
-void kvz_intra_get_planar_pred(const kvz_pixel* src,int32_t srcstride, uint32_t width, kvz_pixel* dst, int32_t dststride);
-void kvz_intra_get_angular_pred(const encoder_control_t *encoder, const kvz_pixel* src, int32_t src_stride, kvz_pixel* dst, int32_t dst_stride, int32_t width, int32_t dir_mode, int8_t filter);
+/**
+ * \brief Generate intra predictions.
+ * \param refs   Reference pixels used for the prediction.
+ * \param log2_width  Log2 of the width of the predicted block.
+ * \param mode   Intra mode used for the prediction.
+ * \param color  Color of the prediction.
+ * \param dst    Buffer for the predicted pixels.
+ */
+void kvz_intra_predict(
+  kvz_intra_references *refs,
+  int_fast8_t log2_width,
+  int_fast8_t mode,
+  color_t color,
+  kvz_pixel *dst);
 
-void kvz_intra_recon(const encoder_control_t *encoder, kvz_pixel* rec, int32_t rec_stride, uint32_t width, kvz_pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
+/**
+ * \brief Do a full intra prediction cycle on a CU in lcu for luma.
+ */
+void kvz_intra_recon_lcu_luma(
+  encoder_state_t *const state,
+  int x,
+  int y,
+  int depth,
+  int8_t intra_mode,
+  cu_info_t *cur_cu,
+  lcu_t *lcu);
 
-void kvz_intra_recon_lcu_luma(encoder_state_t *state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
-void kvz_intra_recon_lcu_chroma(encoder_state_t *state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
+/**
+* \brief Do a full intra prediction cycle on a CU in lcu for chroma.
+*/
+void kvz_intra_recon_lcu_chroma(
+  encoder_state_t *const state,
+  int x,
+  int y,
+  int depth,
+  int8_t intra_mode,
+  cu_info_t *cur_cu,
+  lcu_t *lcu);
 
 #endif
diff --git a/src/search_intra.c b/src/search_intra.c
index 39e2c032..09efa9b2 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -271,14 +271,14 @@ static double search_intra_trdepth(encoder_state_t * const state,
 static void search_intra_chroma_rough(encoder_state_t * const state,
                                       int x_px, int y_px, int depth,
                                       const kvz_pixel *orig_u, const kvz_pixel *orig_v, int16_t origstride,
-                                      const kvz_pixel *rec_u, const kvz_pixel *rec_v, int16_t recstride,
+                                      kvz_intra_references *refs_u, kvz_intra_references *refs_v,
                                       int8_t luma_mode,
                                       int8_t modes[5], double costs[5])
 {
-  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
-  if (!reconstruct_chroma) return;
+  assert(!(x_px & 4 || y_px & 4));
 
   const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
+  const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - (depth + 1), 2);
 
   for (int i = 0; i < 5; ++i) {
     costs[i] = 0;
@@ -287,16 +287,16 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   cost_pixel_nxn_func *const satd_func = kvz_pixels_get_satd_func(width);
   //cost_pixel_nxn_func *const sad_func = kvz_pixels_get_sad_func(width);
 
-  kvz_pixel _pred[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
+  kvz_pixel _pred[32 * 32 + SIMD_ALIGNMENT];
   kvz_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT);
 
-  kvz_pixel _orig_block[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
+  kvz_pixel _orig_block[32 * 32 + SIMD_ALIGNMENT];
   kvz_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT);
 
   kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
     if (modes[i] == luma_mode) continue;
-    kvz_intra_get_pred(state->encoder_control, rec_u, NULL, recstride, pred, width, modes[i], 1);
+    kvz_intra_predict(refs_u, log2_width_c, modes[i], COLOR_U, pred);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
@@ -304,7 +304,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
     if (modes[i] == luma_mode) continue;
-    kvz_intra_get_pred(state->encoder_control, rec_v, NULL, recstride, pred, width, modes[i], 2);
+    kvz_intra_predict(refs_v, log2_width_c, modes[i], COLOR_V, pred);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
@@ -343,41 +343,25 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
  */
 static int8_t search_intra_rough(encoder_state_t * const state, 
                                  kvz_pixel *orig, int32_t origstride,
-                                 kvz_pixel *rec, int16_t recstride,
-                                 int width, int8_t *intra_preds,
+                                 kvz_intra_references *refs,
+                                 int log2_width, int8_t *intra_preds,
                                  int8_t modes[35], double costs[35])
 {
+  assert(log2_width >= 2 && log2_width <= 5);
+  int_fast8_t width = 1 << log2_width;
   cost_pixel_nxn_func *satd_func = kvz_pixels_get_satd_func(width);
   cost_pixel_nxn_func *sad_func = kvz_pixels_get_sad_func(width);
 
   // Temporary block arrays
-  kvz_pixel _pred[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
+  kvz_pixel _pred[32 * 32 + SIMD_ALIGNMENT];
   kvz_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT);
   
-  kvz_pixel _orig_block[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT];
+  kvz_pixel _orig_block[32 * 32 + SIMD_ALIGNMENT];
   kvz_pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT);
-  
-  kvz_pixel rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
-
-  kvz_pixel *recf = &rec_filtered_temp[recstride + 1];
-
-  assert(width == 4 || width == 8 || width == 16 || width == 32);
 
   // Store original block for SAD computation
   kvz_pixels_blit(orig, orig_block, width, width, origstride, width);
 
-  // Generate filtered reference pixels.
-  {
-    int16_t x, y;
-    for (y = -1; y < recstride; y++) {
-      recf[y*recstride - 1] = rec[y*recstride - 1];
-    }
-    for (x = 0; x < recstride; x++) {
-      recf[x - recstride] = rec[x - recstride];
-    }
-    kvz_intra_filter(recf, recstride, width, 0);
-  }
-  
   int8_t modes_selected = 0;
   unsigned min_cost = UINT_MAX;
   unsigned max_cost = 0;
@@ -387,18 +371,15 @@ static int8_t search_intra_rough(encoder_state_t * const state,
   int offset;
   if (state->encoder_control->full_intra_search) {
     offset = 1;
-  } else if (width == 4) {
-    offset = 2;
-  } else if (width == 8) {
-    offset = 4;
   } else {
-    offset = 8;
+    static const int8_t offsets[4] = { 2, 4, 8, 8 };
+    offset = offsets[log2_width - 2];
   }
 
   // Calculate SAD for evenly spaced modes to select the starting point for 
   // the recursive search.
   for (int mode = 2; mode <= 34; mode += offset) {
-    kvz_intra_get_pred(state->encoder_control, rec, recf, recstride, pred, width, mode, 0);
+    kvz_intra_predict(refs, log2_width, mode, COLOR_Y, pred);
     costs[modes_selected] = get_cost(state, pred, orig_block, satd_func, sad_func, width);
     modes[modes_selected] = mode;
 
@@ -421,7 +402,7 @@ static int8_t search_intra_rough(encoder_state_t * const state,
       int8_t center_node = best_mode;
       int8_t mode = center_node - offset;
       if (mode >= 2) {
-        kvz_intra_get_pred(state->encoder_control, rec, recf, recstride, pred, width, mode, 0);
+        kvz_intra_predict(refs, log2_width, mode, COLOR_Y, pred);
         costs[modes_selected] = get_cost(state, pred, orig_block, satd_func, sad_func, width);
         modes[modes_selected] = mode;
         if (costs[modes_selected] < best_cost) {
@@ -433,7 +414,7 @@ static int8_t search_intra_rough(encoder_state_t * const state,
 
       mode = center_node + offset;
       if (mode <= 34) {
-        kvz_intra_get_pred(state->encoder_control, rec, recf, recstride, pred, width, mode, 0);
+        kvz_intra_predict(refs, log2_width, mode, COLOR_Y, pred);
         costs[modes_selected] = get_cost(state, pred, orig_block, satd_func, sad_func, width);
         modes[modes_selected] = mode;
         if (costs[modes_selected] < best_cost) {
@@ -460,7 +441,7 @@ static int8_t search_intra_rough(encoder_state_t * const state,
     }
 
     if (!has_mode) {
-      kvz_intra_get_pred(state->encoder_control, rec, recf, recstride, pred, width, mode, 0);
+      kvz_intra_predict(refs, log2_width, mode, COLOR_Y, pred);
       costs[modes_selected] = get_cost(state, pred, orig_block, satd_func, sad_func, width);
       modes[modes_selected] = mode;
       ++modes_selected;
@@ -507,7 +488,6 @@ static int8_t search_intra_rough(encoder_state_t * const state,
 static int8_t search_intra_rdo(encoder_state_t * const state, 
                              int x_px, int y_px, int depth,
                              kvz_pixel *orig, int32_t origstride,
-                             kvz_pixel *rec, int16_t recstride,
                              int8_t *intra_preds,
                              int modes_to_check,
                              int8_t modes[35], double costs[35],
@@ -517,31 +497,14 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   const int width = LCU_WIDTH >> depth;
 
   kvz_pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1];
-  int rdo_mode;
-  int pred_mode;
-
-  kvz_pixel rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
-  kvz_pixel *recf = &rec_filtered_temp[recstride + 1];
-
-  // Generate filtered reference pixels.
-  {
-    int x, y;
-    for (y = -1; y < recstride; y++) {
-      recf[y*recstride - 1] = rec[y*recstride - 1];
-    }
-    for (x = 0; x < recstride; x++) {
-      recf[x - recstride] = rec[x - recstride];
-    }
-    kvz_intra_filter(recf, recstride, width, 0);
-  }
 
   kvz_pixels_blit(orig, orig_block, width, width, origstride, width);
 
   // Check that the predicted modes are in the RDO mode list
   if (modes_to_check < 35) {
-    for (pred_mode = 0; pred_mode < 3; pred_mode++) {
+    for (int pred_mode = 0; pred_mode < 3; pred_mode++) {
       int mode_found = 0;
-      for (rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode++) {
+      for (int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode++) {
         if (intra_preds[pred_mode] == modes[rdo_mode]) {
           mode_found = 1;
           break;
@@ -555,42 +518,27 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     }
   }
 
-  for(rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
+  for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
     int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds);
     costs[rdo_mode] = rdo_bitcost * (int)(state->global->cur_lambda_cost + 0.5);
-#if 0
-    if (width != 4 && tr_depth == depth) {
-      // This code path has been disabled for now because it increases bdrate
-      // by 1-2 %. Possibly due to not taking chroma into account during luma
-      // mode search. Enabling separate chroma search compensates a little,
-      // but not enough.
 
-      // The idea for this code path is, that it would do the same thing as
-      // the more general search_intra_trdepth, but would only handle cases
-      // where transform split or transform skip don't need to be handled.
-      kvz_intra_get_pred(state->encoder_control, rec, recf, recstride, pred, width, modes[rdo_mode], 0);
-      costs[rdo_mode] += kvz_rdo_cost_intra(state, pred, orig_block, width, modes[rdo_mode], width == 4 ? 1 : 0);
-    } else 
-#endif
-	{
-      // Perform transform split search and save mode RD cost for the best one.
-      cu_info_t pred_cu;
-      pred_cu.depth = depth;
-      pred_cu.type = CU_INTRA;
-      pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
-      pred_cu.intra[0].mode = modes[rdo_mode];
-      pred_cu.intra[1].mode = modes[rdo_mode];
-      pred_cu.intra[2].mode = modes[rdo_mode];
-      pred_cu.intra[3].mode = modes[rdo_mode];
-      pred_cu.intra[0].mode_chroma = modes[rdo_mode];
-      FILL(pred_cu.cbf, 0);
+    // Perform transform split search and save mode RD cost for the best one.
+    cu_info_t pred_cu;
+    pred_cu.depth = depth;
+    pred_cu.type = CU_INTRA;
+    pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
+    pred_cu.intra[0].mode = modes[rdo_mode];
+    pred_cu.intra[1].mode = modes[rdo_mode];
+    pred_cu.intra[2].mode = modes[rdo_mode];
+    pred_cu.intra[3].mode = modes[rdo_mode];
+    pred_cu.intra[0].mode_chroma = modes[rdo_mode];
+    FILL(pred_cu.cbf, 0);
 
-      // Reset transform split data in lcu.cu for this area.
-      kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
+    // Reset transform split data in lcu.cu for this area.
+    kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
 
-      double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
-      costs[rdo_mode] += mode_cost;
-    }
+    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
+    costs[rdo_mode] += mode_cost;
   }
 
   // The best transform split hierarchy is not saved anywhere, so to get the
@@ -697,7 +645,6 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                               const int x_px, const int y_px,
                               const int depth, lcu_t *lcu)
 {
-  const videoframe_t * const frame = state->tile->frame;
   const vector2d_t lcu_px = { x_px & 0x3f, y_px & 0x3f };
   const vector2d_t lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
   const int cu_index = LCU_CU_OFFSET + lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH;
@@ -726,23 +673,15 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
   // FIXME: It might make more sense to only disable rough search if
   // num_modes is 0.is 0.
   if (num_modes != 1 && num_modes != 5) {
-    kvz_pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
-    kvz_pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)];
+    const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
+    const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
+    const vector2d_t luma_px = { x_px, y_px };
 
-    const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
-    const int16_t rec_stride = width_c * 2 + 8;
-    const int16_t out_stride = rec_stride;
+    kvz_intra_references refs_u;
+    kvz_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs_u);
 
-    kvz_intra_build_reference_border(state->encoder_control,
-                                 x_px, y_px, out_stride,
-                                 rec_u, rec_stride, COLOR_U,
-                                 frame->width / 2, frame->height / 2,
-                                 lcu);
-    kvz_intra_build_reference_border(state->encoder_control,
-                                 x_px, y_px, out_stride,
-                                 rec_v, rec_stride, COLOR_V,
-                                 frame->width / 2, frame->height / 2,
-                                 lcu);
+    kvz_intra_references refs_v;
+    kvz_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs_v);
 
     vector2d_t lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 };
     kvz_pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
@@ -750,7 +689,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
 
     search_intra_chroma_rough(state, x_px, y_px, depth,
                               ref_u, ref_v, LCU_WIDTH_C,
-                              &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride,
+                              &refs_u, &refs_v,
                               intra_mode, modes, costs);
   }
 
@@ -771,16 +710,15 @@ double kvz_search_cu_intra(encoder_state_t * const state,
                            const int x_px, const int y_px,
                            const int depth, lcu_t *lcu)
 {
-  const videoframe_t * const frame = state->tile->frame;
   const vector2d_t lcu_px = { x_px & 0x3f, y_px & 0x3f };
   const vector2d_t lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
   const int8_t cu_width = (LCU_WIDTH >> (depth));
   const int cu_index = LCU_CU_OFFSET + lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH;
+  const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth;
 
   cu_info_t *cur_cu = &lcu->cu[cu_index];
 
-  kvz_pixel rec_buffer[(LCU_WIDTH * 2 + 1) * (LCU_WIDTH * 2 + 1)];
-  kvz_pixel *cu_in_rec_buffer = &rec_buffer[cu_width * 2 + 8 + 1];
+  kvz_intra_references refs;
 
   int8_t candidate_modes[3];
 
@@ -798,12 +736,9 @@ double kvz_search_cu_intra(encoder_state_t * const state,
   kvz_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu);
 
   if (depth > 0) {
-  // Build reconstructed block to use in prediction with extrapolated borders
-  kvz_intra_build_reference_border(state->encoder_control, x_px, y_px, cu_width * 2 + 8,
-                               rec_buffer, cu_width * 2 + 8, 0,
-                               frame->width,
-                               frame->height,
-                               lcu);
+    const vector2d_t luma_px = { x_px, y_px };
+    const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
+    kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs);
   }
 
   int8_t modes[35];
@@ -817,10 +752,10 @@ double kvz_search_cu_intra(encoder_state_t * const state,
   bool skip_rough_search = (depth == 0 || state->encoder_control->rdo >= 3);
   if (!skip_rough_search) {
     number_of_modes = search_intra_rough(state,
-                                              ref_pixels, LCU_WIDTH,
-                                              cu_in_rec_buffer, cu_width * 2 + 8,
-                                              cu_width, candidate_modes,
-                                              modes, costs);
+                                         ref_pixels, LCU_WIDTH,
+                                         &refs,
+                                         log2_width, candidate_modes,
+                                         modes, costs);
   } else {
     number_of_modes = 35;
     for (int i = 0; i < number_of_modes; ++i) {
@@ -849,7 +784,6 @@ double kvz_search_cu_intra(encoder_state_t * const state,
     number_of_modes = search_intra_rdo(state,
                       x_px, y_px, depth,
                       ref_pixels, LCU_WIDTH,
-                      cu_in_rec_buffer, cu_width * 2 + 8,
                       candidate_modes,
                       num_modes_to_check,
                       modes, costs, lcu);