Add reimplemented intra prediction code

The new code lives alongside the old versions for now to help with debugging. The main difference with the new versions is that they output width**2 blocks and take two width*2+1 arrays of reference samples, instead of the (2*width+8)**2 blocks the old ones use. This should make the interface clearer and the memory footprint smaller. Also heavily commented angular prediction, so hopefully Ari L. will have an easier time with a SIMD implementation.
2024-11-23 18:14:06 +00:00 · 2015-10-03 03:36:58 +03:00 · 2015-10-03 03:36:58 +03:00 · cd2f1797bf
parent 115756b9d7
commit cd2f1797bf
3 changed files with 553 additions and 7 deletions
--- a/src/global.h
+++ b/src/global.h
@ -124,6 +124,7 @@ typedef int16_t coeff_t;
 #define MAX(a,b) (((a)>(b))?(a):(b))
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define CLIP(low,high,value) MAX((low),MIN((high),(value)))
+#define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
 #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
 #define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
--- a/src/intra.c
+++ b/src/intra.c
@ -217,11 +217,281 @@ void kvz_intra_filter(kvz_pixel *ref, int32_t stride,int32_t width, int8_t mode)
 }


+static void intra_filter_reference(int_fast8_t log2_width, kvz_intra_references *refs)
+{
+  if (refs->filtered_initialized) {
+    return;
+  } else {
+    refs->filtered_initialized = true;
+  }
+
+  const int_fast8_t ref_width = 2 * (1 << log2_width) + 1;
+  kvz_intra_ref *ref = &refs->ref;
+  kvz_intra_ref *filtered_ref = &refs->filtered_ref;
+
+  filtered_ref->left[0] = (ref->left[1] + 2 * ref->left[0] + ref->top[1] + 2) / 4;
+  filtered_ref->top[0] = filtered_ref->left[0];
+
+  for (int_fast8_t y = 1; y < ref_width - 1; ++y) {
+    kvz_pixel *p = &ref->left[y];
+    filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) / 4;
+  }
+  filtered_ref->left[ref_width - 1] = ref->left[ref_width - 1];
+
+  for (int_fast8_t x = 1; x < ref_width - 1; ++x) {
+    kvz_pixel *p = &ref->top[x];
+    filtered_ref->top[x] = (p[-1] + 2 * p[0] + p[1] + 2) / 4;
+  }
+  filtered_ref->top[ref_width - 1] = ref->top[ref_width - 1];
+}
+
+
+static void post_process_intra_angular(
+  unsigned width,
+  unsigned stride,
+  const kvz_pixel *ref,
+  kvz_pixel *block)
+{
+  kvz_pixel ref2 = ref[0];
+  for (unsigned i = 0; i < width; i++) {
+    kvz_pixel val = block[i * stride];
+    kvz_pixel ref1 = ref[i + 1];
+    block[i * stride] = CLIP_TO_PIXEL(val + ((ref1 - ref2) >> 1));
+  }
+}
+
+
 /**
- * \param rec  Reference pixel. 0 points to unfiltered and 1 to filtered.
- * \param recstride  Stride for rec pixel arrays.
- * \param dst
+ * \brief Generate angular predictions.
+ * \param log2_width    Log2 of width, range 2..5.
+ * \param intra_mode    Angular mode in range 2..34.
+ * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
+ * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
+ * \param dst           Buffer of size width*width.
 */
+static void kvz_intra_pred_angular(
+  const int_fast8_t log2_width,
+  const int_fast8_t intra_mode,
+  const kvz_pixel *const in_ref_above,
+  const kvz_pixel *const in_ref_left,
+  kvz_pixel *const dst)
+{
+  assert(log2_width >= 2 && log2_width <= 5);
+  assert(intra_mode >= 2 && intra_mode <= 34);
+
+  static const int8_t modedisp2sampledisp[9] = {0, 2, 5, 9, 13, 17, 21, 26, 32};
+  static const int16_t modedisp2invsampledisp[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 256}; // (256 * 32) / sampledisp
+
+  // Temporary buffer for modes 11-25.
+  // It only needs to be big enough to hold indices from -width to width-1.
+  kvz_pixel tmp_ref[2 * 32];
+  const int_fast8_t width = 1 << log2_width;
+
+  // Whether to swap references to always project on the left reference row.
+  const bool vertical_mode = intra_mode >= 18;
+  // Signed distance of the mode from the pure vertical (26) or horizontal (10) mode.
+  const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
+  // Sample displacement per column in fractions of 32.
+  const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
+
+  // Pointer for the reference we are interpolating from.
+  const kvz_pixel *ref_main;
+  // Pointer for the other reference.
+  const kvz_pixel *ref_side;
+
+  // Set ref_main and ref_side such that, when indexed with 0, they point to
+  // index 0 in block coordinates.
+  if (sample_disp < 0) {
+    // Negative sample_disp means, we need to use both references.
+
+    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
+    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
+
+    // Move the reference pixels to start from the middle to the later half of
+    // the tmp_ref, so there is room for negative indices.
+    for (int_fast8_t x = -1; x < width; ++x) {
+      tmp_ref[x + width] = ref_main[x];
+    }
+    // Get a pointer to block index 0 in tmp_ref.
+    ref_main = &tmp_ref[width];
+
+    // Extend the side reference to the negative indices of main reference.
+    int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
+    int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
+    int_fast8_t most_negative_index = (width * sample_disp) >> 5;
+    for (int_fast8_t x = -2; x >= most_negative_index; --x) {
+      col_sample_disp += inv_abs_sample_disp;
+      int_fast8_t side_index = col_sample_disp >> 8;
+      tmp_ref[x + width] = ref_side[side_index - 1];
+    }
+  } else {
+    // sample_disp >= 0 means we don't need to refer to negative indices,
+    // which means we can just use the references as is.
+    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
+    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
+  }
+
+  if (sample_disp != 0) {
+    // The mode is not horizontal or vertical, we have to do interpolation.
+
+    int_fast16_t delta_pos = 0;
+    for (int_fast8_t y = 0; y < width; ++y) {
+      delta_pos += sample_disp;
+      int_fast8_t delta_int = delta_pos >> 5;
+      int_fast8_t delta_fract = delta_pos & (32 - 1);
+
+      if (delta_fract) {
+        // Do linear filtering
+        for (int_fast8_t x = 0; x < width; ++x) {
+          kvz_pixel ref1 = ref_main[x + delta_int];
+          kvz_pixel ref2 = ref_main[x + delta_int + 1];
+          dst[y * width + x] = ((32 - delta_fract) * ref1 + delta_fract * ref2 + 16) >> 5;
+        }
+      } else {
+        // Just copy the integer samples
+        for (int_fast8_t x = 0; x < width; x++) {
+          dst[y * width + x] = ref_main[x + delta_int];
+        }
+      }
+    }
+  } else {
+    // Mode is horizontal or vertical, just copy the pixels.
+
+    for (int_fast8_t y = 0; y < width; ++y) {
+      for (int_fast8_t x = 0; x < width; ++x) {
+        dst[y * width + x] = ref_main[x];
+      }
+    }
+  }
+
+  // Transpose the block if this was a horizontal mode.
+  if (!vertical_mode) {
+    for (int_fast8_t y = 0; y < width - 1; ++y) {
+      for (int_fast8_t x = y + 1; x < width; ++x) {
+        SWAP(dst[y * width + x], dst[x * width + y], kvz_pixel);
+      }
+    }
+  }
+}
+
+
+/**
+ * \brief Generate planar prediction.
+ * \param log2_width    Log2 of width, range 2..5.
+ * \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
+ * \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
+ * \param dst           Buffer of size width*width.
+ */
+static void kvz_intra_pred_planar(
+  const int_fast8_t log2_width,
+  const kvz_pixel *const ref_top,
+  const kvz_pixel *const ref_left,
+  kvz_pixel *const dst)
+{
+  assert(log2_width >= 2 && log2_width <= 5);
+
+  const int_fast8_t width = 1 << log2_width;
+  const kvz_pixel top_right = ref_top[width + 1];
+  const kvz_pixel bottom_left = ref_left[width + 1];
+
+#if 0
+  // Unoptimized version for reference.
+  for (int y = 0; y < width; ++y) {
+    for (int x = 0; x < width; ++x) {
+      int_fast16_t hor = (width - 1 - x) * ref_left[y + 1] + (x + 1) * top_right;
+      int_fast16_t ver = (width - 1 - y) * ref_top[x + 1] + (y + 1) * bottom_left;
+      dst[y * width + x] = (ver + hor + width) >> (log2_width + 1);
+    }
+  }
+#else
+  int_fast16_t top[32];
+  for (int i = 0; i < width; ++i) {
+    top[i] = ref_top[i + 1] << log2_width;
+  }
+
+  for (int y = 0; y < width; ++y) {
+    int_fast16_t hor = (ref_left[y + 1] << log2_width) + width;
+    for (int x = 0; x < width; ++x) {
+      hor += top_right - ref_left[y + 1];
+      top[x] += bottom_left - ref_top[x + 1];
+      dst[y * width + x] = (hor + top[x]) >> (log2_width + 1);
+    }
+  }
+#endif
+}
+
+
+/**
+* \brief Generate DC prediction without edge filtering.
+* \param log2_width    Log2 of width, range 2..5.
+* \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
+* \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
+* \param out_block     Buffer of size width*width.
+*/
+static void kvz_intra_pred_dc(
+  const int_fast8_t log2_width,
+  const kvz_pixel *const ref_top,
+  const kvz_pixel *const ref_left,
+  kvz_pixel *const out_block)
+{
+  int_fast8_t width = 1 << log2_width;
+
+  int_fast16_t sum = 0;
+  for (int_fast8_t i = 0; i < width; ++i) {
+    sum += ref_top[i + 1];
+    sum += ref_left[i + 1];
+  }
+
+  const kvz_pixel dc_val = (sum + width) >> (log2_width + 1);
+  const int_fast16_t block_size = 1 << (log2_width * 2);
+
+  for (int_fast16_t i = 0; i < block_size; ++i) {
+    out_block[i] = dc_val;
+  }
+}
+
+
+/**
+* \brief Generate intra DC prediction with post filtering applied.
+* \param log2_width    Log2 of width, range 2..5.
+* \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
+* \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
+* \param out_block     Buffer of size width*width.
+*/
+static void kvz_intra_pred_filtered_dc(
+  const int_fast8_t log2_width,
+  const kvz_pixel *const ref_top,
+  const kvz_pixel *const ref_left,
+  kvz_pixel *const out_block)
+{
+  assert(log2_width >= 2 && log2_width <= 5);
+
+  const int_fast8_t width = 1 << log2_width;
+
+  int_fast16_t sum = 0;
+  for (int_fast8_t i = 0; i < width; ++i) {
+    sum += ref_top[i + 1];
+    sum += ref_left[i + 1];
+  }
+
+  const kvz_pixel dc_val = (sum + width) >> (log2_width + 1);
+
+  // Filter top-left with ([1 2 1] / 4)
+  out_block[0] = (ref_left[1] + 2 * dc_val + ref_top[1] + 2) / 4;
+
+  // Filter rest of the boundary with ([1 3] / 4)
+  for (int_fast8_t x = 1; x < width; ++x) {
+    out_block[x] = (ref_top[x + 1] + 3 * dc_val + 2) / 4;
+  }
+  for (int_fast8_t y = 1; y < width; ++y) {
+    out_block[y * width] = (ref_left[y + 1] + 3 * dc_val + 2) / 4;
+    for (int_fast8_t x = 1; x < width; ++x) {
+      out_block[y * width + x] = dc_val;
+    }
+  }
+}
+
+
 void kvz_intra_get_pred(const encoder_control_t * const encoder, const kvz_pixel *rec, const kvz_pixel *rec_filtered, int recstride, kvz_pixel *dst, int width, int mode, int is_chroma)
 {
  const kvz_pixel *ref_pixels = rec;
@ -259,6 +529,56 @@ void kvz_intra_get_pred(const encoder_control_t * const encoder, const kvz_pixel
 }


+void kvz_intra_get_pred_new(
+  kvz_intra_references *refs,
+  int_fast8_t log2_width,
+  int_fast8_t mode,
+  color_t color,
+  kvz_pixel *dst)
+{
+  const int_fast8_t width = 1 << log2_width;
+
+  const kvz_intra_ref *used_ref = &refs->ref;
+  if (color != COLOR_Y || mode == 1 || width == 4) {
+    // For chroma, DC and 4x4 blocks, always use unfiltered reference.
+  } else if (mode == 0) {
+    // Otherwise, use filtered for planar.
+    used_ref = &refs->filtered_ref;
+  } else {
+    // Angular modes use smoothed reference pixels, unless the mode is close
+    // to being either vertical or horizontal.
+    int filter_threshold = kvz_intra_hor_ver_dist_thres[g_to_bits[width]];
+    int dist_from_vert_or_hor = MIN(abs(mode - 26), abs(mode - 10));
+    if (dist_from_vert_or_hor > filter_threshold) {
+      used_ref = &refs->filtered_ref;
+    }
+  }
+
+  if (used_ref == &refs->filtered_ref && !refs->filtered_initialized) {
+    intra_filter_reference(log2_width, refs);
+  }
+
+  if (mode == 0) {
+    kvz_intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst);
+  } else if (mode == 1) {
+    // Do extra post filtering for edge pixels of luma DC mode.
+    if (color == COLOR_Y && width < 32) {
+      kvz_intra_pred_filtered_dc(log2_width, used_ref->top, used_ref->left, dst);
+    } else {
+      kvz_intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst);
+    }
+  } else {
+    kvz_intra_pred_angular(log2_width, mode, used_ref->top, used_ref->left, dst);
+    if (color == COLOR_Y && width < 32) {
+      if (mode == 10) {
+        post_process_intra_angular(width, 1, used_ref->top, dst);
+      } else if (mode == 26) {
+        post_process_intra_angular(width, width, used_ref->left, dst);
+      }
+    }
+  }
+}
+

 /**
 * \brief Reconstruct intra block according to prediction
@ -269,7 +589,6 @@ void kvz_intra_get_pred(const encoder_control_t * const encoder, const kvz_pixel
 * \param dststride destination width
 * \param mode intra mode to use
 * \param chroma chroma-block flag
-
 */
 void kvz_intra_recon(const encoder_control_t * const encoder, kvz_pixel* rec, int32_t recstride, uint32_t width, kvz_pixel* dst, int32_t dststride, int8_t mode, int8_t chroma)
 {
@ -294,6 +613,22 @@ void kvz_intra_recon(const encoder_control_t * const encoder, kvz_pixel* rec, in
  kvz_pixels_blit(pred, dst, width, width, width, dststride);
 }

+void kvz_intra_recon_new(
+  kvz_intra_references *refs, 
+  uint32_t log2_width, 
+  kvz_pixel* dst, 
+  int32_t dst_stride, 
+  int8_t mode, 
+  color_t color)
+{
+  kvz_pixel pred[32 * 32];
+  const int_fast8_t width = 1 << log2_width;
+  
+  kvz_intra_get_pred_new(refs, log2_width, mode, color, pred);
+
+  kvz_pixels_blit(pred, dst, width, width, width, dst_stride);
+}
+
 /**
 * \brief Build top and left borders for a reference block.
 * \param pic picture to use as a source
@ -477,8 +812,171 @@ void kvz_intra_build_reference_border(const encoder_control_t * const encoder, i
  }
 }

-const int32_t kvz_ang_table[9]     = {0,    2,    5,   9,  13,  17,  21,  26,  32};
-const int32_t kvz_inv_ang_table[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 256}; // (256 * 32) / Angle
+
+void kvz_intra_build_reference(
+  const int_fast8_t log2_width,
+  const color_t color,
+  const vector2d_t *const luma_px,
+  const vector2d_t *const pic_px,
+  const lcu_t *const lcu,
+  kvz_intra_references *const refs)
+{
+  assert(log2_width >= 2 && log2_width <= 5);
+
+  // Tables for looking up the number of available intra reference pixels,
+  // indexed as [y / 4][x / 4] by the block's luma coordinate within an LCU.
+  // Generated by "tools/generate_ref_pixel_tables.py".
+  static const uint8_t num_ref_pixels_top[16][16] = {
+    { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 32, 28, 24, 20, 16, 12,  8,  4, 32, 28, 24, 20, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 32, 28, 24, 20, 16, 12,  8,  4, 32, 28, 24, 20, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
+    {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 }
+  };
+  static const uint8_t num_ref_pixels_left[16][16] = {
+    { 64,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
+    { 60,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
+    { 56,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
+    { 52,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
+    { 48,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
+    { 44,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
+    { 40,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 36,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 },
+    { 32,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
+    { 28,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
+    { 24,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
+    { 20,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
+    { 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
+    { 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
+    { 8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
+    { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
+  };
+
+  refs->filtered_initialized = false;
+  kvz_pixel *out_left_ref = &refs->ref.left[0];
+  kvz_pixel *out_top_ref = &refs->ref.top[0];
+
+  const kvz_pixel dc_val = 1 << (KVZ_BIT_DEPTH - 1);
+  const int is_chroma = color != COLOR_Y ? 1 : 0;
+  const int_fast8_t width = 1 << log2_width;
+
+  // Convert luma coordinates to chroma coordinates for chroma.
+  const vector2d_t lcu_px = {
+    luma_px->x % LCU_WIDTH,
+    luma_px->y % LCU_WIDTH
+  };
+  const vector2d_t px = {
+    lcu_px.x >> is_chroma,
+    lcu_px.y >> is_chroma,
+  };
+
+  // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0.
+  const kvz_pixel *left_ref = !color ? &lcu->left_ref.y[1] : (color == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
+  const kvz_pixel *top_ref = !color ? &lcu->top_ref.y[1] : (color == 1) ? &lcu->top_ref.u[1] : &lcu->top_ref.v[1];
+  const kvz_pixel *rec_ref = !color ? lcu->rec.y : (color == 1) ? lcu->rec.u : lcu->rec.v;
+
+  // Init top borders pointer to point to the correct place in the correct reference array.
+  const kvz_pixel *top_border;
+  if (px.y) {
+    top_border = &rec_ref[px.x + (px.y - 1) * (LCU_WIDTH >> is_chroma)];
+  } else {
+    top_border = &top_ref[px.x];
+  }
+
+  // Init left borders pointer to point to the correct place in the correct reference array.
+  const kvz_pixel *left_border;
+  int left_stride; // Distance between reference samples.
+  if (px.x) {
+    left_border = &rec_ref[px.x - 1 + px.y * (LCU_WIDTH >> is_chroma)];
+    left_stride = LCU_WIDTH >> is_chroma;
+  } else {
+    left_border = &left_ref[px.y];
+    left_stride = 1;
+  }
+
+  // Generate left reference.
+  if (luma_px->x > 0) {
+    // Get the number of reference pixels based on the PU coordinate within the LCU.
+    int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;
+
+    // Limit the number of available pixels based on block size and dimensions
+    // of the picture.
+    px_available_left = MIN(px_available_left, width * 2);
+    px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma);
+
+    // Copy pixels from coded CUs.
+    for (int i = 0; i < px_available_left; ++i) {
+      out_left_ref[i + 1] = left_border[i * left_stride];
+    }
+    // Extend the last pixel for the rest of the reference values.
+    kvz_pixel nearest_pixel = out_left_ref[px_available_left];
+    for (int i = px_available_left; i < width * 2; ++i) {
+      out_left_ref[i + 1] = nearest_pixel;
+    }
+  } else {
+    // If we are on the left edge, extend the first pixel of the top row.
+    kvz_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val;
+    for (int i = 0; i < width * 2; i++) {
+      out_left_ref[i + 1] = nearest_pixel;
+    }
+  }
+
+  // Generate top-left reference.
+  if (luma_px->x > 0 && luma_px->y > 0) {
+    // If the block is at an LCU border, the top-left must be copied from
+    // the border that points to the LCUs 1D reference buffer.
+    if (px.x == 0) {
+      out_left_ref[0] = left_border[-1 * left_stride];
+      out_top_ref[0] = left_border[-1 * left_stride];
+    } else {
+      out_left_ref[0] = top_border[-1];
+      out_top_ref[0] = top_border[-1];
+    }
+  } else {
+    // Copy reference clockwise.
+    out_left_ref[0] = out_left_ref[1];
+    out_top_ref[0] = out_left_ref[1];
+  }
+
+  // Generate top reference.
+  if (luma_px->y > 0) {
+    // Get the number of reference pixels based on the PU coordinate within the LCU.
+    int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;
+
+    // Limit the number of available pixels based on block size and dimensions
+    // of the picture.
+    px_available_top = MIN(px_available_top, width * 2);
+    px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma);
+
+    // Copy all the pixels we can.
+    for (int i = 0; i < px_available_top; ++i) {
+      out_top_ref[i + 1] = top_border[i];
+    }
+    // Extend the last pixel for the rest of the reference values.
+    kvz_pixel nearest_pixel = top_border[px_available_top - 1];
+    for (int i = px_available_top; i < width * 2; ++i) {
+      out_top_ref[i + 1] = nearest_pixel;
+    }
+  } else {
+    // Extend nearest pixel.
+    kvz_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val;
+    for (int i = 0; i < width * 2; i++) {
+      out_top_ref[i + 1] = nearest_pixel;
+    }
+  }
+}
+

 /**
 * \brief this functions constructs the angular intra prediction from border samples
@ -486,6 +984,9 @@ const int32_t kvz_inv_ang_table[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 25
 */
 void kvz_intra_get_angular_pred(const encoder_control_t * const encoder, const kvz_pixel* src, int32_t src_stride, kvz_pixel* dst, int32_t dst_stride, int32_t width, int32_t dir_mode, int8_t filter)
 {
+  static const int32_t kvz_ang_table[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+  static const int32_t kvz_inv_ang_table[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
+
  int32_t k,l;
  int32_t blk_size        = width;

--- a/src/intra.h
+++ b/src/intra.h
@ -27,10 +27,20 @@

 #include "global.h"

-#include "image.h"
 #include "encoder.h"
 #include "encoderstate.h"

+typedef struct {
+  kvz_pixel left[2 * 32 + 1];
+  kvz_pixel top[2 * 32 + 1];
+} kvz_intra_ref;
+typedef struct
+{
+  kvz_intra_ref ref;
+  kvz_intra_ref filtered_ref;
+  bool filtered_initialized;
+} kvz_intra_references;
+
 //void kvz_intra_set_block_mode(image* im,uint32_t x_ctb, uint32_t y_ctb, uint8_t depth, uint8_t mode, uint8_t part_mode);

 int8_t kvz_intra_get_dir_luma_predictor(uint32_t x, uint32_t y, int8_t* preds,
@ -47,6 +57,40 @@ kvz_pixel kvz_intra_get_dc_pred(const kvz_pixel* pic, uint16_t pic_width, uint8_
 void kvz_intra_get_planar_pred(const kvz_pixel* src,int32_t srcstride, uint32_t width, kvz_pixel* dst, int32_t dststride);
 void kvz_intra_get_angular_pred(const encoder_control_t *encoder, const kvz_pixel* src, int32_t src_stride, kvz_pixel* dst, int32_t dst_stride, int32_t width, int32_t dir_mode, int8_t filter);

+/**
+* \brief Build the unfiltered left and top intra reference arrays for a block.
+* \param log2_width  Log2 of width, range 2..5.
+* \param color    What color pixels to use.
+* \param luma_px  Luma coordinates of the prediction block.
+* \param pic_px   Picture dimensions in luma pixels.
+* \param lcu      LCU struct.
+* \param refs     Output. refs->ref.left and refs->ref.top are filled, index 0
+*                 is the top-left; filtered_initialized is reset to false.
+*/
+void kvz_intra_build_reference(
+  const int_fast8_t log2_width,
+  const color_t color,
+  const vector2d_t *const luma_px,
+  const vector2d_t *const pic_px,
+  const lcu_t *const lcu,
+  kvz_intra_references *const refs);
+
+/**
+ * \brief Generate intra predictions.
+ * \param refs   Reference pixels used for the prediction.     
+ * \param log2_width  Log2 of the width of the predicted block.
+ * \param mode   Intra mode used for the prediction.
+ * \param color  Color of the prediction.
+ * \param dst    Buffer for the predicted pixels.
+ */
+void kvz_intra_get_pred_new(
+  kvz_intra_references *refs,
+  int_fast8_t log2_width,
+  int_fast8_t mode,
+  color_t color,
+  kvz_pixel *dst);
+
+
 void kvz_intra_recon(const encoder_control_t *encoder, kvz_pixel* rec, int32_t rec_stride, uint32_t width, kvz_pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);

 void kvz_intra_recon_lcu_luma(encoder_state_t *state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);