[isp] Modify generic intra pred functions to handle non-square blocks.

2024-11-24 02:24:07 +00:00 · 2022-07-19 16:52:07 +03:00 · 2022-07-19 16:52:07 +03:00 · 7f844c643a
parent 4a8f007bcc
commit 7f844c643a
4 changed files with 179 additions and 92 deletions
--- a/src/intra.c
+++ b/src/intra.c
@ -197,6 +197,7 @@ int8_t uvg_intra_get_dir_luma_predictor(

 static void intra_filter_reference(
  int_fast8_t log2_width,
+  int_fast8_t log2_height,
  uvg_intra_references *refs)
 {
  if (refs->filtered_initialized) {
@ -206,6 +207,7 @@ static void intra_filter_reference(
  }

  const int_fast8_t ref_width = 2 * (1 << log2_width) + 1;
+  const int_fast8_t ref_height = 2 * (1 << log2_height) + 1;
  uvg_intra_ref *ref = &refs->ref;
  uvg_intra_ref *filtered_ref = &refs->filtered_ref;

@ -213,14 +215,13 @@ static void intra_filter_reference(
  filtered_ref->left[0] = (ref->left[1] + 2 * ref->left[0] + ref->top[1] + 2) >> 2;
  filtered_ref->top[0] = filtered_ref->left[0];

-  // TODO: use block height here instead of ref_width
  // Top to bottom
-  for (int_fast8_t y = 1; y < ref_width - 1; ++y) {
+  for (int_fast8_t y = 1; y < ref_height - 1; ++y) {
    uvg_pixel *p = &ref->left[y];
    filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) >> 2;
  }
  // Bottom left (not filtered) 
-  filtered_ref->left[ref_width - 1] = ref->left[ref_width - 1];
+  filtered_ref->left[ref_height - 1] = ref->left[ref_height - 1];

  // Left to right
  for (int_fast8_t x = 1; x < ref_width - 1; ++x) {
@ -234,36 +235,46 @@ static void intra_filter_reference(

 /**
 * \brief Generate dc prediction.
-* \param log2_width    Log2 of width, range 2..5.
+* \param cu_loc        CU location and size data.
+* \param color         Color channel.
 * \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
 * \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
 * \param dst           Buffer of size width*width.
 * \param multi_ref_idx Multi reference line index for use with MRL.
 */
 static void intra_pred_dc(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  const color_t color,
  const uvg_pixel *const ref_top,
  const uvg_pixel *const ref_left,
  uvg_pixel *const out_block,
  const uint8_t multi_ref_idx)
 {
-  int_fast8_t width = 1 << log2_width;
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;

  int_fast16_t sum = 0;
-  for (int_fast8_t i = 0; i < width; ++i) {
-    sum += ref_top[i + 1 + multi_ref_idx];
-    sum += ref_left[i + 1 + multi_ref_idx];
+  // Only one loop is done for non-square blocks.
+  // In case of non-square blocks, only the longer reference is summed.
+  if (width >= height) {
+    for (int_fast8_t i = 0; i < width; ++i) {
+      sum += ref_top[i + 1 + multi_ref_idx];
+    }
+  }
+  if (width <= height) {
+    for (int_fast8_t j = 0; j < height; ++j) {
+      sum += ref_left[j + 1 + multi_ref_idx];
+    }
  }
  
  // JVET_K0122
-  // TODO: take non-square blocks into account
-  const int denom     = width << 1;
+  const int denom     = width == height ? width << 1 : MAX(width, height);
  const int divShift  = uvg_math_floor_log2(denom);
  const int divOffset = denom >> 1;
  
  const uvg_pixel dc_val = (sum + divOffset) >> divShift;
  //const uvg_pixel dc_val = (sum + width) >> (log2_width + 1);
-  const int_fast16_t block_size = 1 << (log2_width * 2);
+  const int_fast16_t block_size = width * height;

  for (int_fast16_t i = 0; i < block_size; ++i) {
    out_block[i] = dc_val;
@ -901,31 +912,34 @@ static void mip_predict(
 static void intra_predict_regular(
  const encoder_state_t* const state,
  uvg_intra_references *refs,
-  int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  int_fast8_t mode,
  color_t color,
  uvg_pixel *dst,
  const uint8_t multi_ref_idx)
 {
-  const int_fast8_t width = 1 << log2_width;
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width = uvg_g_convert_to_bit[width] + 2;
+  const int log2_height = uvg_g_convert_to_bit[height] + 2;
  const uvg_config *cfg = &state->encoder_control->cfg;

  // MRL only for luma
  uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0;

  const uvg_intra_ref *used_ref = &refs->ref;
-  if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || width == 4 || multi_ref_index) {
+  if (cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || (width == 4 && height == 4) || multi_ref_index) {
    // For chroma, DC and 4x4 blocks, always use unfiltered reference.
  } else if (mode == 0) {
    // Otherwise, use filtered for planar.
-    if (width * width > 32) {
+    if (width * height > 32) {
      used_ref = &refs->filtered_ref;
    }
  } else {
    // Angular modes use smoothed reference pixels, unless the mode is close
    // to being either vertical or horizontal.
    static const int uvg_intra_hor_ver_dist_thres[8] = {24, 24, 24, 14, 2, 0, 0, 0 };
-    int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_width) >> 1];
+    int filter_threshold = uvg_intra_hor_ver_dist_thres[(log2_width + log2_height) >> 1];
    int dist_from_vert_or_hor = MIN(abs(mode - 50), abs(mode - 18));
    if (dist_from_vert_or_hor > filter_threshold) {

@ -939,15 +953,15 @@ static void intra_predict_regular(
  }

  if (used_ref == &refs->filtered_ref && !refs->filtered_initialized) {
-    intra_filter_reference(log2_width, refs);
+    intra_filter_reference(log2_width, log2_height, refs);
  }

  if (mode == 0) {
-    uvg_intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst);
+    uvg_intra_pred_planar(cu_loc, color, used_ref->top, used_ref->left, dst);
  } else if (mode == 1) {
-    intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst, multi_ref_index);
+    intra_pred_dc(cu_loc, color, used_ref->top, used_ref->left, dst, multi_ref_index);
  } else {
-    uvg_angular_pred(log2_width, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index);
+    uvg_angular_pred(cu_loc, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index);
  }

  // pdpc
@ -1407,7 +1421,7 @@ void uvg_intra_predict(
      mip_predict(state, refs, width, height, dst, intra_mode, data->pred_cu.intra.mip_is_transposed);
    }
    else {
-      intra_predict_regular(state, refs, uvg_g_convert_to_bit[width] + 2, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx);
+      intra_predict_regular(state, refs, cu_loc, intra_mode, color, dst, data->pred_cu.intra.multi_ref_idx);
    }
  }
  else {
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@ -42,10 +42,9 @@
 #include "strategyselector.h"
 #include "strategies/missing-intel-intrinsics.h"

-
 /**
 * \brief Generate angular predictions.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU locationand size data.
 * \param intra_mode    Angular mode in range 2..34.
 * \param channel_type  Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
@ -54,7 +53,7 @@
 * \param multi_ref_idx Reference line index for use with MRL.
 */
 static void uvg_angular_pred_avx2(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  const int_fast8_t intra_mode,
  const int_fast8_t channel_type,
  const uvg_pixel *const in_ref_above,
@ -62,8 +61,12 @@ static void uvg_angular_pred_avx2(
  uvg_pixel *const dst,
  const uint8_t multi_ref_idx)
 {
-  
-  assert(log2_width >= 2 && log2_width <= 5);
+  const int width = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width = uvg_g_convert_to_bit[width] + 2;
+  const int log2_height = uvg_g_convert_to_bit[height] + 2;
+
+  assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
  assert(intra_mode >= 2 && intra_mode <= 66);

  // TODO: implement handling of MRL
@ -142,7 +145,6 @@ static void uvg_angular_pred_avx2(
  //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE:IDX] = { 0 };
  uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
  uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  const int_fast32_t width = 1 << log2_width;

  int32_t pred_mode = intra_mode; // ToDo: handle WAIP

@ -498,20 +500,26 @@ static void uvg_angular_pred_avx2(

 /**
 * \brief Generate planar prediction.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
+ * \param color         Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
 * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
 * \param dst           Buffer of size width*width.
 */
 static void uvg_intra_pred_planar_avx2(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  color_t color,
  const uint8_t *const ref_top,
  const uint8_t *const ref_left,
  uint8_t *const dst)
 {
-  assert(log2_width >= 2 && log2_width <= 5);
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width = uvg_g_convert_to_bit[width] + 2;
+  const int log2_height = uvg_g_convert_to_bit[height] + 2;
+
+  assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));

-  const int_fast8_t width = 1 << log2_width;
  const uint8_t top_right = ref_top[width + 1];
  const uint8_t bottom_left = ref_left[width + 1];

--- a/src/strategies/generic/intra-generic.c
+++ b/src/strategies/generic/intra-generic.c
@ -34,6 +34,7 @@

 #include <stdlib.h>

+#include "cu.h"
 #include "intra.h"
 #include "uvg266.h"
 #include "strategyselector.h"
@ -42,15 +43,16 @@

 /**
 * \brief Generate angular predictions.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
 * \param intra_mode    Angular mode in range 2..34.
+ * \param channel_type  Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
- * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
+ * \param in_ref_left   Pointer to -1 index of left reference, length=height*2+1.
 * \param dst           Buffer of size width*width.
 * \param multi_ref_idx Multi reference line index for use with MRL.
 */
 static void uvg_angular_pred_generic(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  const int_fast8_t intra_mode,
  const int_fast8_t channel_type,
  const uvg_pixel *const in_ref_above,
@ -58,8 +60,12 @@ static void uvg_angular_pred_generic(
  uvg_pixel *const dst,
  const uint8_t multi_ref_idx)
 {
+  const int width  = channel_type == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = channel_type == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width = uvg_g_convert_to_bit[width] + 2;
+  const int log2_height = uvg_g_convert_to_bit[height] + 2;
  
-  assert(log2_width >= 2 && log2_width <= 5);
+  assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));
  assert(intra_mode >= 2 && intra_mode <= 66);

  static const int16_t modedisp2sampledisp[32] = { 0,    1,    2,    3,    4,    6,     8,   10,   12,   14,   16,   18,   20,   23,   26,   29,   32,   35,   39,  45,  51,  57,  64,  73,  86, 102, 128, 171, 256, 341, 512, 1024 };
@ -107,9 +113,8 @@ static void uvg_angular_pred_generic(

  // TODO: check the correct size for these arrays when MRL is used
  //uvg_pixel tmp_ref[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  uvg_pixel temp_main[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  uvg_pixel temp_side[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
-  const int_fast32_t width = 1 << log2_width;
+  uvg_pixel temp_above[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };
+  uvg_pixel temp_left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX] = { 0 };

  uint32_t pred_mode = intra_mode; // ToDo: handle WAIP

@ -124,7 +129,7 @@ static void uvg_angular_pred_generic(
  // Sample displacement per column in fractions of 32.
  const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
  
-  // TODO: replace latter width with height
+  // ISP_TODO: replace latter width with height
  int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]);

  // Pointer for the reference we are interpolating from.
@ -136,19 +141,32 @@ static void uvg_angular_pred_generic(
  // index 0 in block coordinates.
  if (sample_disp < 0) {

-    // TODO: for non-square blocks, separate loops for x and y is needed
-    for (int i = 0; i <= width + 1 + multi_ref_index; i++) {
-      temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
-      temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
-    }
+    // ISP_TODO: might be able to use memcpy instead of loops here, should be a bit faster.
+    /*if (vertical_mode) {
+      for (int i = 0; i <= width + 1 + multi_ref_index; i++) {
+        temp_main[width + i] = in_ref_above[i];
+      }
+      for (int j = 0; j <= height + 1 + multi_ref_index; j++) {
+        temp_side[height + j] = in_ref_left[j];
+      }
+    } else {
+      for (int i = 0; i <= width + 1 + multi_ref_index; i++) {
+        temp_side[width + i] = in_ref_above[i];
+      }
+      for (int j = 0; j <= height + 1 + multi_ref_index; j++) {
+        temp_main[height + j] = in_ref_left[j];
+      }
+    }*/
+    memcpy(&temp_above[height], &in_ref_above[0], (width + 2 + multi_ref_index) * sizeof(uvg_pixel));
+    memcpy(&temp_left[width], &in_ref_left[0], (height + 2 + multi_ref_index) * sizeof(uvg_pixel));

-    // TODO: take into account non-square blocks
-    ref_main = temp_main + width;
-    ref_side = temp_side + width;
+    ref_main = vertical_mode ? temp_above + height : temp_left + width;
+    ref_side = vertical_mode ? temp_left + width : temp_above + height;

    // TODO: for non square blocks, need to check if width or height is used for reference extension
-    for (int i = -width; i <= -1; i++) {
-      ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)];
+    int size_side = vertical_mode ? height : width;
+    for (int i = -size_side; i <= -1; i++) {
+      ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)];
    }

    //const uint32_t index_offset = width + 1;
@ -186,23 +204,27 @@ static void uvg_angular_pred_generic(
  else {
    
    // TODO: again, separate loop needed for non-square blocks
-    for (int i = 0; i <= (width << 1) + multi_ref_index; i++) {
+    /*for (int i = 0; i <= (width << 1) + multi_ref_index; i++) {
      temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
      temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
-    }
+    }*/
+    memcpy(&temp_above[0], &in_ref_above[0], ((width << 1) + 1 + multi_ref_index) * sizeof(uvg_pixel));
+    memcpy(&temp_left[0], &in_ref_left[0], ((height << 1) + 1 + multi_ref_index) * sizeof(uvg_pixel));
+
+    ref_main = vertical_mode ? temp_above : temp_left;
+    ref_side = vertical_mode ? temp_left : temp_above;

    // TODO: this code block will need to change also when non-square blocks are used
-    // const int log2_ratio = 0;
-    const int s = 0;
+    const int log2_ratio = log2_width - log2_height;
+    const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio);
    const int max_index = (multi_ref_index << s) + 2;
-    const int ref_length = width << 1;
-    const uvg_pixel val = temp_main[ref_length + multi_ref_index];
+    const int ref_length = vertical_mode ? width << 1 : height << 1;
+    const uvg_pixel val = ref_main[ref_length + multi_ref_index];
    for (int j = 1; j <= max_index; j++) {
-      temp_main[ref_length + multi_ref_index +  j] = val;
+      ref_main[ref_length + multi_ref_index +  j] = val;
    }

-    ref_main = temp_main;
-    ref_side = temp_side;
+    
    //// sample_disp >= 0 means we don't need to refer to negative indices,
    //// which means we can just use the references as is.
    //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
@ -222,7 +244,7 @@ static void uvg_angular_pred_generic(
  if (sample_disp != 0) {
    // The mode is not horizontal or vertical, we have to do interpolation.

-    for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < width; ++y, delta_pos += sample_disp) {
+    for (int_fast32_t y = 0, delta_pos = sample_disp * (1 + multi_ref_index); y < height; ++y, delta_pos += sample_disp) {
      int_fast32_t delta_int = delta_pos >> 5;
      int_fast32_t delta_fract = delta_pos & (32 - 1);

@ -237,6 +259,7 @@ static void uvg_angular_pred_generic(
          int filter_threshold = uvg_intra_hor_ver_dist_thres[log2_width];
          int dist_from_vert_or_hor = MIN(abs((int32_t)pred_mode - 50), abs((int32_t)pred_mode - 18));
          if (dist_from_vert_or_hor > filter_threshold) {
+            // ISP_TODO: these are introduced in the beginning of this function or am I missing something?
            static const int16_t modedisp2sampledisp[32] = { 0,    1,    2,    3,    4,    6,     8,   10,   12,   14,   16,   18,   20,   23,   26,   29,   32,   35,   39,  45,  51,  57,  64,  73,  86, 102, 128, 171, 256, 341, 512, 1024 };
            const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode;
            const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
@ -291,7 +314,7 @@ static void uvg_angular_pred_generic(
        }
      }
      if(PDPC_filter) {
-        int       inv_angle_sum = 256;
+        int inv_angle_sum = 256;
        for (int x = 0; x < MIN(3 << scale, width); x++) {
          inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)];

@ -336,31 +359,54 @@ static void uvg_angular_pred_generic(
  }
  else {
    // Mode is horizontal or vertical, just copy the pixels.
+    
+    // Do not apply PDPC if multi ref line index is other than 0
+    // ISP_TODO: do not do PDPC if block is in BDPCM mode
+    bool do_pdpc = (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0);

-    // TODO: update outer loop to use height instead of width
-    for (int_fast32_t y = 0; y < width; ++y) {
-      for (int_fast32_t x = 0; x < width; ++x) {
-        dst[y * width + x] = ref_main[x + 1];
-      }
-      // Do not apply PDPC if multi ref line index is other than 0
-      if ((width >= 4 || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) {
-        int scale = (log2_width + log2_width - 2) >> 2;
-        const uvg_pixel top_left = ref_main[0];
+    if (do_pdpc) {
+      int scale = (log2_width + log2_height - 2) >> 2;
+      const uvg_pixel top_left = ref_main[0];
+      for (int_fast32_t y = 0; y < height; ++y) {
+        memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel));
        const uvg_pixel left = ref_side[1 + y];
-        for (int i = 0; i < MIN(3 << scale, width); i++) {
-          const int wL = 32 >> (2 * i >> scale);
-          const uvg_pixel val = dst[y * width + i];
-          dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
+        for (int_fast32_t x = 0; x < MIN(3 << scale, width); ++x) {
+          const int wL = 32 >> (2 * x >> scale);
+          const uvg_pixel val = dst[y * width + x];
+          dst[y * width + x] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
        }
      }
+    } else {
+      for (int_fast32_t y = 0; y < height; ++y) {
+        memcpy(&dst[y * width], &ref_main[1], width * sizeof(uvg_pixel));
+      }
    }
+    // ISP_TODO: there is no reason to run these loops AND then check if PDPC is applied. Do the check first and then run either the normal or PDPC loops
+
+    //for (int_fast32_t y = 0; y < height; ++y) {
+    //  for (int_fast32_t x = 0; x < width; ++x) {
+    //    dst[y * width + x] = ref_main[x + 1];
+    //  }
+    //  // Do not apply PDPC if multi ref line index is other than 0
+    //  // ISP_TODO: do not do PDPC if block is in BDPCM mode
+    //  if (((width >= 4 && height >= 4) || channel_type != 0) && sample_disp >= 0 && multi_ref_index == 0) {
+    //    int scale = (log2_width + log2_height - 2) >> 2;
+    //    const uvg_pixel top_left = ref_main[0];
+    //    const uvg_pixel left = ref_side[1 + y];
+    //    for (int i = 0; i < MIN(3 << scale, width); i++) { // ISP_TODO: is one loop enough for PDPC?
+    //      const int wL = 32 >> (2 * i >> scale);
+    //      const uvg_pixel val = dst[y * width + i];
+    //      dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
+    //    }
+    //  }
+    //}
  }

  // Flip the block if this is was a horizontal mode.
  if (!vertical_mode) {
-    for (int_fast32_t y = 0; y < width - 1; ++y) {
+    for (int_fast32_t y = 0; y < height - 1; ++y) {
      for (int_fast32_t x = y + 1; x < width; ++x) {
-        SWAP(dst[y * width + x], dst[x * width + y], uvg_pixel);
+        SWAP(dst[y * width + x], dst[x * height + y], uvg_pixel);
      }
    }
  }
@ -369,23 +415,31 @@ static void uvg_angular_pred_generic(

 /**
 * \brief Generate planar prediction.
- * \param log2_width    Log2 of width, range 2..5.
+ * \param cu_loc        CU location and size data.
+ * \param color         Color channel.
 * \param in_ref_above  Pointer to -1 index of above reference, length=width*2+1.
 * \param in_ref_left   Pointer to -1 index of left reference, length=width*2+1.
 * \param dst           Buffer of size width*width.
 */
 static void uvg_intra_pred_planar_generic(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  color_t color,
  const uvg_pixel *const ref_top,
  const uvg_pixel *const ref_left,
  uvg_pixel *const dst)
 {
-  // TODO: Add height
-  assert(log2_width >= 2 && log2_width <= 5);
+  const int width = color == COLOR_Y ? cu_loc->width : cu_loc->chroma_width;
+  const int height = color == COLOR_Y ? cu_loc->height : cu_loc->chroma_height;
+  const int log2_width = uvg_g_convert_to_bit[width] + 2;
+  const int log2_height = uvg_g_convert_to_bit[height] + 2;
+
+  const int offset = 1 << (log2_width + log2_height);
+  const int final_shift = 1 + log2_width + log2_height;
+  
+  assert((log2_width >= 2 && log2_width <= 5) && (log2_height >= 2 && log2_height <= 5));

-  const int_fast8_t width = 1 << log2_width;
  const uvg_pixel top_right = ref_top[width + 1];
-  const uvg_pixel bottom_left = ref_left[width + 1];
+  const uvg_pixel bottom_left = ref_left[height + 1];

 #if 0
  // Unoptimized version for reference.
@ -397,18 +451,27 @@ static void uvg_intra_pred_planar_generic(
    }
  }
 #else
-  int_fast16_t top[32];
+  // TODO: get rid of magic numbers. Make a define for this
+  int_fast16_t top[64];
+  int_fast16_t bottom[64];
+  int_fast16_t left[64];
+  int_fast16_t right[64];
  for (int i = 0; i < width; ++i) {
-    top[i] = ref_top[i + 1] << log2_width;
+    bottom[i] = bottom_left - ref_top[i + 1];
+    top[i] = ref_top[i + 1] << log2_height;
  }

-  for (int y = 0; y < width; ++y) {
-    int_fast16_t hor = (ref_left[y + 1] << log2_width) + width;
+  for (int j = 0; j < height; ++j) {
+    right[j] = top_right - ref_left[j + 1];
+    left[j] = ref_left[j + 1] << log2_width;
+  }
+
+  for (int y = 0; y < height; ++y) {
+    int_fast16_t hor = left[y];
    for (int x = 0; x < width; ++x) {
-      hor += top_right - ref_left[y + 1];
-      top[x] += bottom_left - ref_top[x + 1];
-      dst[y * width + x] = (hor + top[x]) >> (log2_width + 1);
-      //
+      hor += right[y];
+      top[x] += bottom[x];
+      dst[y * width + x] = ((hor << log2_height) + (top[x] << log2_width) + offset) >> final_shift;
    }
  }
 #endif
--- a/src/strategies/strategies-intra.h
+++ b/src/strategies/strategies-intra.h
@ -38,13 +38,14 @@
 * Interface for intra prediction functions.
 */

+#include "cu.h"
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"
 #include "uvg266.h"


 typedef void (angular_pred_func)(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
  const int_fast8_t intra_mode,
  const int_fast8_t channel_type,
  const uvg_pixel *const in_ref_above,
@ -53,7 +54,8 @@ typedef void (angular_pred_func)(
  const uint8_t multi_ref_idx);

 typedef void (intra_pred_planar_func)(
-  const int_fast8_t log2_width,
+  const cu_loc_t* const cu_loc,
+  color_t color,
  const uvg_pixel *const ref_top,
  const uvg_pixel *const ref_left,
  uvg_pixel *const dst);