diff --git a/src/cu.h b/src/cu.h
index 4b68bca4..585f823b 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -106,10 +106,15 @@ typedef struct {
 
 cu_array_t * kvz_cu_array_alloc(int width_in_scu, int height_in_scu);
 int kvz_cu_array_free(cu_array_t *cua);
-  
 
-#define SUB_SCU_BIT_MASK (64 - 1)
-#define SUB_SCU(xy) (xy & SUB_SCU_BIT_MASK)
+/**
+ * \brief Return the log2(LCU_WIDTH) lowest-order bits of the pixel coordinate.
+ *
+ * These bits (6 of them when LCU_WIDTH is 64) give the distance from the left
+ * or top edge of the containing LCU.
+ */
+#define SUB_SCU(xy) ((xy) & (LCU_WIDTH - 1))
+
 #define LCU_CU_WIDTH 8
 #define LCU_T_CU_WIDTH 9
 #define LCU_CU_OFFSET 10
@@ -153,10 +158,66 @@ typedef struct {
    * - Left reference CUs on column 0.
    * - All of LCUs CUs on 1:9, 1:9.
    * - Top right reference CU on the last slot.
+   *
+   \verbatim
+
+      .-- left reference CUs
+      v
+       0 |  1  2  3  4  5  6  7  8 | 81 <-- top reference CUs
+     ----+-------------------------+----
+       9 | 10 11 12 13 14 15 16 17 |
+      18 | 19 20 21 22 23 24 25 26 <-- this LCU
+      27 | 28 29 30 31 32 33 34 35 |
+      36 | 37 38 39 40 41 42 43 44 |
+      45 | 46 47 48 49 50 51 52 53 |
+      54 | 55 56 57 58 59 60 61 62 |
+      63 | 64 65 66 67 68 69 70 71 |
+      72 | 73 74 75 76 77 78 79 80 |
+     ----+-------------------------+----
+
+   \endverbatim
    */
   cu_info_t cu[9*9+1];
 } lcu_t;
 
+/**
+ * \brief Return pointer to a given CU.
+ *
+ * \param lcu   pointer to the containing LCU
+ * \param x_cu  x-index of the CU
+ * \param y_cu  y-index of the CU
+ * \return      pointer to the CU
+ */
+#define LCU_GET_CU(lcu, x_cu, y_cu) \
+  (&(lcu)->cu[LCU_CU_OFFSET + (x_cu) + (y_cu) * LCU_T_CU_WIDTH])
+
+/**
+ * \brief Return pointer to the top right reference CU.
+ */
+#define LCU_GET_TOP_RIGHT_CU(lcu) \
+  (&(lcu)->cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH])
+
+/**
+ * \brief Return pointer to the CU containing a given pixel.
+ *
+ * \param lcu   pointer to the containing LCU
+ * \param x_px  x-coordinate relative to the upper left corner of the LCU
+ * \param y_px  y-coordinate relative to the upper left corner of the LCU
+ * \return      pointer to the CU at coordinates (x_px, y_px)
+ */
+#define LCU_GET_CU_AT_PX(lcu, x_px, y_px) LCU_GET_CU(lcu, (x_px) >> 3, (y_px) >> 3)
+
+/**
+ * \brief Return pointer to a CU relative to the given CU.
+ *
+ * \param cu_array  pointer to a CU in the array at some location (x, y)
+ * \param x_offs    x-offset
+ * \param y_offs    y-offset
+ * \return          pointer to the CU at (x + x_offs, y + y_offs)
+ */
+#define CU_GET_CU(cu_array, x_offs, y_offs) \
+  (&(cu_array)[(x_offs) + (y_offs) * LCU_T_CU_WIDTH])
+
 #define CHECKPOINT_LCU(prefix_str, lcu) do { \
   CHECKPOINT_CU(prefix_str " cu[0]", (lcu).cu[0]); \
   CHECKPOINT_CU(prefix_str " cu[1]", (lcu).cu[1]); \
diff --git a/src/filter.c b/src/filter.c
index 465e3cce..c223e8b6 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -83,11 +83,16 @@ const int8_t kvz_g_chroma_filter[8][4] =
 /**
  * \brief
  */
-INLINE void kvz_filter_deblock_luma(const encoder_control_t * const encoder, kvz_pixel *src, int32_t offset,
-                                int32_t tc, int8_t sw,
-                                int8_t part_P_nofilter, int8_t part_Q_nofilter,
-                                int32_t thr_cut,
-                                int8_t filter_second_P, int8_t filter_second_Q)
+static INLINE void kvz_filter_deblock_luma(const encoder_control_t * const encoder,
+                                           kvz_pixel *src,
+                                           int32_t offset,
+                                           int32_t tc,
+                                           int8_t sw,
+                                           int8_t part_P_nofilter,
+                                           int8_t part_Q_nofilter,
+                                           int32_t thr_cut,
+                                           int8_t filter_second_P,
+                                           int8_t filter_second_Q)
 {
   int32_t delta;
 
@@ -143,8 +148,12 @@ INLINE void kvz_filter_deblock_luma(const encoder_control_t * const encoder, kvz
 /**
  * \brief
  */
-INLINE void kvz_filter_deblock_chroma(const encoder_control_t * const encoder, kvz_pixel *src, int32_t offset, int32_t tc,
-                                  int8_t part_P_nofilter, int8_t part_Q_nofilter)
+static INLINE void kvz_filter_deblock_chroma(const encoder_control_t * const encoder,
+                                             kvz_pixel *src,
+                                             int32_t offset,
+                                             int32_t tc,
+                                             int8_t part_P_nofilter,
+                                             int8_t part_Q_nofilter)
 {
   int32_t delta;
   int16_t m2 = src[-offset * 2];
@@ -161,72 +170,119 @@ INLINE void kvz_filter_deblock_chroma(const encoder_control_t * const encoder, k
   }
 }
 
+
 /**
- * \brief
+ * \brief Check whether an edge is a TU boundary.
+ *
+ * \param state   encoder state
+ * \param x       x-coordinate of the scu in pixels
+ * \param y       y-coordinate of the scu in pixels
+ * \param dir     direction of the edge to check
+ * \return        true, if the edge is a TU boundary, otherwise false
  */
-void kvz_filter_deblock_edge_luma(encoder_state_t * const state,
-                              int32_t xpos, int32_t ypos,
-                              int8_t depth, int8_t dir)
+static bool is_tu_boundary(const encoder_state_t *const state,
+                           int32_t x,
+                           int32_t y,
+                           edge_dir dir)
+{
+  const cu_info_t *const scu = kvz_videoframe_get_cu(state->tile->frame,
+                                                     x >> MIN_SIZE,
+                                                     y >> MIN_SIZE);
+  const int tu_width = LCU_WIDTH >> scu->tr_depth;
+
+  if (dir == EDGE_HOR) {
+    return (y & (tu_width - 1)) == 0;
+  } else {
+    return (x & (tu_width - 1)) == 0;
+  }
+}
+
+
+/**
+ * \brief Check whether an edge is aligned on a 8x8 grid.
+ *
+ * \param x     x-coordinate of the edge
+ * \param y     y-coordinate of the edge
+ * \param dir   direction of the edge
+ * \return      true, if the edge is aligned on a 8x8 grid, otherwise false
+ */
+static bool is_on_8x8_grid(int x, int y, edge_dir dir)
+{
+  if (dir == EDGE_HOR) {
+    return (y & 7) == 0;
+  } else {
+    return (x & 7) == 0;
+  }
+}
+
+/**
+ * \brief Apply the deblocking filter to luma pixels on a single edge.
+ *
+ * The caller should check that the edge is a TU boundary or a PU boundary.
+ *
+ \verbatim
+
+         .-- filter this edge if dir == EDGE_HOR
+         v
+     +--------+
+     |o <-- pixel at (x, y)
+     |        |
+     |<-- filter this edge if dir == EDGE_VER
+     |        |
+     +--------+
+
+ \endverbatim
+ *
+ * \param state     encoder state
+ * \param x         x-coordinate in pixels (see above)
+ * \param y         y-coordinate in pixels (see above)
+ * \param length    length of the edge in pixels
+ * \param dir       direction of the edge to filter
+ */
+static void filter_deblock_edge_luma(encoder_state_t * const state,
+                                     int32_t x,
+                                     int32_t y,
+                                     int32_t length,
+                                     edge_dir dir)
 {
   videoframe_t * const frame = state->tile->frame;
   const encoder_control_t * const encoder = state->encoder_control;
   
-  cu_info_t *cu_q = kvz_videoframe_get_cu(frame, xpos >> MIN_SIZE, ypos >> MIN_SIZE);
-
-  {
-    // Return if called with a coordinate which is not at CU or TU boundary.
-    // TODO: Add handling for asymmetric inter CU boundaries which do not coincide
-    // with transform boundaries.
-    const int tu_width = LCU_WIDTH >> cu_q->tr_depth;
-    if (dir == EDGE_HOR && (ypos & (tu_width - 1))) return;
-    if (dir == EDGE_VER && (xpos & (tu_width - 1))) return;
-  }
+  cu_info_t *cu_q = kvz_videoframe_get_cu(frame, x >> MIN_SIZE, y >> MIN_SIZE);
 
   {
     int32_t stride = frame->rec->stride;
-    int32_t offset = stride;
     int32_t beta_offset_div2 = encoder->beta_offset_div2;
     int32_t tc_offset_div2   = encoder->tc_offset_div2;
     // TODO: support 10+bits
-    kvz_pixel *orig_src = &frame->rec->y[xpos + ypos*stride];
+    kvz_pixel *orig_src = &frame->rec->y[x + y*stride];
     kvz_pixel *src = orig_src;
-    int32_t step = 1;
     cu_info_t *cu_p = NULL;
-    int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE;
-    int8_t strength = 0;
+    int16_t x_cu = x >> MIN_SIZE;
+    int16_t y_cu = y >> MIN_SIZE;
 
+    int8_t strength = 0;
     int32_t qp              = state->global->QP;
     int32_t bitdepth_scale  = 1 << (encoder->bitdepth - 8);
     int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
     int32_t beta            = kvz_g_beta_table_8x8[b_index] * bitdepth_scale;
     int32_t side_threshold  = (beta + (beta >>1 )) >> 3;
-    uint32_t blocks_in_part = (LCU_WIDTH >> depth) / 4;
-    uint32_t block_idx;
-    int32_t tc_index,tc,thr_cut;
+    int32_t tc_index;
+    int32_t tc;
+    int32_t thr_cut;
 
-    if (dir == EDGE_VER) {
-      offset = 1;
-      step = stride;
-    }
+    uint32_t num_4px_parts  = length / 4;
+
+    const int32_t offset = (dir == EDGE_HOR) ? stride :      1;
+    const int32_t step   = (dir == EDGE_HOR) ?      1 : stride;
 
     // TODO: add CU based QP calculation
 
     // For each 4-pixel part in the edge
-    for (block_idx = 0; block_idx < blocks_in_part; ++block_idx) {
+    for (uint32_t block_idx = 0; block_idx < num_4px_parts; ++block_idx) {
       int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d;
 
       {
-        vector2d_t px = {
-          (dir == EDGE_HOR ? xpos + block_idx * 4 : xpos),
-          (dir == EDGE_VER ? ypos + block_idx * 4 : ypos)
-        };
-
-        // Don't deblock the last 4x4 block of the LCU. This will be deblocked
-        // when processing the next LCU.
-        if (block_idx > 0 && dir == EDGE_HOR && (px.x + 4) % 64 == 0 && (px.x + 4 != frame->width)) {
-          continue;
-        }
-
         // CU in the side we are filtering, update every 8-pixels
         cu_p = kvz_videoframe_get_cu(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? block_idx>>1 : 0));
 
@@ -341,53 +397,60 @@ void kvz_filter_deblock_edge_luma(encoder_state_t * const state,
                     useStrongFiltering(offset, 2*d3, (src+step*(block_idx*4+3)));
 
         // Filter four rows/columns
-        kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + 0), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q);
-        kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + 1), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q);
-        kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + 2), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q);
-        kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + 3), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q);
+        for (int i = 0; i < 4; i++) {
+          kvz_filter_deblock_luma(encoder, src + step * (4*block_idx + i), offset, tc, sw, 0, 0, thr_cut, filter_P, filter_Q);
+        }
       }
     }
   }
 }
 
 /**
- * \brief
+ * \brief Apply the deblocking filter to chroma pixels on a single edge.
+ *
+ * The caller should check that the edge is a TU boundary or a PU boundary.
+ *
+ \verbatim
+
+         .-- filter this edge if dir == EDGE_HOR
+         v
+     +--------+
+     |o <-- pixel at (x, y)
+     |        |
+     |<-- filter this edge if dir == EDGE_VER
+     |        |
+     +--------+
+
+ \endverbatim
+ *
+ * \param state     encoder state
+ * \param x         x-coordinate in chroma pixels (see above)
+ * \param y         y-coordinate in chroma pixels (see above)
+ * \param length    length of the edge in chroma pixels
+ * \param dir       direction of the edge to filter
  */
-void kvz_filter_deblock_edge_chroma(encoder_state_t * const state,
-                                int32_t x, int32_t y,
-                                int8_t depth, int8_t dir)
+static void filter_deblock_edge_chroma(encoder_state_t * const state,
+                                       int32_t x,
+                                       int32_t y,
+                                       int32_t length,
+                                       edge_dir dir)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
   const cu_info_t *cu_q = kvz_videoframe_get_cu_const(frame, x >> (MIN_SIZE - 1), y >> (MIN_SIZE - 1));
-  
-  // Chroma edges that do not lay on a 8x8 grid are not deblocked.
-  if (depth >= MAX_DEPTH) {
-    if (dir == EDGE_HOR && (y & (8 - 1))) return;
-    if (dir == EDGE_VER && (x & (8 - 1))) return;
-  }
-
-  {
-    // Return if called with a coordinate which is not at CU or TU boundary.
-    // TODO: Add handling for asymmetric inter CU boundaries which do not coincide
-    // with transform boundaries.
-    const int tu_width = (LCU_WIDTH / 2) >> cu_q->tr_depth;
-    if (dir == EDGE_HOR && (y & (tu_width - 1))) return;
-    if (dir == EDGE_VER && (x & (tu_width - 1))) return;
-  }
 
   // For each subpart
   {
     int32_t stride = frame->rec->stride >> 1;
     int32_t tc_offset_div2 = encoder->tc_offset_div2;
     // TODO: support 10+bits
-    kvz_pixel *src_u = &frame->rec->u[x + y*stride];
-    kvz_pixel *src_v = &frame->rec->v[x + y*stride];
-    // Init offset and step to EDGE_HOR
-    int32_t offset = stride;
-    int32_t step = 1;
+    kvz_pixel *src[] = {
+      &frame->rec->u[x + y*stride],
+      &frame->rec->v[x + y*stride],
+    };
     const cu_info_t *cu_p = NULL;
-    int16_t x_cu = x>>(MIN_SIZE-1),y_cu = y>>(MIN_SIZE-1);
+    int16_t x_cu = x >> (MIN_SIZE-1);
+    int16_t y_cu = y >> (MIN_SIZE-1);
     int8_t strength = 2;
 
     int32_t QP             = kvz_g_chroma_scale[state->global->QP];
@@ -395,42 +458,22 @@ void kvz_filter_deblock_edge_chroma(encoder_state_t * const state,
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = kvz_g_tc_table_8x8[TC_index]*bitdepth_scale;
 
-    // Special handling for depth 4. It's meaning is that we want to bypass
-    // last block in LCU check in order to deblock just that block.
-    uint32_t blocks_in_part= (LCU_WIDTH>>(depth == 4 ? depth : depth + 1)) / 4;
-    uint32_t blk_idx;
+    const uint32_t num_4px_parts = length / 4;
 
-    if(dir == EDGE_VER) {
-      offset = 1;
-      step = stride;
-    }
+    const int32_t offset = (dir == EDGE_HOR) ? stride :      1;
+    const int32_t step   = (dir == EDGE_HOR) ?      1 : stride;
 
-    for (blk_idx = 0; blk_idx < blocks_in_part; ++blk_idx)
+    for (uint32_t blk_idx = 0; blk_idx < num_4px_parts; ++blk_idx)
     {
-      vector2d_t px = {
-        (dir == EDGE_HOR ? x + blk_idx * 4 : x),
-        (dir == EDGE_VER ? y + blk_idx * 4 : y)
-      };
       cu_p = kvz_videoframe_get_cu_const(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? blk_idx : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? blk_idx : 0));
 
-      // Don't deblock the last 4x4 block of the LCU. This will be deblocked
-      // when processing the next LCU.
-      if (depth != 4 && dir == EDGE_HOR && (px.x + 4) % 32 == 0 && (px.x + 4 != frame->width / 2)) {
-        continue;
-      }
-
       // Only filter when strenght == 2 (one of the blocks is intra coded)
       if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
-        // Chroma U
-        kvz_filter_deblock_chroma(encoder, src_u + step * (4*blk_idx + 0), offset, Tc, 0, 0);
-        kvz_filter_deblock_chroma(encoder, src_u + step * (4*blk_idx + 1), offset, Tc, 0, 0);
-        kvz_filter_deblock_chroma(encoder, src_u + step * (4*blk_idx + 2), offset, Tc, 0, 0);
-        kvz_filter_deblock_chroma(encoder, src_u + step * (4*blk_idx + 3), offset, Tc, 0, 0);
-        // Chroma V
-        kvz_filter_deblock_chroma(encoder, src_v + step * (4*blk_idx + 0), offset, Tc, 0, 0);
-        kvz_filter_deblock_chroma(encoder, src_v + step * (4*blk_idx + 1), offset, Tc, 0, 0);
-        kvz_filter_deblock_chroma(encoder, src_v + step * (4*blk_idx + 2), offset, Tc, 0, 0);
-        kvz_filter_deblock_chroma(encoder, src_v + step * (4*blk_idx + 3), offset, Tc, 0, 0);
+        for (int component = 0; component < 2; component++) {
+          for (int i = 0; i < 4; i++) {
+            kvz_filter_deblock_chroma(encoder, src[component] + step * (4*blk_idx + i), offset, Tc, 0, 0);
+          }
+        }
       }
     }
   }
@@ -438,88 +481,154 @@ void kvz_filter_deblock_edge_chroma(encoder_state_t * const state,
 
 /**
  * \brief function to split LCU into smaller CU blocks
- * \param encoder the encoder info structure
- * \param xCtb block x-position (as SCU)
- * \param yCtb block y-position (as SCU)
- * \param depth block depth
- * \param edge which edge we are filtering
  *
- * This function takes (SCU) block position as input and splits the block
- * until the coded block size has been achived. Calls luma and chroma filtering
- * functions for each coded CU size.
+ * \param state     encoder state
+ * \param x         block x-position in pixels
+ * \param y         block y-position in pixels
+ * \param depth     block depth
+ * \param dir       direction of the edges to filter
+ *
+ * Recursively traverse the CU/TU quadtree. At the lowest level, apply the
+ * deblocking filter to the left edge (when dir == EDGE_VER) or the top edge
+ * (when dir == EDGE_HOR) as needed. Both luma and chroma are filtered.
  */
-void kvz_filter_deblock_cu(encoder_state_t * const state, int32_t x, int32_t y, int8_t depth, int32_t edge)
+static void filter_deblock_cu(encoder_state_t * const state,
+                              int32_t x,
+                              int32_t y,
+                              int8_t depth,
+                              edge_dir dir)
 {
   const videoframe_t * const frame = state->tile->frame;
-  const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x, y);
-  uint8_t split_flag = (cur_cu->depth > depth) ? 1 : 0;
-  uint8_t tr_split = (cur_cu->tr_depth > depth) ? 1 : 0;
-  uint8_t border_x = (frame->width  < x*(LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth)) ? 1 : 0;
-  uint8_t border_y = (frame->height < y*(LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth)) ? 1 : 0;
-  uint8_t border_split_x = (frame->width  < ((x + 1) * (LCU_WIDTH >> MAX_DEPTH)) + (LCU_WIDTH >> (depth + 1))) ? 0 : 1;
-  uint8_t border_split_y = (frame->height < ((y + 1) * (LCU_WIDTH >> MAX_DEPTH)) + (LCU_WIDTH >> (depth + 1))) ? 0 : 1;
+  const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame,
+                                                        x >> MAX_DEPTH,
+                                                        y >> MAX_DEPTH);
 
-  uint8_t border = border_x | border_y; // are we in any border CU?
+  const int cu_width        = LCU_WIDTH >> depth;
+  const int half_cu_width   = cu_width >> 1;
+  const int scu_width       = LCU_WIDTH >> MAX_DEPTH;
+  const bool split_flag     = cur_cu->depth    > depth;
+  const bool tr_split       = cur_cu->tr_depth > depth;
+  const bool border_x       = frame->width  < x + cu_width;
+  const bool border_y       = frame->height < y + cu_width;
+  const bool border_split_x = frame->width  >= x + scu_width + half_cu_width;
+  const bool border_split_y = frame->height >= y + scu_width + half_cu_width;
+  const bool border         = border_x || border_y; // are we in any border CU?
 
   // split 64x64, on split flag and on border
   if (depth < MAX_DEPTH && (depth == 0 || split_flag || border || tr_split)) {
     // Split the four sub-blocks of this block recursively.
-    uint8_t change;
-    assert(depth >= 0);  // for clang-analyzer
-    change = 1 << (MAX_DEPTH - 1 - depth);
+    const int32_t x2 = x + half_cu_width;
+    const int32_t y2 = y + half_cu_width;
 
-    kvz_filter_deblock_cu(state, x, y, depth + 1, edge);
-    if(!border_x || border_split_x) {
-      kvz_filter_deblock_cu(state, x + change, y, depth + 1, edge);
+    filter_deblock_cu(state, x, y, depth + 1, dir);
+    if (!border_x || border_split_x) {
+      filter_deblock_cu(state, x2, y, depth + 1, dir);
     }
-    if(!border_y || border_split_y) {
-      kvz_filter_deblock_cu(state, x , y + change, depth + 1, edge);
+    if (!border_y || border_split_y) {
+      filter_deblock_cu(state, x, y2, depth + 1, dir);
     }
-    if((!border_x && !border_y) || (border_split_x && border_split_y)) {
-      kvz_filter_deblock_cu(state, x + change, y + change, depth + 1, edge);
+    if (!border || (border_split_x && border_split_y)) {
+      filter_deblock_cu(state, x2, y2, depth + 1, dir);
     }
     return;
   }
 
   // no filtering on borders (where filter would use pixels outside the picture)
-  if ((x == 0 && edge == EDGE_VER) || (y == 0 && edge == EDGE_HOR)) return;
+  if ((x == 0 && dir == EDGE_VER) || (y == 0 && dir == EDGE_HOR)) return;
 
   // do the filtering for block edge
-  kvz_filter_deblock_edge_luma(state,   x*(LCU_WIDTH >> MAX_DEPTH),       y*(LCU_WIDTH >> MAX_DEPTH),       depth, edge);
-  kvz_filter_deblock_edge_chroma(state, x*(LCU_WIDTH >> (MAX_DEPTH + 1)), y*(LCU_WIDTH >> (MAX_DEPTH + 1)), depth, edge);
+  if (is_tu_boundary(state, x, y, dir)) {
+    // Length of luma and chroma edges.
+    int32_t length;
+    int32_t length_c;
+
+    const int32_t x_right             = x + cu_width;
+    const bool rightmost_4px_of_lcu   = x_right % LCU_WIDTH == 0;
+    const bool rightmost_4px_of_frame = x_right == frame->width;
+
+    if (dir == EDGE_HOR &&
+        rightmost_4px_of_lcu &&
+        !rightmost_4px_of_frame) {
+      // The last 4 pixels will be deblocked when processing the next LCU.
+      length   = cu_width - 4;
+      length_c = half_cu_width - 4;
+
+    } else {
+      length   = cu_width;
+      length_c = half_cu_width;
+    }
+
+    filter_deblock_edge_luma(state, x, y, length, dir);
+
+    // Chroma pixel coordinates.
+    const int32_t x_c = x >> 1;
+    const int32_t y_c = y >> 1;
+    if (is_on_8x8_grid(x_c, y_c, dir)) {
+      filter_deblock_edge_chroma(state, x_c, y_c, length_c, dir);
+    }
+  }
 }
 
 
 /**
  * \brief Deblock a single LCU without using data from right or down.
  *
- * Filter all the following edges:
- * - All edges within the LCU, except for the last 4 pixels on the right when
- *   using horizontal filtering.
- * - Left edge and top edge.
- * - After vertical filtering the left edge, filter the last 4 pixels of
- *   horizontal edges in the LCU to the left.
+ * Filter the following vertical edges (horizontal filtering):
+ *  1. The left edge of the LCU.
+ *  2. All vertical edges within the LCU.
+ *
+ * Filter the following horizontal edges (vertical filtering):
+ *  1. The rightmost 4 pixels of the top edge of the LCU to the left.
+ *  2. The rightmost 4 pixels of all horizontal edges within the LCU to the
+ *     left.
+ *  3. The top edge and all horizontal edges within the LCU, excluding the
+ *     rightmost 4 pixels. If the LCU is the rightmost LCU of the frame, the
+ *     last 4 pixels are also filtered.
+ *
+ * What is not filtered:
+ *  - The rightmost 4 pixels of the top edge and all horizontal edges within
+ *    the LCU, unless the LCU is the rightmost LCU of the frame.
+ *  - The bottom edge of the LCU.
+ *  - The right edge of the LCU.
+ *
+ * \param state   encoder state
+ * \param x_px    x-coordinate of the left edge of the LCU in pixels
+ * \param y_px    y-coordinate of the top edge of the LCU in pixels
  */
 void kvz_filter_deblock_lcu(encoder_state_t * const state, int x_px, int y_px)
 {
   const vector2d_t lcu = { x_px / LCU_WIDTH, y_px / LCU_WIDTH };
 
-  kvz_filter_deblock_cu(state, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_VER);
+  // The coordinates must be LCU-aligned; verify this before filtering anything.
+  assert(x_px == lcu.x * LCU_WIDTH);
+  assert(y_px == lcu.y * LCU_WIDTH);
+  filter_deblock_cu(state, x_px, y_px, 0, EDGE_VER);
 
   // Filter rightmost 4 pixels from last LCU now that they have been
   // finally deblocked vertically.
-  if (lcu.x > 0) {
-    int y;
-    for (y = 0; y < 64; y += 8) {
-      if (lcu.y + y == 0) continue;
-      kvz_filter_deblock_edge_luma(state, lcu.x * 64 - 4, lcu.y * 64 + y, 4, EDGE_HOR);
+  if (x_px > 0) {
+    // Luma
+    const int x = x_px - 4;
+    const int end = MIN(y_px + LCU_WIDTH, state->tile->frame->height);
+    for (int y = y_px; y < end; y += 8) {
+      // The top edge of the whole frame is not filtered.
+      if (y > 0 && is_tu_boundary(state, x, y, EDGE_HOR)) {
+        filter_deblock_edge_luma(state, x, y, 4, EDGE_HOR);
+      }
     }
-    for (y = 0; y < 32; y += 8) {
-      if (lcu.y + y == 0) continue;
-      kvz_filter_deblock_edge_chroma(state, lcu.x * 32 - 4, lcu.y * 32 + y, 4, EDGE_HOR);
+
+    // Chroma
+    const int x_px_c = x_px >> 1;
+    const int y_px_c = y_px >> 1;
+    const int x_c = x_px_c - 4;
+    const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1);
+    for (int y_c = y_px_c; y_c < end_c; y_c += 8) {
+      // The top edge of the whole frame is not filtered.
+      if (y_c > 0 && is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR)) {
+        filter_deblock_edge_chroma(state, x_c, y_c, 4, EDGE_HOR);
+      }
     }
   }
 
-  kvz_filter_deblock_cu(state, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_HOR);
+  filter_deblock_cu(state, x_px, y_px, 0, EDGE_HOR);
 }
-
diff --git a/src/filter.h b/src/filter.h
index fdf30356..50906441 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -31,30 +31,15 @@
 #include "encoderstate.h"
 
 
-//////////////////////////////////////////////////////////////////////////
-// FUNCTIONS
-// Deblocking
-void kvz_filter_deblock_cu(encoder_state_t *state, int32_t x_px, int32_t y_px,
-                       int8_t depth, int32_t edge);
-void kvz_filter_deblock_edge_luma(encoder_state_t *state,
-                              int32_t x_pos, int32_t y_pos,
-                              int8_t depth, int8_t dir);
-void kvz_filter_deblock_edge_chroma(encoder_state_t *state,
-                                int32_t xpos, int32_t ypos,
-                                int8_t depth, int8_t dir);
+/**
+ * \brief Edge direction.
+ */
+typedef enum edge_dir {
+  EDGE_VER = 0, // vertical
+  EDGE_HOR = 1, // horizontal
+} edge_dir;
+
+
 void kvz_filter_deblock_lcu(encoder_state_t *state, int x_px, int y_px);
-void kvz_filter_deblock_luma(const encoder_control_t * const encoder, kvz_pixel *src, int32_t offset, int32_t tc , int8_t sw,
-                         int8_t part_p_nofilter, int8_t part_q_nofilter,
-                         int32_t thr_cut,
-                         int8_t filter_second_p, int8_t filter_second_q);
-void kvz_filter_deblock_chroma(const encoder_control_t * const encoder, kvz_pixel *src, int32_t offset, int32_t tc,
-                           int8_t part_p_nofilter, int8_t part_q_nofilter);
-
-// SAO
-
-//////////////////////////////////////////////////////////////////////////
-// MACROS
-#define EDGE_VER 0
-#define EDGE_HOR 1
 
 #endif
diff --git a/src/inter.c b/src/inter.c
index cd26486f..e9c2d62f 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -395,15 +395,12 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, const kvz_p
  * \param cu coding unit to clear
  */
 static void inter_clear_cu_unused(cu_info_t* cu) {
-  if(!(cu->inter.mv_dir & 1)) {
-    cu->inter.mv[0][0] = 0;
-    cu->inter.mv[0][1] = 0;
-    cu->inter.mv_ref[0] = 255;
-  }
-  if(!(cu->inter.mv_dir & 2)) {
-    cu->inter.mv[1][0] = 0;
-    cu->inter.mv[1][1] = 0;
-    cu->inter.mv_ref[1] = 255;
+  for (unsigned i = 0; i < 2; ++i) {
+    if (cu->inter.mv_dir & (1 << i)) continue;
+
+    cu->inter.mv[i][0] = 0;
+    cu->inter.mv[i][1] = 0;
+    cu->inter.mv_ref[i] = 255;
   }
 }
 
@@ -433,17 +430,16 @@ void kvz_inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth,
   |A1|_________|
   |A0|
   */
-  int32_t x_cu = (x & (LCU_WIDTH - 1)) >> MAX_DEPTH; //!< coordinates from top-left of this LCU
-  int32_t y_cu = (y & (LCU_WIDTH - 1)) >> MAX_DEPTH;
-  cu_info_t* cu = &lcu->cu[LCU_CU_OFFSET];
+  int32_t x_cu = SUB_SCU(x) >> MAX_DEPTH; //!< coordinates from top-left of this LCU
+  int32_t y_cu = SUB_SCU(y) >> MAX_DEPTH;
   // A0 and A1 availability testing
   if (x != 0) {
-    *a1 = &cu[x_cu - 1 + (y_cu + cur_block_in_scu - 1) * LCU_T_CU_WIDTH];
+    *a1 = LCU_GET_CU(lcu, x_cu - 1, y_cu + cur_block_in_scu - 1);
     if (!(*a1)->coded) *a1 = NULL;
     if(*a1) inter_clear_cu_unused(*a1);
 
     if (y_cu + cur_block_in_scu < LCU_WIDTH>>3) {
-      *a0 = &cu[x_cu - 1 + (y_cu + cur_block_in_scu) * LCU_T_CU_WIDTH];
+      *a0 = LCU_GET_CU(lcu, x_cu - 1, y_cu + cur_block_in_scu);
       if (!(*a0)->coded) *a0 = NULL;
     }
     if(*a0) inter_clear_cu_unused(*a0);
@@ -452,21 +448,21 @@ void kvz_inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth,
   // B0, B1 and B2 availability testing
   if (y != 0) {
     if (x_cu + cur_block_in_scu < LCU_WIDTH>>3) {
-      *b0 = &cu[x_cu + cur_block_in_scu + (y_cu - 1) * LCU_T_CU_WIDTH];
+      *b0 = LCU_GET_CU(lcu, x_cu + cur_block_in_scu, y_cu - 1);
       if (!(*b0)->coded) *b0 = NULL;
     } else if(y_cu == 0) {
-      // Special case, top-right cu from LCU is the last in lcu->cu array
-      *b0 = &lcu->cu[LCU_T_CU_WIDTH*LCU_T_CU_WIDTH];
+      // Special case, top-right CU
+      *b0 = LCU_GET_TOP_RIGHT_CU(lcu);
       if (!(*b0)->coded) *b0 = NULL;
     }
     if(*b0) inter_clear_cu_unused(*b0);
 
-    *b1 = &cu[x_cu + cur_block_in_scu - 1 + (y_cu - 1) * LCU_T_CU_WIDTH];
+    *b1 = LCU_GET_CU(lcu, x_cu + cur_block_in_scu - 1, y_cu - 1);
     if (!(*b1)->coded) *b1 = NULL;
     if(*b1) inter_clear_cu_unused(*b1);
 
     if (x != 0) {
-      *b2 = &cu[x_cu - 1 + (y_cu - 1) * LCU_T_CU_WIDTH];
+      *b2 = LCU_GET_CU(lcu, x_cu - 1, y_cu - 1);
       if(!(*b2)->coded) *b2 = NULL;
     }
     if(*b2) inter_clear_cu_unused(*b2);
diff --git a/src/intra.c b/src/intra.c
index 0e542890..87a6f52f 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -485,9 +485,9 @@ void kvz_intra_recon_lcu_luma(
   cu_info_t *cur_cu,
   lcu_t *lcu)
 {
-  const vector2d_t lcu_px = { x & 0x3f, y & 0x3f };
+  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
   if (cur_cu == NULL) {
-    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
   const int8_t width = LCU_WIDTH >> depth;
 
@@ -500,9 +500,9 @@ void kvz_intra_recon_lcu_luma(
     kvz_intra_recon_lcu_luma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
 
     if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + (lcu_px.y >> 3)        *LCU_T_CU_WIDTH];
-      cu_info_t *cu_b = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
-      cu_info_t *cu_c = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
+      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
+      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
+      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
       if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) {
         cbf_set(&cur_cu->cbf.y, depth);
       }
@@ -537,12 +537,12 @@ void kvz_intra_recon_lcu_chroma(
   cu_info_t *cur_cu,
   lcu_t *lcu)
 {
-  const vector2d_t lcu_px = { x & 0x3f, y & 0x3f };
+  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
   const int8_t width = LCU_WIDTH >> depth;
   const int8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
 
   if (cur_cu == NULL) {
-    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
 
   if (depth == 0 || cur_cu->tr_depth > depth) {
@@ -554,9 +554,9 @@ void kvz_intra_recon_lcu_chroma(
     kvz_intra_recon_lcu_chroma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
 
     if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + (lcu_px.y >> 3)        *LCU_T_CU_WIDTH];
-      cu_info_t *cu_b = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
-      cu_info_t *cu_c = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
+      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
+      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
+      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
       if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) {
         cbf_set(&cur_cu->cbf.u, depth);
       }
diff --git a/src/search.c b/src/search.c
index ac349cb3..f21dd8ab 100644
--- a/src/search.c
+++ b/src/search.c
@@ -82,8 +82,8 @@ static void work_tree_copy_up(int x_px, int y_px, int depth, lcu_t work_tree[MAX
     int x, y;
     for (y = y_cu; y < y_cu + width_cu; ++y) {
       for (x = x_cu; x < x_cu + width_cu; ++x) {
-        const cu_info_t *from_cu = &work_tree[depth + 1].cu[LCU_CU_OFFSET + x + y * LCU_T_CU_WIDTH];
-        cu_info_t *to_cu = &work_tree[depth].cu[LCU_CU_OFFSET + x + y * LCU_T_CU_WIDTH];
+        const cu_info_t *from_cu = LCU_GET_CU(&work_tree[depth + 1], x, y);
+        cu_info_t *to_cu = LCU_GET_CU(&work_tree[depth], x, y);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
     }
@@ -142,8 +142,8 @@ static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_tree[M
     int x, y;
     for (y = y_cu; y < y_cu + width_cu; ++y) {
       for (x = x_cu; x < x_cu + width_cu; ++x) {
-        const cu_info_t *from_cu = &work_tree[depth].cu[LCU_CU_OFFSET + x + y * LCU_T_CU_WIDTH];
-        cu_info_t *to_cu = &work_tree[d].cu[LCU_CU_OFFSET + x + y * LCU_T_CU_WIDTH];
+        const cu_info_t *from_cu = LCU_GET_CU(&work_tree[depth], x, y);
+        cu_info_t *to_cu = LCU_GET_CU(&work_tree[d], x, y);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
     }
@@ -173,16 +173,15 @@ static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_tree[M
 void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth)
 {
   const int width_cu = LCU_CU_WIDTH >> depth;
-  const vector2d_t lcu_cu = { (x_px & (LCU_WIDTH - 1)) / 8, (y_px & (LCU_WIDTH - 1)) / 8 };
-  cu_info_t *const cur_cu = &lcu->cu[lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH + LCU_CU_OFFSET];
+  const vector2d_t lcu_cu = { SUB_SCU(x_px) / 8, SUB_SCU(y_px) / 8 };
   int x, y;
 
   // Depth 4 doesn't go inside the loop. Set the top-left CU.
-  cur_cu->tr_depth = tr_depth;
+  LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y)->tr_depth = tr_depth;
 
   for (y = 0; y < width_cu; ++y) {
     for (x = 0; x < width_cu; ++x) {
-      cu_info_t *cu = &cur_cu[x + y * LCU_T_CU_WIDTH];
+      cu_info_t *cu = LCU_GET_CU(lcu, lcu_cu.x + x, lcu_cu.y + y);
       cu->tr_depth = tr_depth;
     }
   }
@@ -194,12 +193,11 @@ static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pr
   const int width_cu = LCU_CU_WIDTH >> depth;
   const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
   const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
-  cu_info_t *const lcu_cu = &lcu->cu[LCU_CU_OFFSET];
   int x, y;
 
   // NxN can only be applied to a single CU at a time.
   if (part_mode == SIZE_NxN) {
-    cu_info_t *cu = &lcu_cu[x_cu + y_cu * LCU_T_CU_WIDTH];
+    cu_info_t *cu = LCU_GET_CU(lcu, x_cu, y_cu);
     cu->depth = MAX_DEPTH;
     cu->type = CU_INTRA;
     cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode = pred_mode;
@@ -211,7 +209,7 @@ static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pr
   // Set mode in every CU covered by part_mode in this depth.
   for (y = y_cu; y < y_cu + width_cu; ++y) {
     for (x = x_cu; x < x_cu + width_cu; ++x) {
-      cu_info_t *cu = &lcu_cu[x + y * LCU_T_CU_WIDTH];
+      cu_info_t *cu = LCU_GET_CU(lcu, x, y);
       cu->depth = depth;
       cu->type = CU_INTRA;
       cu->intra[0].mode = pred_mode;
@@ -231,12 +229,11 @@ static void lcu_set_inter(lcu_t *lcu, int x_px, int y_px, int depth, cu_info_t *
   const int width_cu = LCU_CU_WIDTH >> depth;
   const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
   const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
-  cu_info_t *const lcu_cu = &lcu->cu[LCU_CU_OFFSET];
   int x, y;
   // Set mode in every CU covered by part_mode in this depth.
   for (y = y_cu; y < y_cu + width_cu; ++y) {
     for (x = x_cu; x < x_cu + width_cu; ++x) {
-      cu_info_t *cu = &lcu_cu[x + y * LCU_T_CU_WIDTH];
+      cu_info_t *cu = LCU_GET_CU(lcu, x, y);
       //Check if this could be moved inside the if
       cu->coded    = 1;
       if (cu != cur_cu) {
@@ -257,17 +254,16 @@ static void lcu_set_coeff(lcu_t *lcu, int x_px, int y_px, int depth, cu_info_t *
   const int width_cu = LCU_CU_WIDTH >> depth;
   const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH;
   const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH;
-  cu_info_t *const lcu_cu = &lcu->cu[LCU_CU_OFFSET];
   int x, y;
   int tr_split = cur_cu->tr_depth-cur_cu->depth;
 
   // Set coeff flags in every CU covered by part_mode in this depth.
   for (y = y_cu; y < y_cu + width_cu; ++y) {
     for (x = x_cu; x < x_cu + width_cu; ++x) {
-      cu_info_t *cu = &lcu_cu[x + y * LCU_T_CU_WIDTH];
+      cu_info_t *cu = LCU_GET_CU(lcu, x, y);
       // Use TU top-left CU to propagate coeff flags
       uint32_t mask = ~((width_cu>>tr_split)-1);
-      cu_info_t *cu_from = &lcu_cu[(x & mask) + (y & mask) * LCU_T_CU_WIDTH];
+      cu_info_t *cu_from = LCU_GET_CU(lcu, x & mask, y & mask);
       if (cu != cu_from) {
         // Chroma coeff data is not used, luma is needed for deblocking
         cu->cbf.y = cu_from->cbf.y;
@@ -295,7 +291,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
   const uint8_t pu_index = PU_INDEX(x_px / 4, y_px / 4);
 
   // cur_cu is used for TU parameters.
-  cu_info_t *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH];
+  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
 
   double coeff_bits = 0;
   double tr_tree_bits = 0;
@@ -368,7 +364,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
 {
   const vector2d_t lcu_px = { x_px / 2, y_px / 2 };
   const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
-  cu_info_t *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x / 4) + (lcu_px.y / 4)*LCU_T_CU_WIDTH];
+  cu_info_t *const tr_cu = LCU_GET_CU(lcu, lcu_px.x / 4, lcu_px.y / 4);
 
   double tr_tree_bits = 0;
   double coeff_bits = 0;
@@ -450,8 +446,8 @@ static double calc_mode_bits(const encoder_state_t *state,
   } else {
     int8_t candidate_modes[3];
     {
-      const cu_info_t *left_cu = ((x > 8) ? &cur_cu[-1] : NULL);
-      const cu_info_t *above_cu = ((y > 8) ? &cur_cu[-LCU_T_CU_WIDTH] : NULL);
+      const cu_info_t *left_cu  = ((x > 8) ? CU_GET_CU(cur_cu, -1,  0) : NULL);
+      const cu_info_t *above_cu = ((y > 8) ? CU_GET_CU(cur_cu,  0, -1) : NULL);
       kvz_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);
     }
 
@@ -467,10 +463,9 @@ static double calc_mode_bits(const encoder_state_t *state,
 
 static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth)
 {
-  vector2d_t lcu_cu = { (x & 0x3f) / 8, (y & 0x3f) / 8 };
-  const cu_info_t *cu_array = &(lcu)->cu[LCU_CU_OFFSET];
-  bool condA = x >= 8 && cu_array[(lcu_cu.x - 1) + lcu_cu.y * LCU_T_CU_WIDTH].depth > depth;
-  bool condL = y >= 8 && cu_array[lcu_cu.x + (lcu_cu.y - 1) * LCU_T_CU_WIDTH].depth > depth;
+  vector2d_t lcu_cu = { SUB_SCU(x) / 8, SUB_SCU(y) / 8 };  // LCU-relative position divided by 8 to get the CU index
+  bool condA = x >= 8 && LCU_GET_CU(lcu, lcu_cu.x - 1, lcu_cu.y    )->depth > depth;  // CU to the left (x - 1) has a greater depth
+  bool condL = y >= 8 && LCU_GET_CU(lcu, lcu_cu.x,     lcu_cu.y - 1)->depth > depth;  // CU above (y - 1) has a greater depth
   return condA + condL;
 }
 
@@ -494,7 +489,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
   lcu_t *const lcu = &work_tree[depth];
 
-  int x_local = (x&0x3f), y_local = (y&0x3f);
+  int x_local = SUB_SCU(x);
+  int y_local = SUB_SCU(y);
 #ifdef KVZ_DEBUG
   int debug_split = 0;
 #endif
@@ -506,7 +502,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     return 0;
   }
 
-  cur_cu = &(&work_tree[depth])->cu[LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH];
+  cur_cu = LCU_GET_CU_AT_PX(&work_tree[depth], x_local, y_local);
   // Assign correct depth
   cur_cu->depth = depth > MAX_DEPTH ? MAX_DEPTH : depth;
   cur_cu->tr_depth = depth > 0 ? depth : 1;
@@ -647,8 +643,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         && x + cu_width <= frame->width && y + cu_width <= frame->height)
     {
       vector2d_t lcu_cu = { x_local / 8, y_local / 8 };
-      cu_info_t *cu_array_d1 = &(&work_tree[depth + 1])->cu[LCU_CU_OFFSET];
-      cu_info_t *cu_d1 = &cu_array_d1[(lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH)];
+      cu_info_t *cu_d1 = LCU_GET_CU(&work_tree[depth + 1], lcu_cu.x, lcu_cu.y);
 
       // If the best CU in depth+1 is intra and the biggest it can be, try it.
       if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) {
@@ -715,16 +710,12 @@ static void init_lcu_t(const encoder_state_t * const state, const int x, const i
     const int x_cu = x >> MAX_DEPTH;
     const int y_cu = y >> MAX_DEPTH;
 
-    // Use top-left sub-cu of LCU as pointer to lcu->cu array to make things
-    // simpler.
-    cu_info_t *lcu_cu = &lcu->cu[LCU_CU_OFFSET];
-
     // Copy top CU row.
     if (y_cu > 0) {
       int i;
       for (i = 0; i < LCU_CU_WIDTH; ++i) {
         const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu + i, y_cu - 1);
-        cu_info_t *to_cu = &lcu_cu[i - LCU_T_CU_WIDTH];
+        cu_info_t *to_cu = LCU_GET_CU(lcu, i, -1);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
     }
@@ -733,21 +724,21 @@ static void init_lcu_t(const encoder_state_t * const state, const int x, const i
       int i;
       for (i = 0; i < LCU_CU_WIDTH; ++i) {
         const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu - 1, y_cu + i);
-        cu_info_t *to_cu = &lcu_cu[-1 + i * LCU_T_CU_WIDTH];
+        cu_info_t *to_cu = LCU_GET_CU(lcu, -1, i);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
     }
     // Copy top-left CU.
     if (x_cu > 0 && y_cu > 0) {
       const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu - 1, y_cu - 1);
-      cu_info_t *to_cu = &lcu_cu[-1 - LCU_T_CU_WIDTH];
+      cu_info_t *to_cu = LCU_GET_CU(lcu, -1, -1);
       memcpy(to_cu, from_cu, sizeof(*to_cu));
     }
 
     // Copy top-right CU.
     if (y_cu > 0 && x + LCU_WIDTH < frame->width) {
       const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu + LCU_CU_WIDTH, y_cu - 1);
-      cu_info_t *to_cu = &lcu->cu[LCU_T_CU_WIDTH*LCU_T_CU_WIDTH];
+      cu_info_t *to_cu = LCU_GET_TOP_RIGHT_CU(lcu);
       memcpy(to_cu, from_cu, sizeof(*to_cu));
     }
   }
@@ -806,14 +797,10 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int x_px, i
     const int y_cu = y_px >> MAX_DEPTH;
     videoframe_t * const frame = state->tile->frame;
 
-    // Use top-left sub-cu of LCU as pointer to lcu->cu array to make things
-    // simpler.
-    const cu_info_t *const lcu_cu = &lcu->cu[LCU_CU_OFFSET];
-
     int x, y;
     for (y = 0; y < LCU_CU_WIDTH; ++y) {
       for (x = 0; x < LCU_CU_WIDTH; ++x) {
-        const cu_info_t *from_cu = &lcu_cu[x + y * LCU_T_CU_WIDTH];
+        const cu_info_t *from_cu = LCU_GET_CU(lcu, x, y);
         cu_info_t *to_cu = kvz_videoframe_get_cu(frame, x_cu + x, y_cu + y);
         memcpy(to_cu, from_cu, sizeof(*to_cu));
       }
diff --git a/src/search_inter.c b/src/search_inter.c
index dd04259e..92ec6f13 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -951,6 +951,127 @@ static unsigned search_frac(const encoder_state_t * const state,
 }
 
 
+/**
+ * \brief Perform inter search for a single reference frame; cur_cu is updated only if the cost found is below cur_cu->inter.cost.
+ */
+static void search_cu_inter_ref(const encoder_state_t * const state,
+                                int x, int y, int depth,  // x, y are shifted right by 3 below to get CU coordinates
+                                lcu_t *lcu, cu_info_t *cur_cu,  // cur_cu receives the result
+                                int16_t mv_cand[2][2],  // passed to kvz_inter_get_mv_cand, then read as the two MV predictors
+                                inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                                int16_t num_cand,  // number of merge_cand entries that are examined
+                                unsigned ref_idx,  // index into state->global->ref->images and refmap
+                                uint32_t(*get_mvd_cost)(vector2d_t *, cabac_data_t*))  // cost callback used to pick the MV predictor
+{
+  const int x_cu = x >> 3;
+  const int y_cu = y >> 3;
+  const videoframe_t * const frame = state->tile->frame;
+  kvz_picture *ref_image = state->global->ref->images[ref_idx];
+  uint32_t temp_bitcost = 0;
+  uint32_t temp_cost = 0;
+  vector2d_t orig, mvd;
+  int32_t merged = 0;
+  uint8_t cu_mv_cand = 0;
+  int8_t merge_idx = 0;
+  int8_t ref_list = state->global->refmap[ref_idx].list-1;  // refmap list value minus one; used as first index of mv/mv_ref/mvd
+  int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];  // saved so mv_ref can be restored after the candidate lookup
+  orig.x = x_cu * CU_MIN_SIZE_PIXELS;
+  orig.y = y_cu * CU_MIN_SIZE_PIXELS;
+  // Get MV candidates. mv_ref is set to ref_idx only for the duration of the call.
+  cur_cu->inter.mv_ref[ref_list] = ref_idx;
+  kvz_inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, ref_list);
+  cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
+
+
+  vector2d_t mv = { 0, 0 };
+  {
+    // Take the starting point for the MV search from the CU data stored for
+    // this reference, at the centre of the current block. When temporal MV
+    // candidates are added there is probably no point to this anymore.
+    int mid_x_cu = (x + (LCU_WIDTH >> (depth+1))) / 8;
+    int mid_y_cu = (y + (LCU_WIDTH >> (depth+1))) / 8;
+    cu_info_t *ref_cu = &state->global->ref->cu_arrays[ref_idx]->data[mid_x_cu + mid_y_cu * (frame->width_in_lcu << MAX_DEPTH)];
+    if (ref_cu->type == CU_INTER) {
+      if (ref_cu->inter.mv_dir & 1) {  // bit 0 set: start from mv[0], otherwise from mv[1]
+        mv.x = ref_cu->inter.mv[0][0];
+        mv.y = ref_cu->inter.mv[0][1];
+      } else {
+        mv.x = ref_cu->inter.mv[1][0];
+        mv.y = ref_cu->inter.mv[1][1];
+      }
+    }
+  }
+
+#if SEARCH_MV_FULL_RADIUS  // NOTE(review): ref_pic below is not declared in this function (the local is ref_image) - confirm this branch still compiles when enabled
+  temp_cost += search_mv_full(depth, frame, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+#else
+  switch (state->encoder_control->cfg->ime_algorithm) {
+    case KVZ_IME_TZ:
+      temp_cost += tz_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+      break;
+
+    default:
+      temp_cost += hexagon_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+      break;
+    }  // end of switch (ime_algorithm)
+#endif
+  if (state->encoder_control->cfg->fme_level > 0) {  // the fractional search result replaces (is not added to) the cost above
+    temp_cost = search_frac(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
+  }
+
+  merged = 0;
+  // Look for a merge candidate with dir != 3 whose MV and reference equal the search result.
+  for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
+    if (merge_cand[merge_idx].dir != 3 &&
+        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x &&
+        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y &&
+        (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
+      merged = 1;
+      break;
+    }
+  }
+
+  // The two MV predictors are compared only when not merged and they differ.
+  if (!merged && (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
+    vector2d_t mvd_temp1, mvd_temp2;
+    int cand1_cost,cand2_cost;
+
+    mvd_temp1.x = mv.x - mv_cand[0][0];
+    mvd_temp1.y = mv.y - mv_cand[0][1];
+    cand1_cost = get_mvd_cost(&mvd_temp1, (cabac_data_t*)&state->cabac);
+
+    mvd_temp2.x = mv.x - mv_cand[1][0];
+    mvd_temp2.y = mv.y - mv_cand[1][1];
+    cand2_cost = get_mvd_cost(&mvd_temp2, (cabac_data_t*)&state->cabac);
+
+    // Select candidate 1 if it has lower cost (candidate 0 is the default)
+    if (cand2_cost < cand1_cost) {
+      cu_mv_cand = 1;
+    }
+  }
+  mvd.x = mv.x - mv_cand[cu_mv_cand][0];
+  mvd.y = mv.y - mv_cand[cu_mv_cand][1];
+
+  if(temp_cost < cur_cu->inter.cost) {  // store the result only if it beats the cost already recorded in cur_cu
+
+    // Map reference index to L0/L1 pictures
+    cur_cu->inter.mv_dir = ref_list+1;
+    cur_cu->inter.mv_ref_coded[ref_list] = state->global->refmap[ref_idx].idx;
+
+    cur_cu->merged        = merged;
+    cur_cu->merge_idx     = merge_idx;  // equals num_cand when no candidate matched (merged == 0)
+    cur_cu->inter.mv_ref[ref_list] = ref_idx;
+    cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
+    cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
+    cur_cu->inter.mvd[ref_list][0] = (int16_t)mvd.x;
+    cur_cu->inter.mvd[ref_list][1] = (int16_t)mvd.y;
+    cur_cu->inter.cost    = temp_cost;
+    cur_cu->inter.bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[ref_list];
+    cur_cu->inter.mv_cand[ref_list] = cu_mv_cand;
+  }
+}
+
+
 /**
  * Update lcu to have best modes at this depth.
  * \return Cost of best mode.
@@ -959,12 +1080,9 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
 {
   const videoframe_t * const frame = state->tile->frame;
   uint32_t ref_idx = 0;
-  int x_local = (x&0x3f), y_local = (y&0x3f);
-  int x_cu = x>>3;
-  int y_cu = y>>3;
-  int cu_pos = LCU_CU_OFFSET+(x_local>>3) + (y_local>>3)*LCU_T_CU_WIDTH;
-
-  cu_info_t *cur_cu = &lcu->cu[cu_pos];
+  int x_local = SUB_SCU(x);
+  int y_local = SUB_SCU(y);
+  cu_info_t *cur_cu = LCU_GET_CU(lcu, x_local >> 3, y_local >> 3);
 
   int16_t mv_cand[2][2];
   // Search for merge mode candidate
@@ -991,108 +1109,12 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
   cur_cu->inter.cost = UINT_MAX;
 
   for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
-    kvz_picture *ref_image = state->global->ref->images[ref_idx];
-    uint32_t temp_bitcost = 0;
-    uint32_t temp_cost = 0;
-    vector2d_t orig, mvd;
-    int32_t merged = 0;
-    uint8_t cu_mv_cand = 0;
-    int8_t merge_idx = 0;
-    int8_t ref_list = state->global->refmap[ref_idx].list-1;
-    int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
-    orig.x = x_cu * CU_MIN_SIZE_PIXELS;
-    orig.y = y_cu * CU_MIN_SIZE_PIXELS;
-    // Get MV candidates
-    cur_cu->inter.mv_ref[ref_list] = ref_idx;
-    kvz_inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, ref_list);
-    cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
-
-    vector2d_t mv = { 0, 0 };
-    {
-      // Take starting point for MV search from previous frame.
-      // When temporal motion vector candidates are added, there is probably
-      // no point to this anymore, but for now it helps.
-      int mid_x_cu = (x + (LCU_WIDTH >> (depth+1))) / 8;
-      int mid_y_cu = (y + (LCU_WIDTH >> (depth+1))) / 8;
-      cu_info_t *ref_cu = &state->global->ref->cu_arrays[ref_idx]->data[mid_x_cu + mid_y_cu * (frame->width_in_lcu << MAX_DEPTH)];
-      if (ref_cu->type == CU_INTER) {
-        if (ref_cu->inter.mv_dir & 1) {
-          mv.x = ref_cu->inter.mv[0][0];
-          mv.y = ref_cu->inter.mv[0][1];
-        } else {
-          mv.x = ref_cu->inter.mv[1][0];
-          mv.y = ref_cu->inter.mv[1][1];
-        }
-      }
-    }
-
-#if SEARCH_MV_FULL_RADIUS
-    temp_cost += search_mv_full(depth, frame, ref_pic, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-#else
-    switch (state->encoder_control->cfg->ime_algorithm) {
-      case KVZ_IME_TZ:
-        temp_cost += tz_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-        break;
-
-      default:
-        temp_cost += hexagon_search(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-        break;
-      }
-#endif
-    if (state->encoder_control->cfg->fme_level > 0) {
-      temp_cost = search_frac(state, depth, frame->source, ref_image, &orig, &mv, mv_cand, merge_cand, num_cand, ref_idx, &temp_bitcost);
-    }
-
-    merged = 0;
-    // Check every candidate to find a match
-    for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
-      if (merge_cand[merge_idx].dir != 3 &&
-          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x &&
-          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y &&          
-          (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
-        merged = 1;
-        break;
-      }
-    }
-
-    // Only check when candidates are different
-    if (!merged && (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
-      vector2d_t mvd_temp1, mvd_temp2;
-      int cand1_cost,cand2_cost;
-
-      mvd_temp1.x = mv.x - mv_cand[0][0];
-      mvd_temp1.y = mv.y - mv_cand[0][1];
-      cand1_cost = get_mvd_cost(&mvd_temp1, (cabac_data_t*)&state->cabac);
-
-      mvd_temp2.x = mv.x - mv_cand[1][0];
-      mvd_temp2.y = mv.y - mv_cand[1][1];
-      cand2_cost = get_mvd_cost(&mvd_temp2, (cabac_data_t*)&state->cabac);
-
-      // Select candidate 1 if it has lower cost
-      if (cand2_cost < cand1_cost) {
-        cu_mv_cand = 1;
-      }
-    }
-    mvd.x = mv.x - mv_cand[cu_mv_cand][0];
-    mvd.y = mv.y - mv_cand[cu_mv_cand][1];
-
-    if(temp_cost < cur_cu->inter.cost) {
-
-      // Map reference index to L0/L1 pictures
-      cur_cu->inter.mv_dir = ref_list+1;
-      cur_cu->inter.mv_ref_coded[ref_list] = state->global->refmap[ref_idx].idx;
-
-      cur_cu->merged        = merged;
-      cur_cu->merge_idx     = merge_idx;
-      cur_cu->inter.mv_ref[ref_list] = ref_idx;
-      cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
-      cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
-      cur_cu->inter.mvd[ref_list][0] = (int16_t)mvd.x;
-      cur_cu->inter.mvd[ref_list][1] = (int16_t)mvd.y;
-      cur_cu->inter.cost    = temp_cost;
-      cur_cu->inter.bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[ref_list];
-      cur_cu->inter.mv_cand[ref_list] = cu_mv_cand;
-    }
+    search_cu_inter_ref(state,
+                        x, y, depth,
+                        lcu, cur_cu,
+                        mv_cand, merge_cand, num_cand,
+                        ref_idx,
+                        get_mvd_cost);
   }
 
   // Search bi-pred positions
@@ -1155,7 +1177,8 @@ int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int d
           for (int ypos = 0; ypos < LCU_WIDTH >> depth; ++ypos) {
             int dst_y = ypos*(LCU_WIDTH >> depth);
             for (int xpos = 0; xpos < (LCU_WIDTH >> depth); ++xpos) {
-              tmp_block[dst_y + xpos] = templcu->rec.y[((y + ypos)&(LCU_WIDTH - 1))*LCU_WIDTH + ((x + xpos)&(LCU_WIDTH - 1))];              
+              tmp_block[dst_y + xpos] = templcu->rec.y[
+                SUB_SCU(y + ypos) * LCU_WIDTH + SUB_SCU(x + xpos)];
               tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width];
             }
           }
diff --git a/src/search_intra.c b/src/search_intra.c
index 09efa9b2..45790ef9 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -145,8 +145,8 @@ static double search_intra_trdepth(encoder_state_t * const state,
   const int width_c = width > TR_MIN_WIDTH ? width / 2 : width;
 
   const int offset = width / 2;
-  const vector2d_t lcu_px = { x_px & 0x3f, y_px & 0x3f };
-  cu_info_t *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+  const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
+  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
   const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
 
@@ -609,8 +609,8 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
   const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
 
   if (reconstruct_chroma) {
-    const vector2d_t lcu_px = { x_px & 0x3f, y_px & 0x3f };
-    cu_info_t *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+    const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
+    cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
     struct {
       double cost;
@@ -645,11 +645,10 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                               const int x_px, const int y_px,
                               const int depth, lcu_t *lcu)
 {
-  const vector2d_t lcu_px = { x_px & 0x3f, y_px & 0x3f };
+  const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
   const vector2d_t lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
-  const int cu_index = LCU_CU_OFFSET + lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH;
 
-  cu_info_t *cur_cu = &lcu->cu[cu_index];
+  cu_info_t *cur_cu = LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y);
   int8_t intra_mode = cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].mode;
 
   double costs[5];
@@ -710,13 +709,12 @@ double kvz_search_cu_intra(encoder_state_t * const state,
                            const int x_px, const int y_px,
                            const int depth, lcu_t *lcu)
 {
-  const vector2d_t lcu_px = { x_px & 0x3f, y_px & 0x3f };
+  const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
   const vector2d_t lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
   const int8_t cu_width = (LCU_WIDTH >> (depth));
-  const int cu_index = LCU_CU_OFFSET + lcu_cu.x + lcu_cu.y * LCU_T_CU_WIDTH;
   const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth;
 
-  cu_info_t *cur_cu = &lcu->cu[cu_index];
+  cu_info_t *cur_cu = LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y);
 
   kvz_intra_references refs;
 
@@ -728,10 +726,10 @@ double kvz_search_cu_intra(encoder_state_t * const state,
   // Select left and top CUs if they are available.
   // Top CU is not available across LCU boundary.
   if ((x_px >> 3) > 0) {
-    left_cu = &lcu->cu[cu_index - 1];
+    left_cu = LCU_GET_CU(lcu, lcu_cu.x - 1, lcu_cu.y);
   }
   if ((y_px >> 3) > 0 && lcu_cu.y != 0) {
-    above_cu = &lcu->cu[cu_index - LCU_T_CU_WIDTH];
+    above_cu = LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y - 1);
   }
   kvz_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu);
 
diff --git a/src/transform.c b/src/transform.c
index 954ead17..d5c62b88 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -220,10 +220,10 @@ int kvz_quantize_residual_trskip(
 void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu)
 {
   // we have 64>>depth transform size
-  const vector2d_t lcu_px = {x & 0x3f, y & 0x3f};
+  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
   const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
   if (cur_cu == NULL) {
-    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
   const int8_t width = LCU_WIDTH>>depth;
   
@@ -241,9 +241,9 @@ void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, in
 
     // Propagate coded block flags from child CUs to parent CU.
     if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + (lcu_px.y >> 3)        *LCU_T_CU_WIDTH];
-      cu_info_t *cu_b = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
-      cu_info_t *cu_c = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
+      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
+      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
+      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
       if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) {
         cbf_set(&cur_cu->cbf.y, depth);
       }
@@ -304,11 +304,11 @@ void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, in
 void kvz_quantize_lcu_chroma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu)
 {
   // we have 64>>depth transform size
-  const vector2d_t lcu_px = {x & 0x3f, y & 0x3f};
+  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
   const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4);
   const int8_t width = LCU_WIDTH>>depth;
   if (cur_cu == NULL) {
-    cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH];
+    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
   
   // Tell clang-analyzer what is up. For some reason it can't figure out from
@@ -325,9 +325,9 @@ void kvz_quantize_lcu_chroma_residual(encoder_state_t * const state, int32_t x,
 
     // Propagate coded block flags from child CUs to parent CU.
     if (depth < MAX_DEPTH) {
-      cu_info_t *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + (lcu_px.y >> 3)        *LCU_T_CU_WIDTH];
-      cu_info_t *cu_b = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
-      cu_info_t *cu_c = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset) >> 3) + ((lcu_px.y + offset) >> 3)*LCU_T_CU_WIDTH];
+      cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y);
+      cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset);
+      cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset);
       if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) {
         cbf_set(&cur_cu->cbf.u, depth);
       }