From 57ce7e990bb9167ecbd56d5784fc2be91a9a21e5 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 18 Mar 2014 11:26:42 +0200
Subject: [PATCH 1/8] Add new reference pixel buffer management to encoding
 loop.

- This is necessary because after we add in-loop filters to be done per LCU,
  the reconstruction buffer will have the deblocked pixels. We only need the
  edge-pixels for intra prediction though so we just save those.

- Right now it only copies the pixels and passes them on to search, where
  the copied pixels are asserted to be the same ones we copy from the
  reconstruction buffer.

- New yuv_t struct added for arrays of dynamic length. We might want to change
  other buffers to use it or something like it in the future.
---
 src/encoder.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 src/filter.h  |  2 +-
 src/picture.c | 17 +++++++++++++++++
 src/picture.h | 10 ++++++++++
 src/search.c  | 35 +++++++++++++++--------------------
 src/search.h  |  3 ++-
 6 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index db312234..46ebc1cf 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -383,6 +383,9 @@ static void write_aud(encoder_control* encoder)
 
 void encode_one_frame(encoder_control* encoder)
 {
+  yuv_t *hor_buf = alloc_yuv_t(encoder->in.width);
+  yuv_t *ver_buf = alloc_yuv_t(LCU_WIDTH);
+
   const int is_first_frame = (encoder->frame == 0);
   const int is_i_radl = (encoder->cfg->intra_period == 1 && encoder->frame % 2 == 0);
   const int is_p_radl = (encoder->cfg->intra_period > 1 && (encoder->frame % encoder->cfg->intra_period) == 0);
@@ -464,17 +467,50 @@ void encode_one_frame(encoder_control* encoder)
 
   {
     vector2d lcu;
+    picture *pic = encoder->in.cur_pic;
 
     for (lcu.y = 0; lcu.y < encoder->in.height_in_lcu; lcu.y++) {
       for (lcu.x = 0; lcu.x < encoder->in.width_in_lcu; lcu.x++) {
         const vector2d px = { lcu.x * LCU_WIDTH, lcu.y * LCU_WIDTH };
+        const vector2d size = { encoder->in.width, encoder->in.height };
 
-        search_lcu(encoder, px.x, px.y);
+        // Handle partial LCUs on the right and bottom.
+        const vector2d lcu_dim = {
+          MIN(LCU_WIDTH, size.x - px.x), MIN(LCU_WIDTH, size.y - px.y)
+        };
+        const int right = px.x + lcu_dim.x;
+        const int bottom = px.y + lcu_dim.y;
+
+        search_lcu(encoder, px.x, px.y, hor_buf, ver_buf);
+
+        // Take bottom and right pixels from this LCU to be used on the search of next LCU.
+        picture_blit_pixels(&pic->y_recdata[(bottom - 1) * size.x + px.x],
+                            &hor_buf->y[px.x],
+                            lcu_dim.x, 1, size.x, size.x);
+        picture_blit_pixels(&pic->u_recdata[(bottom / 2 - 1) * size.x / 2 + px.x / 2],
+                            &hor_buf->u[px.x / 2],
+                            lcu_dim.x / 2, 1, size.x / 2, size.x / 2);
+        picture_blit_pixels(&pic->v_recdata[(bottom / 2 - 1) * size.x / 2 + px.x / 2],
+                            &hor_buf->v[px.x / 2],
+                            lcu_dim.x / 2, 1, size.x / 2, size.x / 2);
+
+        picture_blit_pixels(&pic->y_recdata[px.y * size.x + right - 1],
+                            ver_buf->y,
+                            1, lcu_dim.y, size.x, 1);
+        picture_blit_pixels(&pic->u_recdata[px.y * size.x / 4 + (right / 2) - 1],
+                            ver_buf->u,
+                            1, lcu_dim.y / 2, size.x / 2, 1);
+        picture_blit_pixels(&pic->v_recdata[px.y * size.x / 4 + (right / 2) - 1],
+                            ver_buf->v,
+                            1, lcu_dim.y / 2, size.x / 2, 1);
+
+        //encode_lcu(encoder, x.px, y.px, hor_buf, ver_buf);
       }
     }
   }
 
   encode_slice_data(encoder);
+
   cabac_flush(&cabac);
   bitstream_align(encoder->stream);
   bitstream_flush(encoder->stream);
@@ -495,6 +531,9 @@ void encode_one_frame(encoder_control* encoder)
   add_checksum(encoder);
 
   encoder->in.cur_pic->poc = encoder->poc;
+
+  dealloc_yuv_t(hor_buf);
+  dealloc_yuv_t(ver_buf);
 }
 
 static void fill_after_frame(unsigned height, unsigned array_width,
diff --git a/src/filter.h b/src/filter.h
index f0bdc3ff..b7b31745 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -32,7 +32,7 @@
 //////////////////////////////////////////////////////////////////////////
 // FUNCTIONS
 // Deblocking
-void filter_deblock_cu(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
+void filter_deblock_cu(encoder_control *encoder, int32_t x_px, int32_t y_px,
                        int8_t depth, int32_t edge);
 void filter_deblock_edge_luma(encoder_control *encoder,
                               int32_t x_pos, int32_t y_pos,
diff --git a/src/picture.c b/src/picture.c
index 230f870d..6022e297 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -33,6 +33,23 @@
 
 #define PSNRMAX (255.0 * 255.0)
 
+
+yuv_t * alloc_yuv_t(int luma_size)
+{
+  yuv_t * yuv = (yuv_t *)malloc(sizeof(yuv_t) + luma_size * sizeof(pixel) * 2);
+  yuv->size = luma_size;
+  yuv->y = (pixel *)yuv + sizeof(yuv_t);
+  yuv->u = yuv->y + luma_size * sizeof(pixel);
+  yuv->v = yuv->u + luma_size / 2 * sizeof(pixel);
+  return yuv;
+}
+
+void dealloc_yuv_t(yuv_t * yuv)
+{
+  free(yuv);
+}
+
+
 /**
  * \brief BLock Image Transfer from one buffer to another.
  *
diff --git a/src/picture.h b/src/picture.h
index a7977137..90477617 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -177,6 +177,13 @@ typedef struct {
   pixel v[LCU_CHROMA_SIZE];
 } lcu_yuv_t;
 
+typedef struct {
+  int size;
+  pixel *y;
+  pixel *u;
+  pixel *v;
+} yuv_t;
+
 typedef struct {
   lcu_ref_px_t top_ref;  //!< Reference pixels from adjacent LCUs.
   lcu_ref_px_t left_ref; //!< Reference pixels from adjacent LCUs.
@@ -202,6 +209,9 @@ typedef struct {
 //////////////////////////////////////////////////////////////////////////
 // FUNCTIONS
 
+yuv_t * alloc_yuv_t(int luma_size);
+void dealloc_yuv_t(yuv_t * yuv);
+
 picture * picture_init(int32_t width, int32_t height,
                        int32_t width_in_lcu, int32_t height_in_lcu);
 int picture_destroy(picture *pic);
diff --git a/src/search.c b/src/search.c
index 0a220254..5999cce1 100644
--- a/src/search.c
+++ b/src/search.c
@@ -26,6 +26,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
 #include "config.h"
 #include "bitstream.h"
@@ -882,7 +883,7 @@ static int search_cu(encoder_control *encoder, int x, int y, int depth, lcu_t wo
  * - Copy reference pixels from neighbouring LCUs.
  * - Copy reference pixels from this LCU.
  */
-static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t *lcu)
+static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t *lcu, yuv_t *hor_buf, yuv_t *ver_buf)
 {
   // Copy reference cu_info structs from neighbouring LCUs.
   {
@@ -952,14 +953,18 @@ static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t
 
       picture_blit_pixels(&pic->u_recdata[x_c + (y_c - 1) * pic_width_c],
                           &lcu->top_ref.u[1],
-                          x_max, 1, pic_width_c, ref_size_c);
+                          x_max_c, 1, pic_width_c, ref_size_c);
       picture_blit_pixels(&pic->v_recdata[x_c + (y_c - 1) * pic_width_c],
                           &lcu->top_ref.v[1],
-                          x_max, 1, pic_width_c, ref_size_c);
+                          x_max_c, 1, pic_width_c, ref_size_c);
+
+      assert(!memcmp(&hor_buf->y[x], &lcu->top_ref.y[1], x_max));
+      assert(!memcmp(&hor_buf->u[x / 2], &lcu->top_ref.u[1], x_max_c));
+      assert(!memcmp(&hor_buf->v[x / 2], &lcu->top_ref.v[1], x_max_c));
     }
     // Copy left reference pixels.
     if (x > 0) {
-      int y_max = MIN(LCU_REF_PX_WIDTH, pic_height - y);
+      int y_max = MIN(LCU_WIDTH, pic_height - y);
       int y_max_c = y_max / 2;
       picture_blit_pixels(&pic->y_recdata[(x - 1) + y * pic_width],
                           &lcu->left_ref.y[1],
@@ -971,6 +976,10 @@ static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t
       picture_blit_pixels(&pic->v_recdata[(x_c - 1) + (y_c) * pic_width_c],
                           &lcu->left_ref.v[1],
                           1, y_max_c, pic_width_c, 1);
+
+      assert(!memcmp(ver_buf->y, &lcu->left_ref.y[1], y_max));
+      assert(!memcmp(ver_buf->u, &lcu->left_ref.u[1], y_max_c));
+      assert(!memcmp(ver_buf->v, &lcu->left_ref.v[1], y_max_c));
     }
     // Copy top-left reference pixel.
     if (x > 0 && y > 0) {
@@ -1065,14 +1074,14 @@ static void copy_lcu_to_cu_data(encoder_control *encoder, int x_px, int y_px, co
  * Search LCU for modes.
  * - Best mode gets copied to current picture.
  */
-void search_lcu(encoder_control *encoder, int x, int y)
+void search_lcu(encoder_control *encoder, int x, int y, yuv_t *hor_buf, yuv_t *ver_buf)
 {
   lcu_t work_tree[MAX_PU_DEPTH + 1];
   int depth;
   // Initialize work tree.
   for (depth = 0; depth <= MAX_PU_DEPTH; ++depth) {
     memset(&work_tree[depth], 0, sizeof(work_tree[depth]));
-    init_lcu_t(encoder, x, y, &work_tree[depth]);
+    init_lcu_t(encoder, x, y, &work_tree[depth], hor_buf, ver_buf);
   }
 
   // Start search from depth 0.
@@ -1080,17 +1089,3 @@ void search_lcu(encoder_control *encoder, int x, int y)
 
   copy_lcu_to_cu_data(encoder, x, y, &work_tree[0]);
 }
-
-
-/**
- * Perform mode search for every LCU in the current picture.
- */
-static void search_frame(encoder_control *encoder)
-{
-  int y_lcu, x_lcu;
-  for (y_lcu = 0; y_lcu < encoder->in.height_in_lcu; y_lcu++) {
-    for (x_lcu = 0; x_lcu < encoder->in.width_in_lcu; x_lcu++) {
-      search_lcu(encoder, x_lcu * LCU_WIDTH, y_lcu * LCU_WIDTH);
-    }
-  }
-}
diff --git a/src/search.h b/src/search.h
index d774acc9..4734c448 100644
--- a/src/search.h
+++ b/src/search.h
@@ -27,8 +27,9 @@
 #include "global.h"
 
 #include "encoder.h"
+#include "picture.h"
 
 
-void search_lcu(encoder_control *encoder, int x, int y);
+void search_lcu(encoder_control *encoder, int x, int y, yuv_t *hor_buf, yuv_t *ver_buf);
 
 #endif

From 7328fc2897d01a8e99a32849c21b26f972f0bd38 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 18 Mar 2014 15:26:40 +0200
Subject: [PATCH 2/8] Add special handling for bottom right LCU pixel.

I didn't take into account that the reference pixel on the top-left of the
LCU gets overwritten if we just replace the top reference pixels for the
current LCU with the bottom reference pixels after doing the search.
To handle this I copy the pixel that gets overwritten to the vertical
reference pixels.
---
 src/encoder.c | 18 ++++++++++++++----
 src/picture.c | 14 ++++++++++----
 src/search.c  | 10 +++++++---
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 46ebc1cf..bef9dcc3 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -384,7 +384,9 @@ static void write_aud(encoder_control* encoder)
 void encode_one_frame(encoder_control* encoder)
 {
   yuv_t *hor_buf = alloc_yuv_t(encoder->in.width);
-  yuv_t *ver_buf = alloc_yuv_t(LCU_WIDTH);
+  // Allocate 2 extra luma pixels so we get 1 extra chroma pixel for the
+  // extra pixel on the top right.
+  yuv_t *ver_buf = alloc_yuv_t(LCU_WIDTH + 2);
 
   const int is_first_frame = (encoder->frame == 0);
   const int is_i_radl = (encoder->cfg->intra_period == 1 && encoder->frame % 2 == 0);
@@ -483,6 +485,14 @@ void encode_one_frame(encoder_control* encoder)
 
         search_lcu(encoder, px.x, px.y, hor_buf, ver_buf);
 
+        // Take the bottom right pixel from the LCU above and put it as the
+        // first pixel in this LCUs rightmost pixels.
+        if (lcu.y > 0) {
+          ver_buf->y[0] = hor_buf->y[right - 1];
+          ver_buf->u[0] = hor_buf->u[right / 2 - 1];
+          ver_buf->v[0] = hor_buf->v[right / 2 - 1];
+        }
+
         // Take bottom and right pixels from this LCU to be used on the search of next LCU.
         picture_blit_pixels(&pic->y_recdata[(bottom - 1) * size.x + px.x],
                             &hor_buf->y[px.x],
@@ -495,13 +505,13 @@ void encode_one_frame(encoder_control* encoder)
                             lcu_dim.x / 2, 1, size.x / 2, size.x / 2);
 
         picture_blit_pixels(&pic->y_recdata[px.y * size.x + right - 1],
-                            ver_buf->y,
+                            &ver_buf->y[1],
                             1, lcu_dim.y, size.x, 1);
         picture_blit_pixels(&pic->u_recdata[px.y * size.x / 4 + (right / 2) - 1],
-                            ver_buf->u,
+                            &ver_buf->u[1],
                             1, lcu_dim.y / 2, size.x / 2, 1);
         picture_blit_pixels(&pic->v_recdata[px.y * size.x / 4 + (right / 2) - 1],
-                            ver_buf->v,
+                            &ver_buf->v[1],
                             1, lcu_dim.y / 2, size.x / 2, 1);
 
         //encode_lcu(encoder, x.px, y.px, hor_buf, ver_buf);
diff --git a/src/picture.c b/src/picture.c
index 6022e297..14e60322 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -36,16 +36,22 @@
 
 yuv_t * alloc_yuv_t(int luma_size)
 {
-  yuv_t * yuv = (yuv_t *)malloc(sizeof(yuv_t) + luma_size * sizeof(pixel) * 2);
+  // Get buffers with separate mallocs in order to take advantage of
+  // automatic buffer overrun checks.
+  yuv_t *yuv = (yuv_t *)malloc(sizeof(*yuv));
+  yuv->y = (pixel *)malloc(luma_size * sizeof(*yuv->y));
+  yuv->u = (pixel *)malloc(luma_size / 2 * sizeof(*yuv->u));
+  yuv->v = (pixel *)malloc(luma_size / 2 * sizeof(*yuv->v));
   yuv->size = luma_size;
-  yuv->y = (pixel *)yuv + sizeof(yuv_t);
-  yuv->u = yuv->y + luma_size * sizeof(pixel);
-  yuv->v = yuv->u + luma_size / 2 * sizeof(pixel);
+
   return yuv;
 }
 
 void dealloc_yuv_t(yuv_t * yuv)
 {
+  free(yuv->y);
+  free(yuv->u);
+  free(yuv->v);
   free(yuv);
 }
 
diff --git a/src/search.c b/src/search.c
index 5999cce1..519fcf55 100644
--- a/src/search.c
+++ b/src/search.c
@@ -977,9 +977,9 @@ static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t
                           &lcu->left_ref.v[1],
                           1, y_max_c, pic_width_c, 1);
 
-      assert(!memcmp(ver_buf->y, &lcu->left_ref.y[1], y_max));
-      assert(!memcmp(ver_buf->u, &lcu->left_ref.u[1], y_max_c));
-      assert(!memcmp(ver_buf->v, &lcu->left_ref.v[1], y_max_c));
+      assert(!memcmp(&ver_buf->y[1], &lcu->left_ref.y[1], y_max));
+      assert(!memcmp(&ver_buf->u[1], &lcu->left_ref.u[1], y_max_c));
+      assert(!memcmp(&ver_buf->v[1], &lcu->left_ref.v[1], y_max_c));
     }
     // Copy top-left reference pixel.
     if (x > 0 && y > 0) {
@@ -991,6 +991,10 @@ static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t
 
       lcu->top_ref.v[0] = pic->v_recdata[(x_c - 1) + (y_c - 1) * pic_width_c];
       lcu->left_ref.v[0] = pic->v_recdata[(x_c - 1) + (y_c - 1) * pic_width_c];
+
+      assert(ver_buf->y[0] == lcu->top_ref.y[0]);
+      assert(ver_buf->u[0] == lcu->top_ref.u[0]);
+      assert(ver_buf->v[0] == lcu->top_ref.v[0]);
     }
   }
 

From bbd1202f90672e68fe8e9241060c9f7115ce0112 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 18 Mar 2014 15:36:48 +0200
Subject: [PATCH 3/8] Replace old LCU reference pixels initialization with new
 ones.

In the future we might even have just a const pointer to reference pixels.
---
 src/search.c | 69 ++++++++++++----------------------------------------
 1 file changed, 16 insertions(+), 53 deletions(-)

diff --git a/src/search.c b/src/search.c
index 519fcf55..75ac55a3 100644
--- a/src/search.c
+++ b/src/search.c
@@ -883,7 +883,7 @@ static int search_cu(encoder_control *encoder, int x, int y, int depth, lcu_t wo
  * - Copy reference pixels from neighbouring LCUs.
  * - Copy reference pixels from this LCU.
  */
-static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t *lcu, yuv_t *hor_buf, yuv_t *ver_buf)
+static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t *lcu, const yuv_t *hor_buf, const yuv_t *ver_buf)
 {
   // Copy reference cu_info structs from neighbouring LCUs.
   {
@@ -931,70 +931,33 @@ static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t
 
   // Copy reference pixels.
   {
-    const picture *pic = encoder->in.cur_pic;
-
     const int pic_width = encoder->in.width;
-    const int pic_height = encoder->in.height;
-    const int ref_size = LCU_REF_PX_WIDTH;
-
-    const int pic_width_c = encoder->in.width / 2;
-    const int pic_height_c = encoder->in.height / 2;
-    const int ref_size_c = LCU_REF_PX_WIDTH / 2;
-    const int x_c = x / 2;
-    const int y_c = y / 2;
 
     // Copy top reference pixels.
     if (y > 0) {
-      int x_max = MIN(ref_size, pic_width - x);
-      int x_max_c = x_max / 2;
-      picture_blit_pixels(&pic->y_recdata[x + (y - 1) * pic_width],
-                          &lcu->top_ref.y[1],
-                          x_max, 1, pic_width, ref_size);
-
-      picture_blit_pixels(&pic->u_recdata[x_c + (y_c - 1) * pic_width_c],
-                          &lcu->top_ref.u[1],
-                          x_max_c, 1, pic_width_c, ref_size_c);
-      picture_blit_pixels(&pic->v_recdata[x_c + (y_c - 1) * pic_width_c],
-                          &lcu->top_ref.v[1],
-                          x_max_c, 1, pic_width_c, ref_size_c);
-
-      assert(!memcmp(&hor_buf->y[x], &lcu->top_ref.y[1], x_max));
-      assert(!memcmp(&hor_buf->u[x / 2], &lcu->top_ref.u[1], x_max_c));
-      assert(!memcmp(&hor_buf->v[x / 2], &lcu->top_ref.v[1], x_max_c));
+      // hor_buf is of size pic_width so there might not be LCU_REF_PX_WIDTH
+      // number of allocated pixels left.
+      int x_max = MIN(LCU_REF_PX_WIDTH, pic_width - x);
+      memcpy(&lcu->top_ref.y[1], &hor_buf->y[x], x_max);
+      memcpy(&lcu->top_ref.u[1], &hor_buf->u[x / 2], x_max / 2);
+      memcpy(&lcu->top_ref.v[1], &hor_buf->v[x / 2], x_max / 2);
     }
     // Copy left reference pixels.
     if (x > 0) {
-      int y_max = MIN(LCU_WIDTH, pic_height - y);
-      int y_max_c = y_max / 2;
-      picture_blit_pixels(&pic->y_recdata[(x - 1) + y * pic_width],
-                          &lcu->left_ref.y[1],
-                          1, y_max, pic_width, 1);
-
-      picture_blit_pixels(&pic->u_recdata[(x_c - 1) + (y_c) * pic_width_c],
-                          &lcu->left_ref.u[1],
-                          1, y_max_c, pic_width_c, 1);
-      picture_blit_pixels(&pic->v_recdata[(x_c - 1) + (y_c) * pic_width_c],
-                          &lcu->left_ref.v[1],
-                          1, y_max_c, pic_width_c, 1);
-
-      assert(!memcmp(&ver_buf->y[1], &lcu->left_ref.y[1], y_max));
-      assert(!memcmp(&ver_buf->u[1], &lcu->left_ref.u[1], y_max_c));
-      assert(!memcmp(&ver_buf->v[1], &lcu->left_ref.v[1], y_max_c));
+      memcpy(&lcu->left_ref.y[1], &ver_buf->y[1], LCU_WIDTH);
+      memcpy(&lcu->left_ref.u[1], &ver_buf->u[1], LCU_WIDTH);
+      memcpy(&lcu->left_ref.v[1], &ver_buf->v[1], LCU_WIDTH);
     }
     // Copy top-left reference pixel.
     if (x > 0 && y > 0) {
-      lcu->top_ref.y[0] = pic->y_recdata[(x - 1) + (y - 1) * pic_width];
-      lcu->left_ref.y[0] = pic->y_recdata[(x - 1) + (y - 1) * pic_width];
+      lcu->top_ref.y[0] = ver_buf->y[0];
+      lcu->left_ref.y[0] = ver_buf->y[0];
 
-      lcu->top_ref.u[0] = pic->u_recdata[(x_c - 1) + (y_c - 1) * pic_width_c];
-      lcu->left_ref.u[0] = pic->u_recdata[(x_c - 1) + (y_c - 1) * pic_width_c];
+      lcu->top_ref.u[0] = ver_buf->u[0];
+      lcu->left_ref.u[0] = ver_buf->u[0];
 
-      lcu->top_ref.v[0] = pic->v_recdata[(x_c - 1) + (y_c - 1) * pic_width_c];
-      lcu->left_ref.v[0] = pic->v_recdata[(x_c - 1) + (y_c - 1) * pic_width_c];
-
-      assert(ver_buf->y[0] == lcu->top_ref.y[0]);
-      assert(ver_buf->u[0] == lcu->top_ref.u[0]);
-      assert(ver_buf->v[0] == lcu->top_ref.v[0]);
+      lcu->top_ref.v[0] = ver_buf->v[0];
+      lcu->left_ref.v[0] = ver_buf->v[0];
     }
   }
 

From c42b25054a848a9ab6d96f8c4b8b7081d1769a6f Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 20 Mar 2014 17:30:20 +0200
Subject: [PATCH 4/8] Modify deblocking to be done per-LCU in the encoding
 loop.

- Intra works. There is still something wrong in inter.

- Avoid horizontal deblocking of the rightmost 4 pixels in the LCU.
  This is because vertical deblocking must be done for all pixels
  before horizontal, but vertical deblocking can't be done for those
  pixels before the next LCU is finished.

- Add separate deblocking of the rightmost pixels of the previous LCU
  (the one to the left) after the LCU edge has been deblocked.

- This is a pretty ugly hack but will have to do for now.
---
 src/encoder.c | 25 +++++++++++++++++++------
 src/filter.c  | 44 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index bef9dcc3..b60afae3 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -514,7 +514,25 @@ void encode_one_frame(encoder_control* encoder)
                             &ver_buf->v[1],
                             1, lcu_dim.y / 2, size.x / 2, 1);
 
-        //encode_lcu(encoder, x.px, y.px, hor_buf, ver_buf);
+        if (encoder->deblock_enable) {
+          filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_VER);
+
+          // Filter rightmost 4 pixels from last LCU now that they have been
+          // finally deblocked vertically.
+          if (lcu.x > 0) {
+            int y;
+            for (y = 0; y < 64; y += 8) {
+              if (lcu.y + y == 0) continue;
+              filter_deblock_edge_luma(encoder, lcu.x * 64 - 4, lcu.y * 64 + y, 4, EDGE_HOR);
+            }
+            for (y = 0; y < 32; y += 8) {
+              if (lcu.y + y == 0) continue;
+              filter_deblock_edge_chroma(encoder, lcu.x * 32 - 4, lcu.y * 32 + y, 4, EDGE_HOR);
+            }
+          }
+
+          filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_HOR);
+        }
       }
     }
   }
@@ -1262,11 +1280,6 @@ void encode_slice_data(encoder_control* encoder)
   picture *pic = encoder->in.cur_pic;
   const vector2d size_lcu = { encoder->in.width_in_lcu, encoder->in.height_in_lcu };
 
-  // Filtering
-  if(encoder->deblock_enable) {
-    filter_deblock(encoder);
-  }
-
   if (encoder->sao_enable) {
     pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
     pixel *new_u_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
diff --git a/src/filter.c b/src/filter.c
index 23796227..9334ee2c 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -179,6 +179,14 @@ void filter_deblock_edge_luma(encoder_control *encoder,
   int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE;
   int8_t strength = 0;
 
+  {
+    // Don't do anything if there is no PU or TU edge here.
+    int cu_width = LCU_WIDTH >> cu_q->depth;
+    if (dir == EDGE_HOR && ypos % cu_width != 0) {
+      return;
+    }
+  }
+
 
   if(dir == EDGE_VER) {
     offset = 1;
@@ -199,8 +207,19 @@ void filter_deblock_edge_luma(encoder_control *encoder,
     // For each 4-pixel part in the edge
     for (block_idx = 0; block_idx < blocks_in_part; ++block_idx) {
       int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d;
-      if((block_idx & 1) == 0)
+
       {
+        vector2d px = {
+          (dir == EDGE_HOR ? xpos + block_idx * 4 : xpos),
+          (dir == EDGE_VER ? ypos + block_idx * 4 : ypos)
+        };
+
+        // Don't deblock the last 4x4 block of the LCU. This will be deblocked
+        // when processing the next LCU.
+        if (block_idx > 0 && dir == EDGE_HOR && (px.x + 4) % 64 == 0 && (px.x + 4 != encoder->in.width)) {
+          continue;
+        }
+
         // CU in the side we are filtering, update every 8-pixels
         cu_p = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0)) +
                                                          (y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? block_idx>>1 : 0))
@@ -283,11 +302,19 @@ void filter_deblock_edge_chroma(encoder_control *encoder,
   int8_t strength = 2;
 
   // We cannot filter edges not on 8x8 grid
-  if((depth == MAX_DEPTH && (( (y & 0x7) && dir == EDGE_HOR ) || ( (x & 0x7) && dir == EDGE_VER ) ) ))
+  if((depth >= MAX_DEPTH && (( (y & 0x7) && dir == EDGE_HOR ) || ( (x & 0x7) && dir == EDGE_VER ) ) ))
   {
     return;
   }
 
+  {
+    // Don't do anything if there is no PU or TU edge here.
+    int cu_width = (LCU_WIDTH / 2) >> (cu_q->depth);
+    if (dir == EDGE_HOR && y % cu_width != 0) {
+      return;
+    }
+  }
+
   if(dir == EDGE_VER)
   {
     offset = 1;
@@ -300,14 +327,25 @@ void filter_deblock_edge_chroma(encoder_control *encoder,
     int32_t bitdepth_scale = 1 << (g_bitdepth-8);
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = g_tc_table_8x8[TC_index]*bitdepth_scale;
-    uint32_t blocks_in_part= (LCU_WIDTH>>(depth+1)) / 4;
+    uint32_t blocks_in_part= (LCU_WIDTH>>(depth == 4 ? depth : depth + 1)) / 4;
     uint32_t blk_idx;
 
     for (blk_idx = 0; blk_idx < blocks_in_part; ++blk_idx)
     {
+      vector2d px = {
+        (dir == EDGE_HOR ? x + blk_idx * 4 : x),
+        (dir == EDGE_VER ? y + blk_idx * 4 : y)
+      };
       cu_p = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? blk_idx : 0)) +
                                                          (y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? blk_idx : 0))
                                                           * (encoder->in.width_in_lcu << MAX_DEPTH)];
+
+      // Don't deblock the last 4x4 block of the LCU. This will be deblocked
+      // when processing the next LCU.
+      if (depth != 4 && dir == EDGE_HOR && (px.x + 4) % 32 == 0 && (px.x + 4 != encoder->in.width / 2)) {
+        continue;
+      }
+
       // Only filter when strenght == 2 (one of the blocks is intra coded)
       if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
         // Chroma U

From 0f492c7680eaf27598cadff9c95cf745e26a83c5 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 21 Mar 2014 10:42:41 +0200
Subject: [PATCH 5/8] Fix deblocking of transform boundaries.

This fixes issues with inter. Deblocking works now.
---
 src/filter.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/filter.c b/src/filter.c
index 9334ee2c..635715a0 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -180,11 +180,12 @@ void filter_deblock_edge_luma(encoder_control *encoder,
   int8_t strength = 0;
 
   {
-    // Don't do anything if there is no PU or TU edge here.
-    int cu_width = LCU_WIDTH >> cu_q->depth;
-    if (dir == EDGE_HOR && ypos % cu_width != 0) {
-      return;
-    }
+    // Return if called with a coordinate which is not at CU or TU boundary.
+    // TODO: Add handling for asymmetric inter CU boundaries which do not coincide
+    // with transform boundaries.
+    const int tu_width = LCU_WIDTH >> cu_q->tr_depth;
+    if (dir == EDGE_HOR && (ypos & (tu_width - 1))) return;
+    if (dir == EDGE_VER && (xpos & (tu_width - 1))) return;
   }
 
 

From 4d34377c42917ef8d2cc89874370b286477aca1c Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 21 Mar 2014 10:50:47 +0200
Subject: [PATCH 6/8] Clean up deblocking code a bit.

- Change guards to use the same method of checking for coordinate alignment.

- Move variables to reduce their scope.
---
 src/filter.c | 91 +++++++++++++++++++++++++++-------------------------
 1 file changed, 48 insertions(+), 43 deletions(-)

diff --git a/src/filter.c b/src/filter.c
index 635715a0..abf5afd5 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -166,18 +166,7 @@ void filter_deblock_edge_luma(encoder_control *encoder,
                               int32_t xpos, int32_t ypos,
                               int8_t depth, int8_t dir)
 {
-  int32_t stride = encoder->in.cur_pic->width;
-  int32_t offset = stride;
-  int32_t beta_offset_div2 = encoder->beta_offset_div2;
-  int32_t tc_offset_div2   = encoder->tc_offset_div2;
-  // TODO: support 10+bits
-  pixel *orig_src = &encoder->in.cur_pic->y_recdata[xpos + ypos*stride];
-  pixel *src = orig_src;
-  int32_t step = 1;
   cu_info *cu_q = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(xpos>>MIN_SIZE) + (ypos>>MIN_SIZE) * (encoder->in.width_in_lcu << MAX_DEPTH)];
-  cu_info *cu_p = NULL;
-  int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE;
-  int8_t strength = 0;
 
   {
     // Return if called with a coordinate which is not at CU or TU boundary.
@@ -188,13 +177,19 @@ void filter_deblock_edge_luma(encoder_control *encoder,
     if (dir == EDGE_VER && (xpos & (tu_width - 1))) return;
   }
 
-
-  if(dir == EDGE_VER) {
-    offset = 1;
-    step = stride;
-  }
-
   {
+    int32_t stride = encoder->in.cur_pic->width;
+    int32_t offset = stride;
+    int32_t beta_offset_div2 = encoder->beta_offset_div2;
+    int32_t tc_offset_div2   = encoder->tc_offset_div2;
+    // TODO: support 10+bits
+    pixel *orig_src = &encoder->in.cur_pic->y_recdata[xpos + ypos*stride];
+    pixel *src = orig_src;
+    int32_t step = 1;
+    cu_info *cu_p = NULL;
+    int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE;
+    int8_t strength = 0;
+
     int32_t qp              = encoder->QP;
     int32_t bitdepth_scale  = 1 << (g_bitdepth - 8);
     int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
@@ -203,6 +198,12 @@ void filter_deblock_edge_luma(encoder_control *encoder,
     uint32_t blocks_in_part = (LCU_WIDTH >> depth) / 4;
     uint32_t block_idx;
     int32_t tc_index,tc,thr_cut;
+
+    if (dir == EDGE_VER) {
+      offset = 1;
+      step = stride;
+    }
+
     // TODO: add CU based QP calculation
 
     // For each 4-pixel part in the edge
@@ -289,48 +290,52 @@ void filter_deblock_edge_chroma(encoder_control *encoder,
                                 int32_t x, int32_t y,
                                 int8_t depth, int8_t dir)
 {
-  int32_t stride = encoder->in.cur_pic->width >> 1;
-  int32_t tc_offset_div2 = encoder->tc_offset_div2;
-  // TODO: support 10+bits
-  pixel *src_u = &encoder->in.cur_pic->u_recdata[x + y*stride];
-  pixel *src_v = &encoder->in.cur_pic->v_recdata[x + y*stride];
-  // Init offset and step to EDGE_HOR
-  int32_t offset = stride;
-  int32_t step = 1;
   cu_info *cu_q = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x>>(MIN_SIZE-1)) + (y>>(MIN_SIZE-1)) * (encoder->in.width_in_lcu << MAX_DEPTH)];
-  cu_info *cu_p = NULL;
-  int16_t x_cu = x>>(MIN_SIZE-1),y_cu = y>>(MIN_SIZE-1);
-  int8_t strength = 2;
 
-  // We cannot filter edges not on 8x8 grid
-  if((depth >= MAX_DEPTH && (( (y & 0x7) && dir == EDGE_HOR ) || ( (x & 0x7) && dir == EDGE_VER ) ) ))
-  {
-    return;
+  // Chroma edges that do not lay on a 8x8 grid are not deblocked.
+  if (depth >= MAX_DEPTH) {
+    if (dir == EDGE_HOR && (y & (8 - 1))) return;
+    if (dir == EDGE_VER && (x & (8 - 1))) return;
   }
 
   {
-    // Don't do anything if there is no PU or TU edge here.
-    int cu_width = (LCU_WIDTH / 2) >> (cu_q->depth);
-    if (dir == EDGE_HOR && y % cu_width != 0) {
-      return;
-    }
-  }
-
-  if(dir == EDGE_VER)
-  {
-    offset = 1;
-    step = stride;
+    // Return if called with a coordinate which is not at CU or TU boundary.
+    // TODO: Add handling for asymmetric inter CU boundaries which do not coincide
+    // with transform boundaries.
+    const int tu_width = (LCU_WIDTH / 2) >> cu_q->tr_depth;
+    if (dir == EDGE_HOR && (y & (tu_width - 1))) return;
+    if (dir == EDGE_VER && (x & (tu_width - 1))) return;
   }
 
   // For each subpart
   {
+    int32_t stride = encoder->in.cur_pic->width >> 1;
+    int32_t tc_offset_div2 = encoder->tc_offset_div2;
+    // TODO: support 10+bits
+    pixel *src_u = &encoder->in.cur_pic->u_recdata[x + y*stride];
+    pixel *src_v = &encoder->in.cur_pic->v_recdata[x + y*stride];
+    // Init offset and step to EDGE_HOR
+    int32_t offset = stride;
+    int32_t step = 1;
+    cu_info *cu_p = NULL;
+    int16_t x_cu = x>>(MIN_SIZE-1),y_cu = y>>(MIN_SIZE-1);
+    int8_t strength = 2;
+
     int32_t QP             = g_chroma_scale[encoder->QP];
     int32_t bitdepth_scale = 1 << (g_bitdepth-8);
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = g_tc_table_8x8[TC_index]*bitdepth_scale;
+
+    // Special handling for depth 4. Its meaning is that we want to bypass the
+    // last-block-in-LCU check in order to deblock just that block.
     uint32_t blocks_in_part= (LCU_WIDTH>>(depth == 4 ? depth : depth + 1)) / 4;
     uint32_t blk_idx;
 
+    if(dir == EDGE_VER) {
+      offset = 1;
+      step = stride;
+    }
+
     for (blk_idx = 0; blk_idx < blocks_in_part; ++blk_idx)
     {
       vector2d px = {

From 746eaa3671aec47549e3619329f8096d5e826868 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 21 Mar 2014 11:11:40 +0200
Subject: [PATCH 7/8] Move deblocking code to filter module.

---
 src/encoder.c | 18 +-----------------
 src/filter.c  | 34 ++++++++++++++++++++++++++++++++++
 src/filter.h  |  1 +
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index b60afae3..7e47cf01 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -515,23 +515,7 @@ void encode_one_frame(encoder_control* encoder)
                             1, lcu_dim.y / 2, size.x / 2, 1);
 
         if (encoder->deblock_enable) {
-          filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_VER);
-
-          // Filter rightmost 4 pixels from last LCU now that they have been
-          // finally deblocked vertically.
-          if (lcu.x > 0) {
-            int y;
-            for (y = 0; y < 64; y += 8) {
-              if (lcu.y + y == 0) continue;
-              filter_deblock_edge_luma(encoder, lcu.x * 64 - 4, lcu.y * 64 + y, 4, EDGE_HOR);
-            }
-            for (y = 0; y < 32; y += 8) {
-              if (lcu.y + y == 0) continue;
-              filter_deblock_edge_chroma(encoder, lcu.x * 32 - 4, lcu.y * 32 + y, 4, EDGE_HOR);
-            }
-          }
-
-          filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_HOR);
+          filter_deblock_lcu(encoder, px.x, px.y);
         }
       }
     }
diff --git a/src/filter.c b/src/filter.c
index abf5afd5..c1ea9675 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -450,6 +450,40 @@ void filter_deblock(encoder_control* encoder)
 }
 
 
+/**
+ * \brief Deblock a single LCU without using data from right or down.
+ *
+ * Filter all the following edges:
+ * - All edges within the LCU, except for the last 4 pixels on the right when
+ *   using horizontal filtering.
+ * - Left edge and top edge.
+ * - After vertical filtering the left edge, filter the last 4 pixels of
+ *   horizontal edges in the LCU to the left.
+ */
+void filter_deblock_lcu(encoder_control *encoder, int x_px, int y_px)
+{
+  const vector2d lcu = { x_px / LCU_WIDTH, y_px / LCU_WIDTH };
+
+  filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_VER);
+
+  // Filter rightmost 4 pixels of the LCU to the left now that they have
+  // finally been deblocked vertically.
+  if (lcu.x > 0) {
+    int y;
+    for (y = 0; y < 64; y += 8) {
+      if (lcu.y + y == 0) continue;
+      filter_deblock_edge_luma(encoder, lcu.x * 64 - 4, lcu.y * 64 + y, 4, EDGE_HOR);
+    }
+    for (y = 0; y < 32; y += 8) {
+      if (lcu.y + y == 0) continue;
+      filter_deblock_edge_chroma(encoder, lcu.x * 32 - 4, lcu.y * 32 + y, 4, EDGE_HOR);
+    }
+  }
+
+  filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_HOR);
+}
+
+
 /**
  * \brief Interpolation for chroma half-pixel
  * \param src source image in integer pels (-2..width+3, -2..height+3)
diff --git a/src/filter.h b/src/filter.h
index b7b31745..fd33d9e6 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -41,6 +41,7 @@ void filter_deblock_edge_chroma(encoder_control *encoder,
                                 int32_t xpos, int32_t ypos,
                                 int8_t depth, int8_t dir);
 void filter_deblock(encoder_control *encoder);
+void filter_deblock_lcu(encoder_control *encoder, int x_px, int y_px);
 void filter_deblock_luma(pixel *src, int32_t offset, int32_t tc , int8_t sw,
                          int8_t part_p_nofilter, int8_t part_q_nofilter,
                          int32_t thr_cut,

From 953aef03797dbf20e5aabc9e3b5b3deec086ae02 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 21 Mar 2014 11:48:57 +0200
Subject: [PATCH 8/8] Move rest of LCU encoding inside the LCU loop.

- Move SAO search inside the LCU loop.

- Move CU coding inside the LCU loop.

- Move SAO frame reconstruction loop to sao module.
---
 src/encoder.c | 121 +++++++++++++++++++-------------------------------
 src/encoder.h |   1 -
 src/sao.c     |  33 ++++++++++++++
 src/sao.h     |   1 +
 4 files changed, 79 insertions(+), 77 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 7e47cf01..003387a9 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -50,6 +50,9 @@ int8_t g_convert_to_bit[LCU_WIDTH + 1];
 /* Local functions. */
 static void add_checksum(encoder_control* encoder);
 static void encode_VUI(encoder_control* encoder);
+static void encode_sao(encoder_control *encoder,
+                       unsigned x_lcu, uint16_t y_lcu,
+                       sao_info *sao_luma, sao_info *sao_chroma);
 
 /**
  * Initialize g_sig_last_scan with scan positions for a transform block of
@@ -393,6 +396,8 @@ void encode_one_frame(encoder_control* encoder)
   const int is_p_radl = (encoder->cfg->intra_period > 1 && (encoder->frame % encoder->cfg->intra_period) == 0);
   const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
 
+  picture *pic = encoder->in.cur_pic;
+
   // Initialize lambda value(s) to use in search
   init_lambda(encoder);
 
@@ -469,12 +474,12 @@ void encode_one_frame(encoder_control* encoder)
 
   {
     vector2d lcu;
-    picture *pic = encoder->in.cur_pic;
+    const vector2d size = { encoder->in.width, encoder->in.height };
+    const vector2d size_lcu = { encoder->in.width_in_lcu, encoder->in.height_in_lcu };
 
     for (lcu.y = 0; lcu.y < encoder->in.height_in_lcu; lcu.y++) {
       for (lcu.x = 0; lcu.x < encoder->in.width_in_lcu; lcu.x++) {
         const vector2d px = { lcu.x * LCU_WIDTH, lcu.y * LCU_WIDTH };
-        const vector2d size = { encoder->in.width, encoder->in.height };
 
         // Handle partial LCUs on the right and bottom.
         const vector2d lcu_dim = {
@@ -517,12 +522,43 @@ void encode_one_frame(encoder_control* encoder)
         if (encoder->deblock_enable) {
           filter_deblock_lcu(encoder, px.x, px.y);
         }
+
+        if (encoder->sao_enable) {
+          const int stride = encoder->in.width_in_lcu;
+          sao_info *sao_luma = &pic->sao_luma[lcu.y * stride + lcu.x];
+          sao_info *sao_chroma = &pic->sao_chroma[lcu.y * stride + lcu.x];
+          init_sao_info(sao_luma);
+          init_sao_info(sao_chroma);
+
+          {
+            sao_info *sao_top = lcu. y != 0 ? &pic->sao_luma[(lcu.y - 1) * stride + lcu.x] : NULL;
+            sao_info *sao_left = lcu.x != 0 ? &pic->sao_luma[lcu.y * stride + lcu.x -1] : NULL;
+            sao_search_luma(encoder->in.cur_pic, lcu.x, lcu.y, sao_luma, sao_top, sao_left);
+          }
+
+          {
+            sao_info *sao_top = lcu.y != 0 ? &pic->sao_chroma[(lcu.y - 1) * stride + lcu.x] : NULL;
+            sao_info *sao_left = lcu.x != 0 ? &pic->sao_chroma[lcu.y * stride + lcu.x - 1] : NULL;
+            sao_search_chroma(encoder->in.cur_pic, lcu.x, lcu.y, sao_chroma, sao_top, sao_left);
+          }
+
+          // Merge only if both luma and chroma can be merged
+          sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag;
+          sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag;
+
+          encode_sao(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
+        }
+
+        encode_coding_tree(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0);
+
+        {
+          const int last_lcu = (lcu.x == size_lcu.x - 1 && lcu.y == size_lcu.y - 1);
+          cabac_encode_bin_trm(&cabac, last_lcu ? 1 : 0);  // end_of_slice_segment_flag
+        }
       }
     }
   }
 
-  encode_slice_data(encoder);
-
   cabac_flush(&cabac);
   bitstream_align(encoder->stream);
   bitstream_flush(encoder->stream);
@@ -539,6 +575,10 @@ void encode_one_frame(encoder_control* encoder)
 
   bitstream_clear_buffer(encoder->stream);
 
+  if (encoder->sao_enable) {
+    sao_reconstruct_frame(encoder);
+  }
+
   // Calculate checksum
   add_checksum(encoder);
 
@@ -1241,7 +1281,7 @@ static void encode_sao_merge_flags(sao_info *sao,
 }
 
 /**
- * \brief Stub that encodes all LCU's as none type.
+ * \brief Encode SAO information.
  */
 static void encode_sao(encoder_control *encoder,
                        unsigned x_lcu, uint16_t y_lcu,
@@ -1258,77 +1298,6 @@ static void encode_sao(encoder_control *encoder,
   }
 }
 
-void encode_slice_data(encoder_control* encoder)
-{
-  uint16_t x_ctb, y_ctb;
-  picture *pic = encoder->in.cur_pic;
-  const vector2d size_lcu = { encoder->in.width_in_lcu, encoder->in.height_in_lcu };
-
-  if (encoder->sao_enable) {
-    pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
-    pixel *new_u_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
-    pixel *new_v_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
-    memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
-    memcpy(new_u_data, pic->u_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2);
-    memcpy(new_v_data, pic->v_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2);
-
-    for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
-      for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
-        unsigned stride = encoder->in.width_in_lcu;
-
-        //Fetch luma top and left merge candidate
-        sao_info *sao_top = y_ctb!=0?&pic->sao_luma[(y_ctb-1) * stride + x_ctb]:NULL;
-        sao_info *sao_left = x_ctb!=0?&pic->sao_luma[y_ctb * stride + x_ctb -1]:NULL;
-
-        sao_info *sao_luma = &pic->sao_luma[y_ctb * stride + x_ctb];
-        sao_info *sao_chroma = &pic->sao_chroma[y_ctb * stride + x_ctb];
-        init_sao_info(sao_luma);
-        init_sao_info(sao_chroma);
-
-        sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma, sao_top, sao_left);
-        // Chroma top and left merge candidate
-        sao_top = y_ctb!=0?&pic->sao_chroma[(y_ctb-1) * stride + x_ctb]:NULL;
-        sao_left = x_ctb!=0?&pic->sao_chroma[y_ctb * stride + x_ctb -1]:NULL;
-        sao_search_chroma(encoder->in.cur_pic, x_ctb, y_ctb, sao_chroma, sao_top, sao_left);
-
-        // Merge only if both luma and chroma can be merged
-        sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag;
-        sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag;
-
-        // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-        sao_reconstruct(pic, new_y_data, x_ctb, y_ctb, sao_luma, COLOR_Y);
-        sao_reconstruct(pic, new_u_data, x_ctb, y_ctb, sao_chroma, COLOR_U);
-        sao_reconstruct(pic, new_v_data, x_ctb, y_ctb, sao_chroma, COLOR_V);
-      }
-    }
-
-    free(new_y_data);
-    free(new_u_data);
-    free(new_v_data);
-  }
-
-  // Loop through every LCU in the slice
-  for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
-    for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
-      uint8_t depth = 0;
-      const int last_lcu = (x_ctb == size_lcu.x - 1 && y_ctb == size_lcu.y - 1);
-
-      if (encoder->sao_enable) {
-        picture *pic = encoder->in.cur_pic;
-        unsigned stride = encoder->in.width_in_lcu;
-        sao_info sao_luma = pic->sao_luma[y_ctb * stride + x_ctb];
-        sao_info sao_chroma = pic->sao_chroma[y_ctb * stride + x_ctb];
-
-        encode_sao(encoder, x_ctb, y_ctb, &sao_luma, &sao_chroma);
-      }
-
-      // Recursive function for looping through all the sub-blocks
-      encode_coding_tree(encoder, x_ctb << MAX_DEPTH, y_ctb << MAX_DEPTH, depth);
-
-      cabac_encode_bin_trm(&cabac, last_lcu ? 1 : 0);  // end_of_slice_segment_flag
-    }
-  }
-}
 
 void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
                         uint16_t y_ctb, uint8_t depth)
diff --git a/src/encoder.h b/src/encoder.h
index b4f6dd22..8fda103c 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -109,7 +109,6 @@ int read_one_frame(FILE *file, encoder_control *encoder);
 void encode_seq_parameter_set(encoder_control *encoder);
 void encode_pic_parameter_set(encoder_control *encoder);
 void encode_vid_parameter_set(encoder_control *encoder);
-void encode_slice_data(encoder_control *encoder);
 void encode_slice_header(encoder_control *encoder);
 void encode_access_unit_delimiter(encoder_control* encoder);
 void encode_prefix_sei_version(encoder_control* encoder);
diff --git a/src/sao.c b/src/sao.c
index 3685df66..8edab2db 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -634,3 +634,36 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf
   rec_list[0] = rec;
   sao_search_best_mode(orig_list, rec_list, block_width, block_height, 1, sao, sao_top, sao_left);
 }
+
+void sao_reconstruct_frame(encoder_control *encoder)
+{
+  vector2d lcu;
+  picture *pic = encoder->in.cur_pic;
+
+  // These are needed because SAO needs the pre-SAO pixels from the left and
+  // top LCUs. Single pixel wide buffers, like what search_lcu takes, would
+  // be enough though.
+  pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
+  pixel *new_u_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
+  pixel *new_v_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
+  memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
+  memcpy(new_u_data, pic->u_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2);
+  memcpy(new_v_data, pic->v_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2);
+
+  for (lcu.y = 0; lcu.y < encoder->in.height_in_lcu; lcu.y++) {
+    for (lcu.x = 0; lcu.x < encoder->in.width_in_lcu; lcu.x++) {
+      unsigned stride = encoder->in.width_in_lcu;
+      sao_info *sao_luma = &pic->sao_luma[lcu.y * stride + lcu.x];
+      sao_info *sao_chroma = &pic->sao_chroma[lcu.y * stride + lcu.x];
+
+      // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
+      sao_reconstruct(pic, new_y_data, lcu.x, lcu.y, sao_luma, COLOR_Y);
+      sao_reconstruct(pic, new_u_data, lcu.x, lcu.y, sao_chroma, COLOR_U);
+      sao_reconstruct(pic, new_v_data, lcu.x, lcu.y, sao_chroma, COLOR_V);
+    }
+  }
+
+  free(new_y_data);
+  free(new_u_data);
+  free(new_v_data);
+}
diff --git a/src/sao.h b/src/sao.h
index deaa3a71..45733ceb 100644
--- a/src/sao.h
+++ b/src/sao.h
@@ -51,5 +51,6 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf
 void sao_reconstruct(picture *pic, const pixel *old_rec,
                      unsigned x_ctb, unsigned y_ctb,
                      const sao_info *sao, color_index color_i);
+void sao_reconstruct_frame(encoder_control *encoder);
 
 #endif