From 291be9507b409735cfe6bcca1d342060c8c152e6 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 23 Oct 2013 19:51:39 +0300
Subject: [PATCH 01/25] Start adding Sample Adaptive Offset capability.

---
 src/context.c |  10 ++++
 src/context.h |   7 +++
 src/encmain.c |   2 +-
 src/encoder.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++--
 src/global.h  |   6 +++
 src/picture.c |   3 ++
 src/picture.h |   2 +
 7 files changed, 158 insertions(+), 4 deletions(-)

diff --git a/src/context.c b/src/context.c
index 2eabeec3..5ccbe15e 100644
--- a/src/context.c
+++ b/src/context.c
@@ -19,6 +19,10 @@
 
 
 // CONTEXTS
+cabac_ctx g_sao_merge_left_flag_model;
+cabac_ctx g_sao_merge_up_flag_model;
+cabac_ctx g_sao_type_idx_luma_model;
+cabac_ctx g_sao_type_idx_chroma_model;
 cabac_ctx g_split_flag_model[3]; //!< \brief split flag context models
 cabac_ctx g_intra_mode_model;    //!< \brief intra mode context models
 cabac_ctx g_chroma_pred_model[2];
@@ -74,6 +78,12 @@ void init_contexts(encoder_control *encoder, int8_t slice)
   uint16_t i;
 
   // Initialize contexts
+  ctx_init(&g_sao_merge_left_flag_model, encoder->QP, INIT_SAO_MERGE_FLAG[slice]);
+  ctx_init(&g_sao_merge_up_flag_model, encoder->QP, INIT_SAO_MERGE_FLAG[slice]);
+  
+  ctx_init(&g_sao_type_idx_luma_model, encoder->QP, INIT_SAO_TYPE_IDX[slice]);
+  ctx_init(&g_sao_type_idx_chroma_model, encoder->QP, INIT_SAO_TYPE_IDX[slice]);
+
   ctx_init(&g_cu_merge_flag_ext_model, encoder->QP, INIT_MERGE_FLAG_EXT[slice][0]);
   ctx_init(&g_cu_merge_idx_ext_model, encoder->QP, INIT_MERGE_IDX_EXT[slice][0]);
   ctx_init(&g_cu_pred_mode_model, encoder->QP, INIT_PRED_MODE[slice][0]);
diff --git a/src/context.h b/src/context.h
index c2d5cbf3..0b0c2bb6 100644
--- a/src/context.h
+++ b/src/context.h
@@ -37,6 +37,10 @@ int32_t context_get_sig_ctx_inc(int32_t pattern_sig_ctx,uint32_t scan_idx,int32_
                                 int32_t pos_y,int32_t block_type,int32_t width, int8_t texture_type);
 
 // CONTEXTS
+extern cabac_ctx g_sao_merge_left_flag_model;
+extern cabac_ctx g_sao_merge_up_flag_model;
+extern cabac_ctx g_sao_type_idx_luma_model;
+extern cabac_ctx g_sao_type_idx_chroma_model;
 extern cabac_ctx g_split_flag_model[3];
 extern cabac_ctx g_intra_mode_model;
 extern cabac_ctx g_chroma_pred_model[2];
@@ -65,6 +69,9 @@ extern cabac_ctx g_mvp_idx_model[2];
 extern cabac_ctx g_cu_qt_root_cbf_model;
 #define CNU 154
 
+static const uint8_t INIT_SAO_MERGE_FLAG[3] = { 153, 153, 153 };
+static const uint8_t INIT_SAO_TYPE_IDX[3] = { 200, 185, 160 };
+
 static const uint8_t 
 INIT_QT_ROOT_CBF[3][1] = 
 {
diff --git a/src/encmain.c b/src/encmain.c
index 0bd39731..58a86c4e 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -150,7 +150,7 @@ int main(int argc, char *argv[])
   encoder->beta_offset_div2  = 0;
   encoder->tc_offset_div2    = 0;
   // SAO
-  encoder->sao_enable = 0;
+  encoder->sao_enable = 1;
 
   init_encoder_input(&encoder->in, input, cfg->width, cfg->height);
 
diff --git a/src/encoder.c b/src/encoder.c
index 07e4a785..4e32caa0 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -705,6 +705,8 @@ void encode_VUI(encoder_control* encoder)
 
 void encode_slice_header(encoder_control* encoder)
 {
+  picture *cur_pic = encoder->in.cur_pic;
+
 #ifdef _DEBUG
   printf("=========== Slice ===========\n");
 #endif
@@ -749,9 +751,9 @@ void encode_slice_header(encoder_control* encoder)
     //end if
   //end if
   if (encoder->sao_enable) {
-      WRITE_U(encoder->stream, 1,1, "slice_sao_luma_flag");
-      WRITE_U(encoder->stream, 0,1, "slice_sao_chroma_flag");
-    }
+    WRITE_U(encoder->stream, cur_pic->slice_sao_luma_flag, 1, "slice_sao_luma_flag");
+    WRITE_U(encoder->stream, cur_pic->slice_sao_chroma_flag, 1, "slice_sao_chroma_flag");
+  }
     
   if (encoder->in.cur_pic->slicetype != SLICE_I) {
       WRITE_U(encoder->stream, 0, 1, "num_ref_idx_active_override_flag");
@@ -768,6 +770,126 @@ void encode_slice_header(encoder_control* encoder)
     //WRITE_U(encoder->stream, 1, 1, "alignment");
 }
 
+
+  // TODO: move somewhere else (sao.h?)
+#define SAO_TYPE_NONE 0
+#define SAO_TYPE_EDGE 1
+#define SAO_TYPE_BAND 2
+#define Y_INDEX 0
+#define U_INDEX 1
+#define V_INDEX 2
+#define YUV_INDEX_END 3
+
+#define NUM_COLORS 3
+#define NUM_SAO_OFFSETS 4
+
+typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2 } color_index;
+
+typedef struct {
+  int type;
+  int merge_left_flag;
+  int merge_up_flag;
+  int offsets[NUM_SAO_OFFSETS];
+  int eo_class;
+} sao_info;
+
+void encode_sao_offsets(encoder_control *encoder, sao_info *sao)
+{
+  int i;
+
+  for (i = 0; i < NUM_SAO_OFFSETS; ++i) {
+    CABAC_BIN(&cabac, sao->offsets[i] > 0 ? 0 : 1, "sao_offset_sign");
+  }
+
+  if (sao->type == SAO_TYPE_EDGE) {
+    for (i = 0; i < NUM_SAO_OFFSETS; ++i) {
+      if (sao->offsets[i] != 0) {
+        // For edge SAO positive sign is encoded as 0.
+        CABAC_BIN(&cabac, sao->offsets[i] > 0 ? 0 : 1, "sao_offset_sign");
+        // TODO: CABAC_BIN sao_band_position[color_i]
+      } else {
+        // TODO: CABAC_BIN sao_eo_class[color_i]
+      }
+    }
+  }
+}
+
+void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color_i)
+{
+  picture *pic = encoder->in.cur_pic;
+
+  // Skip colors with no SAO.
+  if (color_i == COLOR_Y && !pic->slice_sao_luma_flag) {
+    return;
+  } else if (!pic->slice_sao_chroma_flag) {
+    return;
+  }
+
+  if (color_i == COLOR_Y) {
+    cabac.ctx = &g_sao_type_idx_luma_model;
+    CABAC_BIN(&cabac, sao->type, "sao_type_idx_luma");
+  } else {
+    cabac.ctx = &g_sao_type_idx_chroma_model;
+    CABAC_BIN(&cabac, sao->type, "sao_type_idx_chroma");
+  }
+
+  if (sao->type != SAO_TYPE_NONE) {
+    encode_sao_offsets(encoder, 0);
+  }
+}
+
+void encode_sao_merge_flags(encoder_control *encoder, sao_info *sao,
+                            unsigned x_ctb, unsigned y_ctb)
+{
+  // SAO merge flags are not present if merge candidate is not in the same
+  // slice AND tile, but there isn't any such segmentation right now.
+  assert(!USE_SLICES && !USE_TILES);
+
+  // SAO merge flags are not present for the first row and column.
+  if (x_ctb > 0) {
+    cabac.ctx = &g_sao_merge_left_flag_model;
+    CABAC_BIN(&cabac, sao->merge_left_flag ? 1 : 0, "sao_merge_left_flag");
+  }
+  if (y_ctb > 0 && !sao->merge_left_flag) {
+    cabac.ctx = &g_sao_merge_up_flag_model;
+    CABAC_BIN(&cabac, sao->merge_up_flag ? 1 : 0, "sao_merge_up_flag");
+  }
+}
+
+/**
+ * \brief Stub that encodes all LCU's as none type.
+ */
+void encode_sao(encoder_control *encoder, unsigned x_lcu, uint16_t y_lcu)
+{
+  unsigned sao_type[3] = {SAO_TYPE_NONE, SAO_TYPE_NONE, SAO_TYPE_NONE};
+  picture *pic = encoder->in.cur_pic;
+  sao_info tmp_sao[3];
+  sao_info *sao = &tmp_sao[0];
+  
+  // The tmp_sao and these assignments are temporary. The sao pointer will
+  // be given to this function.
+  sao[0].merge_left_flag = 0;
+  sao[0].merge_up_flag = 0;
+  sao[0].type = SAO_TYPE_NONE;
+
+  sao[1].merge_left_flag = 0;
+  sao[1].merge_up_flag = 0;
+  sao[1].type = SAO_TYPE_NONE;
+
+  sao[2].merge_left_flag = 0;
+  sao[2].merge_up_flag = 0;
+  sao[2].type = SAO_TYPE_NONE;
+
+  encode_sao_merge_flags(encoder, sao, x_lcu, y_lcu);
+
+  // If SAO is merged, nothing else needs to be coded.
+  if (!sao->merge_left_flag && !sao->merge_up_flag) {
+    encode_sao_color(encoder, &sao[COLOR_Y], COLOR_Y);
+    encode_sao_color(encoder, &sao[COLOR_U], COLOR_U);
+    encode_sao_color(encoder, &sao[COLOR_V], COLOR_V);
+  }
+}
+
 void encode_slice_data(encoder_control* encoder)
 {
   uint16_t x_ctb, y_ctb;
@@ -783,6 +905,10 @@ void encode_slice_data(encoder_control* encoder)
       uint8_t last_cu_x = (x_ctb == (encoder->in.width_in_lcu - 1)) ? 1 : 0;
       uint8_t depth = 0;
 
+      if (encoder->sao_enable) {
+        encode_sao(encoder, x_ctb, y_ctb);
+      }
+
       // Recursive function for looping through all the sub-blocks
       encode_coding_tree(encoder, x_ctb << MAX_DEPTH, y_ctb << MAX_DEPTH, depth);
 
diff --git a/src/global.h b/src/global.h
index d54a322c..7e53b9a0 100644
--- a/src/global.h
+++ b/src/global.h
@@ -92,6 +92,12 @@ typedef int16_t coefficient;
 #define SIZE_NxN   3
 #define SIZE_NONE  15
 
+// These are for marking incomplete implementations that break if slices or
+// tiles are used with asserts. They should be set to 1 if they are ever
+// implemented.
+#define USE_SLICES 0
+#define USE_TILES 0
+
 /* Inlining functions */
 #ifdef _MSC_VER /* Visual studio */
   #define INLINE __forceinline
diff --git a/src/picture.c b/src/picture.c
index 1f60eda5..a45ae8bc 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -244,6 +244,9 @@ picture *picture_init(int32_t width, int32_t height,
     memset(pic->cu_array[i], 0, sizeof(cu_info) * cu_array_size);
   }
 
+  pic->slice_sao_luma_flag = 1;
+  pic->slice_sao_chroma_flag = 1;
+
   return pic;
 }
 
diff --git a/src/picture.h b/src/picture.h
index 46c4af14..dab3ad99 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -81,6 +81,8 @@ typedef struct
   cu_info** cu_array;           // \brief Info for each CU at each depth.
   uint8_t type;
   uint8_t slicetype;
+  uint8_t slice_sao_luma_flag;
+  uint8_t slice_sao_chroma_flag;
 } picture;
 
 /**

From 3244c98b297674fc7945004a7f837766b585d71f Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 24 Oct 2013 10:27:04 +0300
Subject: [PATCH 02/25] Fix sao cabac contexts.

According to the HM implementation, SAO luma and chroma use the same contexts.
---
 src/context.c | 13 ++++---------
 src/context.h |  6 ++----
 src/encoder.c |  7 +++----
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/context.c b/src/context.c
index 5ccbe15e..11995cd1 100644
--- a/src/context.c
+++ b/src/context.c
@@ -19,10 +19,8 @@
 
 
 // CONTEXTS
-cabac_ctx g_sao_merge_left_flag_model;
-cabac_ctx g_sao_merge_up_flag_model;
-cabac_ctx g_sao_type_idx_luma_model;
-cabac_ctx g_sao_type_idx_chroma_model;
+cabac_ctx g_sao_merge_flag_model;
+cabac_ctx g_sao_type_idx_model;
 cabac_ctx g_split_flag_model[3]; //!< \brief split flag context models
 cabac_ctx g_intra_mode_model;    //!< \brief intra mode context models
 cabac_ctx g_chroma_pred_model[2];
@@ -78,11 +76,8 @@ void init_contexts(encoder_control *encoder, int8_t slice)
   uint16_t i;
 
   // Initialize contexts
-  ctx_init(&g_sao_merge_left_flag_model, encoder->QP, INIT_SAO_MERGE_FLAG[slice]);
-  ctx_init(&g_sao_merge_up_flag_model, encoder->QP, INIT_SAO_MERGE_FLAG[slice]);
-  
-  ctx_init(&g_sao_type_idx_luma_model, encoder->QP, INIT_SAO_TYPE_IDX[slice]);
-  ctx_init(&g_sao_type_idx_chroma_model, encoder->QP, INIT_SAO_TYPE_IDX[slice]);
+  ctx_init(&g_sao_merge_flag_model, encoder->QP, INIT_SAO_MERGE_FLAG[slice]);
+  ctx_init(&g_sao_type_idx_model, encoder->QP, INIT_SAO_TYPE_IDX[slice]);
 
   ctx_init(&g_cu_merge_flag_ext_model, encoder->QP, INIT_MERGE_FLAG_EXT[slice][0]);
   ctx_init(&g_cu_merge_idx_ext_model, encoder->QP, INIT_MERGE_IDX_EXT[slice][0]);
diff --git a/src/context.h b/src/context.h
index 0b0c2bb6..237f7524 100644
--- a/src/context.h
+++ b/src/context.h
@@ -37,10 +37,8 @@ int32_t context_get_sig_ctx_inc(int32_t pattern_sig_ctx,uint32_t scan_idx,int32_
                                 int32_t pos_y,int32_t block_type,int32_t width, int8_t texture_type);
 
 // CONTEXTS
-extern cabac_ctx g_sao_merge_left_flag_model;
-extern cabac_ctx g_sao_merge_up_flag_model;
-extern cabac_ctx g_sao_type_idx_luma_model;
-extern cabac_ctx g_sao_type_idx_chroma_model;
+extern cabac_ctx g_sao_merge_flag_model;
+extern cabac_ctx g_sao_type_idx_model;
 extern cabac_ctx g_split_flag_model[3];
 extern cabac_ctx g_intra_mode_model;
 extern cabac_ctx g_chroma_pred_model[2];
diff --git a/src/encoder.c b/src/encoder.c
index 4e32caa0..d44fa12d 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -825,11 +825,10 @@ void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color
     return;
   }
 
+  cabac.ctx = &g_sao_type_idx_model;
   if (color_i == COLOR_Y) {
-    cabac.ctx = &g_sao_type_idx_luma_model;
     CABAC_BIN(&cabac, sao->type, "sao_type_idx_luma");
   } else {
-    cabac.ctx = &g_sao_type_idx_chroma_model;
     CABAC_BIN(&cabac, sao->type, "sao_type_idx_chroma");
   }
 
@@ -847,11 +846,11 @@ void encode_sao_merge_flags(encoder_control *encoder, sao_info *sao,
 
   // SAO merge flags are not present for the first row and column.
   if (x_ctb > 0) {
-    cabac.ctx = &g_sao_merge_left_flag_model;
+    cabac.ctx = &g_sao_merge_flag_model;
     CABAC_BIN(&cabac, sao->merge_left_flag ? 1 : 0, "sao_merge_left_flag");
   }
   if (y_ctb > 0 && !sao->merge_left_flag) {
-    cabac.ctx = &g_sao_merge_up_flag_model;
+    cabac.ctx = &g_sao_merge_flag_model;
     CABAC_BIN(&cabac, sao->merge_up_flag ? 1 : 0, "sao_merge_up_flag");
   }
 }

From 0fd9105f0a86f7ef872d91e8d73dec62d9d95db0 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 24 Oct 2013 11:04:16 +0300
Subject: [PATCH 03/25] Fix sao type context init values.

The slice types used for context initialization are in reverse order compared
to the slice type codes: they are B, P, I instead of I, P, B.
---
 src/context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/context.h b/src/context.h
index 237f7524..080da65f 100644
--- a/src/context.h
+++ b/src/context.h
@@ -68,7 +68,7 @@ extern cabac_ctx g_cu_qt_root_cbf_model;
 #define CNU 154
 
 static const uint8_t INIT_SAO_MERGE_FLAG[3] = { 153, 153, 153 };
-static const uint8_t INIT_SAO_TYPE_IDX[3] = { 200, 185, 160 };
+static const uint8_t INIT_SAO_TYPE_IDX[3] = { 160, 185, 200 };
 
 static const uint8_t 
 INIT_QT_ROOT_CBF[3][1] = 

From e9819cbb5eb8207d35c07a7d6037b037e3b718a0 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Thu, 24 Oct 2013 16:04:46 +0300
Subject: [PATCH 04/25] Fix coding of sao chroma type idx.

---
 src/encoder.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/encoder.c b/src/encoder.c
index d44fa12d..ff07cb6b 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -828,7 +828,8 @@ void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color
   cabac.ctx = &g_sao_type_idx_model;
   if (color_i == COLOR_Y) {
     CABAC_BIN(&cabac, sao->type, "sao_type_idx_luma");
-  } else {
+  } else if (color_i == COLOR_U) {
+    // SAO type is only coded for the first chroma.
     CABAC_BIN(&cabac, sao->type, "sao_type_idx_chroma");
   }
 

From 7bd090272788009452355b01c0732831c15f6047 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 25 Oct 2013 17:14:20 +0300
Subject: [PATCH 05/25] Implement fast distortion estimation for sao.

Add function for blitting pixels from one buffer to another.

Several commits have been squashed into this one.
---
 src/encoder.c | 234 +++++++++++++++++++++++++++++++++++++++++++-------
 src/global.h  |   9 ++
 src/picture.c |  31 +++++++
 src/picture.h |   3 +
 4 files changed, 246 insertions(+), 31 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index ff07cb6b..b7c07cd0 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -772,27 +772,208 @@ void encode_slice_header(encoder_control* encoder)
 
 
   // TODO: move somewhere else (sao.h?)
-#define SAO_TYPE_NONE 0
-#define SAO_TYPE_EDGE 1
-#define SAO_TYPE_BAND 2
 #define Y_INDEX 0
 #define U_INDEX 1
 #define V_INDEX 2
 #define YUV_INDEX_END 3
-
-#define NUM_COLORS 3
 #define NUM_SAO_OFFSETS 4
 
-typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2 } color_index;
+typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2, NUM_COLORS } color_index;
+typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
+typedef enum { SAO_EO0 = 0, SAO_EO1, SAO_EO2, SAO_EO3, SAO_NUM_EO } sao_eo_class;
+typedef enum { SAO_EO_CAT0 = 0, SAO_EO_CAT1, SAO_EO_CAT2, SAO_EO_CAT3, SAO_EO_CAT4, NUM_SAO_EDGE_CATEGORIES } sao_eo_cat;
 
 typedef struct {
-  int type;
+  sao_type type;
+  sao_eo_class eo_class;
+  int ddistortion;
   int merge_left_flag;
   int merge_up_flag;
   int offsets[NUM_SAO_OFFSETS];
-  int eo_class;
 } sao_info;
 
+//#define SIGN3(x) ((x) > 0) ? +1 : ((x) == 0 ? 0 : -1)
+#define SIGN3(x) (((x) > 0) - ((x) < 0))
+#define NUM_SAO_EDGE_DIRS 4;
+
+typedef struct {
+  int x;
+  int y;
+} vector2d;
+
+// Offsets of a and b in relation to c.
+// dir_offset[dir][a or b]
+// |       |   a   | a     |     a |
+// | a c b |   c   |   c   |   c   |
+// |       |   b   |     b | b     |
+static const vector2d g_sao_edge_offsets[4][2] = { 
+  { { 0, -1 }, { 0, 1 } },
+  { { -1, 0 }, { 1, 0 } },
+  { { -1, -1 }, { 1, 1 } },
+  { { -1, 1 }, { 1, -1 } }
+};
+// Mapping of edge_idx values to eo-classes.
+static const unsigned g_sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
+// Mapping relationships between a, b and c to eo_idx.
+#define EO_IDX(a, b, c) (2 + SIGN3((c) - (a)) + SIGN3((c) - (b)))
+
+/**
+ * \param orig_data  Original pixel data. 64x64 for luma, 32x32 for chroma.
+ * \param rec_data  Reconstructed pixel data. 64x64 for luma, 32x32 for chroma.
+ * \param dir_offsets
+ * \param is_chroma  0 for luma, 1 for chroma. Indicates 
+ */
+void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
+                       int eo_class, int block_width,
+                       int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
+{
+  int y, x;
+  vector2d a_ofs = g_sao_edge_offsets[eo_class][0];
+  vector2d b_ofs = g_sao_edge_offsets[eo_class][1];
+  // Arrays orig_data and rec_data are quarter size for chroma.
+
+  // Don't sample the edge pixels because this function doesn't have access to
+  // their neighbours.
+  for (y = 1; y < block_width - 1; ++y) {
+    for (x = 1; x < block_width - 1; ++x) {
+      const pixel *c_data = &rec_data[y * block_width + x];
+      pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
+      pixel c = c_data[0];
+      pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
+      
+      int eo_idx = EO_IDX(a, b, c);
+      int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
+
+      cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c;
+      cat_sum_cnt[1][eo_cat] += 1;
+    }
+  }
+}
+
+void sao_reconstruct_color(pixel *rec_data, const sao_info *sao, color_index color)
+{
+  unsigned y, x;
+  vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
+  vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
+  // Arrays orig_data and rec_data are quarter size for chroma.
+  unsigned block_width = LCU_WIDTH >> !(color == COLOR_Y);
+
+  for (y = 0; y < block_width; ++y) {
+    for (x = 0; x < block_width; ++x) {
+      pixel *c_data = &rec_data[y * block_width + x];
+      pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
+      pixel c = c_data[0];
+      pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
+      
+      int eo_idx = EO_IDX(a, b, c);
+      int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
+
+      c_data[0] += sao->offsets[eo_cat];
+    }
+  }
+}
+
+void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
+                     const sao_info *sao_luma, const sao_info *sao_chroma)
+{
+  pixel rec_y[LCU_LUMA_SIZE];
+  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  // TODO: sao chroma reconstruct
+
+  // Data to tmp buffer.
+  picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+
+  sao_reconstruct_color(rec_y, sao_luma, COLOR_Y);
+  //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
+  //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
+  
+  // Copy reconstructed block from tmp buffer to rec image.
+  picture_blit_pixels(rec_y, y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
+}
+
+void sao_search_best_mode(const pixel *data, const pixel *recdata, 
+                          unsigned block_width, unsigned buf_size, unsigned buf_cnt,
+                          sao_info *sao_out)
+{
+  sao_eo_class edge_class;
+  // This array is used to calculate the mean offset used to minimize distortion.
+  int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES];
+  memset(cat_sum_cnt, 0, 2 * NUM_SAO_EDGE_CATEGORIES);
+
+  sao_out->ddistortion = 0;
+
+  for (edge_class = SAO_EO0; edge_class <= SAO_EO3; ++edge_class) {
+    int edge_offset[NUM_SAO_EDGE_CATEGORIES];
+    int sum_ddistortion = 0;
+    sao_eo_cat edge_cat;
+
+    // Call calc_sao_edge_dir once for luma and twice for chroma.
+    while (buf_cnt--) {
+      calc_sao_edge_dir(data, recdata, edge_class, block_width, cat_sum_cnt);
+      data += buf_size;
+      recdata += buf_size;
+    }
+    
+    for (edge_cat = SAO_EO_CAT1; edge_cat <= SAO_EO_CAT4; ++edge_cat) {
+      int cat_sum = cat_sum_cnt[0][edge_cat];
+      int cat_cnt = cat_sum_cnt[1][edge_cat];
+      
+      // The optimum offset can be calculated by getting the minima of the
+      // fast ddistortion estimation formula. The minima is the mean error
+      // and we round that to the nearest integer.
+      int offset = (cat_sum + (cat_cnt >> 1)) / cat_cnt;
+      edge_offset[edge_cat] = offset;
+      // The ddistortion is amount by which the SSE of data changes. It should
+      // be negative for all categories, if offset was chosen correctly.
+      // ddistortion = N * h^2 - 2 * h * E, where N is the number of samples 
+      // and E is the sum of errors.
+      // It basically says that all pixels that are not improved by offset
+      // increase increase SSE by h^2 and all pixels that are improved by
+      // offset decrease SSE by h*E.
+      sum_ddistortion += cat_cnt * offset * offset - 2 * offset * cat_sum;
+    }
+    // SAO is not applied for category 0.
+    edge_offset[SAO_EO_CAT0] = 0;
+
+    // Choose the offset class that offers the least error after offset.
+    if (sum_ddistortion < sao_out->ddistortion) {
+      sao_out->eo_class = edge_class;
+      sao_out->ddistortion = sum_ddistortion;
+      memcpy(sao_out->offsets, edge_offset, NUM_SAO_EDGE_CATEGORIES);
+    }
+  }
+}
+
+sao_info sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb)
+{
+  sao_info sao;
+  sao.merge_left_flag = 0;
+  sao.merge_up_flag = 0;
+  sao.type = SAO_TYPE_NONE;
+  return sao;
+}
+
+sao_info sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb)
+{
+  // These buffers are needed only until we switch to a LCU based data
+  // structure for pixels. Then we can give pointers directly to that structure
+  // without making copies.
+  // It's 2-dimensional because sao_search_best_mode takes arguments as arrays.
+  pixel orig_y[LCU_LUMA_SIZE];
+  pixel rec_y[LCU_LUMA_SIZE];
+  pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  sao_info sao_params;
+
+  // Fill temporary buffers with picture data.
+  picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+  picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+
+  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, &sao_params);
+
+  return sao_params;
+}
+
 void encode_sao_offsets(encoder_control *encoder, sao_info *sao)
 {
   int i;
@@ -859,34 +1040,20 @@ void encode_sao_merge_flags(encoder_control *encoder, sao_info *sao,
 /**
  * \brief Stub that encodes all LCU's as none type.
  */
-void encode_sao(encoder_control *encoder, unsigned x_lcu, uint16_t y_lcu)
+void encode_sao(encoder_control *encoder, unsigned x_lcu, uint16_t y_lcu,
+                sao_info *sao_luma, sao_info *sao_chroma)
 {
   unsigned sao_type[3] = {SAO_TYPE_NONE, SAO_TYPE_NONE, SAO_TYPE_NONE};
   picture *pic = encoder->in.cur_pic;
-  sao_info tmp_sao[3];
-  sao_info *sao = &tmp_sao[0];
   
-  // The tmp_sao and these assignments are temporary. The sao pointer will
-  // be given to this function.
-  sao[0].merge_left_flag = 0;
-  sao[0].merge_up_flag = 0;
-  sao[0].type = SAO_TYPE_NONE;
-
-  sao[1].merge_left_flag = 0;
-  sao[1].merge_up_flag = 0;
-  sao[1].type = SAO_TYPE_NONE;
-
-  sao[2].merge_left_flag = 0;
-  sao[2].merge_up_flag = 0;
-  sao[2].type = SAO_TYPE_NONE;
-
-  encode_sao_merge_flags(encoder, sao, x_lcu, y_lcu);
+  // TODO: transmit merge flags outside sao_info
+  encode_sao_merge_flags(encoder, sao_luma, x_lcu, y_lcu);
 
   // If SAO is merged, nothing else needs to be coded.
-  if (!sao->merge_left_flag && !sao->merge_up_flag) {
-    encode_sao_color(encoder, &sao[COLOR_Y], COLOR_Y);
-    encode_sao_color(encoder, &sao[COLOR_U], COLOR_U);
-    encode_sao_color(encoder, &sao[COLOR_V], COLOR_V);
+  if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) {
+    encode_sao_color(encoder, sao_luma, COLOR_Y);
+    encode_sao_color(encoder, sao_chroma, COLOR_U);
+    encode_sao_color(encoder, sao_chroma, COLOR_V);
   }
 }
 
@@ -906,7 +1073,12 @@ void encode_slice_data(encoder_control* encoder)
       uint8_t depth = 0;
 
       if (encoder->sao_enable) {
-        encode_sao(encoder, x_ctb, y_ctb);
+        sao_info sao_luma = sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb);
+        sao_info sao_chroma = sao_search_chroma(encoder->in.cur_pic, x_ctb, y_ctb);
+        
+        // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+        // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+        encode_sao(encoder, x_ctb, y_ctb, &sao_luma, &sao_chroma);
       }
 
       // Recursive function for looping through all the sub-blocks
diff --git a/src/global.h b/src/global.h
index 7e53b9a0..28fd5b47 100644
--- a/src/global.h
+++ b/src/global.h
@@ -65,6 +65,9 @@ typedef int16_t coefficient;
 
 /* END OF CONFIG VARIABLES */
 
+#define LCU_LUMA_SIZE (LCU_WIDTH * LCU_WIDTH)
+#define LCU_CHROMA_SIZE (LCU_WIDTH * LCU_WIDTH >> 2)
+
 #define MAX_REF_PIC_COUNT 5
 
 #define AMVP_MAX_NUM_CANDS 2
@@ -80,6 +83,12 @@ typedef int16_t coefficient;
 #define NO_SCU_IN_LCU(no_lcu) ((no_lcu) << MAX_DEPTH)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
 
+#define LOG2_LCU_WIDTH 6
+// CU_TO_PIXEL = y * lcu_width * pic_width + x * lcu_width
+#define CU_TO_PIXEL(x, y, depth, width) (((y) << (LOG2_LCU_WIDTH - (depth))) * (width) \
+                                         + ((x) << (LOG2_LCU_WIDTH - (depth))))
+
+
 #define VERSION_STRING "0.2               "
 #define VERSION 0.2
 
diff --git a/src/picture.c b/src/picture.c
index a45ae8bc..38a314e0 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -43,6 +43,37 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
   }
 }
 
+/**
+ * \brief BLock Image Transfer from one buffer to another.
+ *
+ * It's a stupidly simple loop that copies pixels.
+ *
+ * \param orig  Start of the originating buffer.
+ * \param dst  Start of the destination buffer.
+ * \param width  Width of the copied region.
+ * \param height  Height of the copied region.
+ * \param orig_stride  Width of a row in the originating buffer.
+ * \param dst_stride  Width of a row in the destination buffer.
+ *
+ * This should be inlined, but it's defined here for now to see if Visual
+ * Studios LTCG will inline it.
+ */
+void picture_blit_pixels(const pixel* orig, pixel *dst,
+                         unsigned width, unsigned height,
+                         unsigned orig_stride, unsigned dst_stride)
+{
+  unsigned y, x;
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      dst[x] = orig[x];
+    }
+    // Move pointers to the next row.
+    orig += orig_stride;
+    dst += dst_stride;
+  }
+}
+
 /**
  * \brief Set block coded status
  * \param pic    picture to use
diff --git a/src/picture.h b/src/picture.h
index dab3ad99..6d05fa91 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -108,6 +108,9 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
                                 uint8_t depth, int8_t residual);
 void picture_set_block_split(picture *pic, uint32_t x_scu, uint32_t y_scu,
                              uint8_t depth, int8_t split);
+void picture_blit_pixels(const pixel* orig, pixel *dst,
+                         unsigned width, unsigned height,
+                         unsigned orig_stride, unsigned dst_stride);
 
 picture_list * picture_list_init(int size);
 int picture_list_resize(picture_list *list, int size);

From 66fe30252043a61eafcb617dec5ceea50f5e5069 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 1 Nov 2013 00:52:06 +0200
Subject: [PATCH 06/25] Fix cabac context for sao. HM accepts encoded sao.

---
 src/cabac.c   |  39 +++++++++++++++--
 src/cabac.h   |   1 +
 src/encoder.c | 117 ++++++++++++++++++++++++++++----------------------
 src/picture.c |   5 ++-
 4 files changed, 107 insertions(+), 55 deletions(-)

diff --git a/src/cabac.c b/src/cabac.c
index 7d7fd57c..81b64e74 100644
--- a/src/cabac.c
+++ b/src/cabac.c
@@ -11,6 +11,7 @@
 
 #include "cabac.h"
 
+#include <assert.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -281,20 +282,52 @@ void cabac_write_unary_max_symbol(cabac_data *data, cabac_ctx *ctx, uint32_t sym
 {
   int8_t code_last = max_symbol > symbol;
 
+  assert(symbol <= max_symbol);
+
   if (!max_symbol) return;
   
   data->ctx = &ctx[0];
-  cabac_encode_bin(data, symbol ? 1 : 0);
+  CABAC_BIN(data, symbol ? 1 : 0, "ums");
   
   if (!symbol) return;
   
   while (--symbol) {
     data->ctx = &ctx[offset];
-    cabac_encode_bin(data, 1);
+    CABAC_BIN(data, 1, "ums");
   }
   if (code_last) {
     data->ctx = &ctx[offset];
-    cabac_encode_bin(data, 0);
+    CABAC_BIN(data, 0, "ums");
+  }
+}
+
+/**
+ * This can be used for Truncated Rice binarization with cRiceParam=0.
+ */
+void cabac_write_unary_max_symbol_ep(cabac_data *data, unsigned symbol, unsigned max_symbol)
+{
+  /*if (symbol == 0) {
+    CABAC_BIN_EP(data, 0, "ums_ep");
+  } else {
+    // Make a bit-string of (symbol) times 1 and a single 0, except when
+    // symbol == max_symbol.
+    unsigned bins = ((1 << symbol) - 1) << (symbol < max_symbol);
+    CABAC_BINS_EP(data, bins, symbol + (symbol < max_symbol), "ums_ep");
+  }*/
+  
+  int8_t code_last = max_symbol > symbol;
+
+  assert(symbol <= max_symbol);
+
+  CABAC_BIN_EP(data, symbol ? 1 : 0, "ums_ep");
+  
+  if (!symbol) return;
+  
+  while (--symbol) {
+    CABAC_BIN_EP(data, 1, "ums_ep");
+  }
+  if (code_last) {
+    CABAC_BIN_EP(data, 0, "ums_ep");
   }
 }
 
diff --git a/src/cabac.h b/src/cabac.h
index fb518814..8bd32610 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -55,6 +55,7 @@ void cabac_write_ep_ex_golomb(cabac_data *data, uint32_t symbol,
 void cabac_write_unary_max_symbol(cabac_data *data, cabac_ctx *ctx, 
                                   uint32_t symbol, int32_t offset, 
                                   uint32_t max_symbol);
+void cabac_write_unary_max_symbol_ep(cabac_data *data, unsigned symbol, unsigned max_symbol);
 
 
 // Macros
diff --git a/src/encoder.c b/src/encoder.c
index 6e598e8f..ba5328fe 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -775,11 +775,8 @@ void encode_slice_header(encoder_control* encoder)
 
 
   // TODO: move somewhere else (sao.h?)
-#define Y_INDEX 0
-#define U_INDEX 1
-#define V_INDEX 2
-#define YUV_INDEX_END 3
-#define NUM_SAO_OFFSETS 4
+#define SAO_ABS_OFFSET_MAX ((1 << (MIN(BIT_DEPTH, 10) - 5)) - 1)
+//#define SAO_ABS_OFFSET_MAX 7
 
 typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2, NUM_COLORS } color_index;
 typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
@@ -792,7 +789,7 @@ typedef struct {
   int ddistortion;
   int merge_left_flag;
   int merge_up_flag;
-  int offsets[NUM_SAO_OFFSETS];
+  int offsets[NUM_SAO_EDGE_CATEGORIES];
 } sao_info;
 
 //#define SIGN3(x) ((x) > 0) ? +1 : ((x) == 0 ? 0 : -1)
@@ -853,16 +850,17 @@ void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
   }
 }
 
-void sao_reconstruct_color(pixel *rec_data, const sao_info *sao, color_index color)
+void sao_reconstruct_color(pixel *rec_data, const sao_info *sao, int block_width)
 {
-  unsigned y, x;
+  int y, x;
   vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
   vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
   // Arrays orig_data and rec_data are quarter size for chroma.
-  unsigned block_width = LCU_WIDTH >> !(color == COLOR_Y);
 
-  for (y = 0; y < block_width; ++y) {
-    for (x = 0; x < block_width; ++x) {
+  // Don't sample the edge pixels because this function doesn't have access to
+  // their neighbours.
+  for (y = 1; y < block_width - 1; ++y) {
+    for (x = 1; x < block_width - 1; ++x) {
       pixel *c_data = &rec_data[y * block_width + x];
       pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
       pixel c = c_data[0];
@@ -886,7 +884,7 @@ void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb,
   // Data to tmp buffer.
   picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
-  sao_reconstruct_color(rec_y, sao_luma, COLOR_Y);
+  sao_reconstruct_color(rec_y, sao_luma, LCU_WIDTH);
   //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
   //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
   
@@ -901,20 +899,19 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
   sao_eo_class edge_class;
   // This array is used to calculate the mean offset used to minimize distortion.
   int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES];
-  memset(cat_sum_cnt, 0, 2 * NUM_SAO_EDGE_CATEGORIES);
+  memset(cat_sum_cnt, 0, sizeof(int) * 2 * NUM_SAO_EDGE_CATEGORIES);
 
-  sao_out->ddistortion = 0;
+  sao_out->ddistortion = INT_MAX;
 
   for (edge_class = SAO_EO0; edge_class <= SAO_EO3; ++edge_class) {
     int edge_offset[NUM_SAO_EDGE_CATEGORIES];
     int sum_ddistortion = 0;
     sao_eo_cat edge_cat;
+    unsigned i = 0;
 
     // Call calc_sao_edge_dir once for luma and twice for chroma.
-    while (buf_cnt--) {
-      calc_sao_edge_dir(data, recdata, edge_class, block_width, cat_sum_cnt);
-      data += buf_size;
-      recdata += buf_size;
+    for (i = 0; i < buf_cnt; ++i) {
+      calc_sao_edge_dir(data + i * buf_size, recdata + i * buf_size, edge_class, block_width, cat_sum_cnt);
     }
     
     for (edge_cat = SAO_EO_CAT1; edge_cat <= SAO_EO_CAT4; ++edge_cat) {
@@ -924,7 +921,11 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
       // The optimum offset can be calculated by getting the minima of the
       // fast ddistortion estimation formula. The minima is the mean error
       // and we round that to the nearest integer.
-      int offset = (cat_sum + (cat_cnt >> 1)) / cat_cnt;
+      int offset = 0;
+      if (cat_cnt != 0) {
+        offset = (cat_sum + (cat_cnt >> 1)) / cat_cnt;
+        offset = CLIP(-SAO_ABS_OFFSET_MAX, SAO_ABS_OFFSET_MAX, offset);
+      }
       edge_offset[edge_cat] = offset;
       // The ddistortion is amount by which the SSE of data changes. It should
       // be negative for all categories, if offset was chosen correctly.
@@ -942,7 +943,7 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
     if (sum_ddistortion < sao_out->ddistortion) {
       sao_out->eo_class = edge_class;
       sao_out->ddistortion = sum_ddistortion;
-      memcpy(sao_out->offsets, edge_offset, NUM_SAO_EDGE_CATEGORIES);
+      memcpy(sao_out->offsets, edge_offset, sizeof(int) * NUM_SAO_EDGE_CATEGORIES);
     }
   }
 }
@@ -966,36 +967,19 @@ sao_info sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb)
   pixel rec_y[LCU_LUMA_SIZE];
   pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
-  sao_info sao_params;
+  
+  sao_info sao;
+  sao.merge_left_flag = 0;
+  sao.merge_up_flag = 0;
+  sao.type = SAO_TYPE_EDGE;
 
   // Fill temporary buffers with picture data.
   picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
   picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
-  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, &sao_params);
+  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, &sao);
 
-  return sao_params;
-}
-
-void encode_sao_offsets(encoder_control *encoder, sao_info *sao)
-{
-  int i;
-
-  for (i = 0; i < NUM_SAO_OFFSETS; ++i) {
-    CABAC_BIN(&cabac, sao->offsets[i] > 0 ? 0 : 1, "sao_offset_sign");
-  }
-
-  if (sao->type == SAO_TYPE_EDGE) {
-    for (i = 0; i < NUM_SAO_OFFSETS; ++i) {
-      if (sao->offsets[i] != 0) {
-        // For edge SAO positive sign is encoded as 0.
-        CABAC_BIN(&cabac, sao->offsets[i] > 0 ? 0 : 1, "sao_offset_sign");
-        // TODO: CABAC_BIN sao_band_position[color_i]
-      } else {
-        // TODO: CABAC_BIN sao_eo_class[color_i]
-      }
-    }
-  }
+  return sao;
 }
 
 void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color_i)
@@ -1009,16 +993,44 @@ void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color
     return;
   }
 
-  cabac.ctx = &g_sao_type_idx_model;
-  if (color_i == COLOR_Y) {
-    CABAC_BIN(&cabac, sao->type, "sao_type_idx_luma");
-  } else if (color_i == COLOR_U) {
-    // SAO type is only coded for the first chroma.
-    CABAC_BIN(&cabac, sao->type, "sao_type_idx_chroma");
+  if (color_i != COLOR_V) {
+    //CABAC_BIN(&cabac, sao->type, "sao_type_idx");
+    // TR cMax=2
+    // HM codes only the first bin with context.
+    //cabac_write_unary_max_symbol(&cabac, &g_sao_type_idx_model, sao->type, 0, 2);
+    cabac.ctx = &g_sao_type_idx_model;
+    CABAC_BIN(&cabac, sao->type == 0 ? 0 : 1, "sao_type_idx");
+    if (sao->type == SAO_TYPE_BAND) {
+      CABAC_BIN_EP(&cabac, 0, "sao_type_idx_ep");
+    } else if (sao->type == SAO_TYPE_EDGE) {
+      CABAC_BIN_EP(&cabac, 1, "sao_type_idx_ep");
+    }
   }
 
   if (sao->type != SAO_TYPE_NONE) {
-    encode_sao_offsets(encoder, 0);
+    sao_eo_cat i;
+  
+    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
+      //CABAC_BIN_EP(&cabac, abs(sao->offsets[i]), "sao_offset_abs");
+      // TR cMax=7 (for 8bit), cRiseParam=0
+      cabac_write_unary_max_symbol_ep(&cabac, abs(sao->offsets[i]), 
+                                      SAO_ABS_OFFSET_MAX);
+    }
+
+    if (sao->type == SAO_TYPE_BAND) {
+      for (i = SAO_EO_CAT1; i < SAO_EO_CAT4; ++i) {
+        // Parahprasing spec: "If offset_sign is equal to 0, offsetSign is set
+        // equal to 1. Otherwise to -1."
+        // follows: >=0 is coded as 0, <0 is coded as 1
+        // FL cMax=1 (1 bit)
+        CABAC_BIN_EP(&cabac, sao->offsets[i] >= 0 ? 0 : 1, "sao_offset_sign");
+      }
+      // TODO: sao_band_position
+      // FL cMax=31 (6 bits)
+    } else if (color_i != COLOR_V) {
+      // FL cMax=3 (2 bits)
+      CABAC_BINS_EP(&cabac, sao->eo_class, 2, "sao_eo_class");
+    }
   }
 }
 
@@ -1080,6 +1092,9 @@ void encode_slice_data(encoder_control* encoder)
         
         // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
         // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+
+        sao_reconstruct(encoder->in.cur_pic, x_ctb, y_ctb, &sao_luma, &sao_chroma);
+
         encode_sao(encoder, x_ctb, y_ctb, &sao_luma, &sao_chroma);
       }
 
diff --git a/src/picture.c b/src/picture.c
index fbfdd694..1df0bca2 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -81,12 +81,15 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
  * This should be inlined, but it's defined here for now to see if Visual
  * Studios LTCG will inline it.
  */
-void picture_blit_pixels(const pixel* orig, pixel *dst,
+void picture_blit_pixels(const pixel *orig, pixel *dst,
                          unsigned width, unsigned height,
                          unsigned orig_stride, unsigned dst_stride)
 {
   unsigned y, x;
 
+  const pixel *borig = orig;
+  const pixel *bdst = dst;
+
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       dst[x] = orig[x];

From 31af7e39293c8de14715b7482f3b0c6cb6ec61f9 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 1 Nov 2013 12:36:34 +0200
Subject: [PATCH 07/25] Fix sao offsets. Reconstruction kind of works.

---
 src/encoder.c | 70 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index ba5328fe..fbbc869a 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -792,6 +792,12 @@ typedef struct {
   int offsets[NUM_SAO_EDGE_CATEGORIES];
 } sao_info;
 
+void init_sao_info(sao_info *sao) {
+  sao->type = SAO_TYPE_NONE;
+  sao->merge_left_flag = 0;
+  sao->merge_up_flag = 0;
+}
+
 //#define SIGN3(x) ((x) > 0) ? +1 : ((x) == 0 ? 0 : -1)
 #define SIGN3(x) (((x) > 0) - ((x) < 0))
 #define NUM_SAO_EDGE_DIRS 4;
@@ -807,10 +813,10 @@ typedef struct {
 // | a c b |   c   |   c   |   c   |
 // |       |   b   |     b | b     |
 static const vector2d g_sao_edge_offsets[4][2] = { 
-  { { 0, -1 }, { 0, 1 } },
   { { -1, 0 }, { 1, 0 } },
+  { { 0, -1 }, { 0, 1 } },
   { { -1, -1 }, { 1, 1 } },
-  { { -1, 1 }, { 1, -1 } }
+  { { 1, -1 }, { -1, 1 } }
 };
 // Mapping of edge_idx values to eo-classes.
 static const unsigned g_sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
@@ -850,7 +856,7 @@ void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
   }
 }
 
-void sao_reconstruct_color(pixel *rec_data, const sao_info *sao, int block_width)
+void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao_info *sao, int block_width)
 {
   int y, x;
   vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
@@ -861,7 +867,8 @@ void sao_reconstruct_color(pixel *rec_data, const sao_info *sao, int block_width
   // their neighbours.
   for (y = 1; y < block_width - 1; ++y) {
     for (x = 1; x < block_width - 1; ++x) {
-      pixel *c_data = &rec_data[y * block_width + x];
+      const pixel *c_data = &rec_data[y * block_width + x];
+      pixel *new_data = &new_rec_data[y * block_width + x];
       pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
       pixel c = c_data[0];
       pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
@@ -869,7 +876,7 @@ void sao_reconstruct_color(pixel *rec_data, const sao_info *sao, int block_width
       int eo_idx = EO_IDX(a, b, c);
       int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
 
-      c_data[0] += sao->offsets[eo_cat];
+      new_data[0] = CLIP(0, (1 << BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat]);
     }
   }
 }
@@ -878,18 +885,19 @@ void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb,
                      const sao_info *sao_luma, const sao_info *sao_chroma)
 {
   pixel rec_y[LCU_LUMA_SIZE];
+  pixel new_rec_y[LCU_LUMA_SIZE];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   // TODO: sao chroma reconstruct
 
   // Data to tmp buffer.
   picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
-  sao_reconstruct_color(rec_y, sao_luma, LCU_WIDTH);
+  sao_reconstruct_color(rec_y, new_rec_y, sao_luma, LCU_WIDTH);
   //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
   //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
   
   // Copy reconstructed block from tmp buffer to rec image.
-  picture_blit_pixels(rec_y, y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
+  picture_blit_pixels(new_rec_y, y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
 }
 
 void sao_search_best_mode(const pixel *data, const pixel *recdata, 
@@ -948,16 +956,12 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
   }
 }
 
-sao_info sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb)
+ void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
 {
-  sao_info sao;
-  sao.merge_left_flag = 0;
-  sao.merge_up_flag = 0;
-  sao.type = SAO_TYPE_NONE;
-  return sao;
+  
 }
 
-sao_info sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb)
+void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
 {
   // These buffers are needed only until we switch to a LCU based data
   // structure for pixels. Then we can give pointers directly to that structure
@@ -968,18 +972,20 @@ sao_info sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb)
   pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   
-  sao_info sao;
-  sao.merge_left_flag = 0;
-  sao.merge_up_flag = 0;
-  sao.type = SAO_TYPE_EDGE;
+  sao->offsets[SAO_EO_CAT0] = 0;
+  sao->offsets[SAO_EO_CAT1] = 7;
+  sao->offsets[SAO_EO_CAT2] = 7;
+  sao->offsets[SAO_EO_CAT3] = -7;
+  sao->offsets[SAO_EO_CAT4] = -7;
+  sao->eo_class = SAO_EO0;
+  sao->type = SAO_TYPE_EDGE;
+  return;
 
   // Fill temporary buffers with picture data.
   picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
   picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
-  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, &sao);
-
-  return sao;
+  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, sao);
 }
 
 void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color_i)
@@ -1087,13 +1093,25 @@ void encode_slice_data(encoder_control* encoder)
       uint8_t depth = 0;
 
       if (encoder->sao_enable) {
-        sao_info sao_luma = sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb);
-        sao_info sao_chroma = sao_search_chroma(encoder->in.cur_pic, x_ctb, y_ctb);
+        sao_info sao_luma;
+        sao_info sao_chroma;
+        init_sao_info(&sao_luma);
+        init_sao_info(&sao_chroma);
         
-        // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-        // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+        // Temporary guards against non-LCU size coding units at the edges,
+        // because they aren't handled yet.
+        if (encoder->in.width_in_lcu * LCU_WIDTH != encoder->in.cur_pic->width
+            && x_ctb == encoder->in.width_in_lcu - 1) {
 
-        sao_reconstruct(encoder->in.cur_pic, x_ctb, y_ctb, &sao_luma, &sao_chroma);
+        } else if (encoder->in.height_in_lcu * LCU_WIDTH != encoder->in.cur_pic->height
+                   && y_ctb == encoder->in.height_in_lcu - 1) {
+
+        } else {
+          sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, &sao_luma);
+          // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+          // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+          sao_reconstruct(encoder->in.cur_pic, x_ctb, y_ctb, &sao_luma, &sao_chroma);
+        }
 
         encode_sao(encoder, x_ctb, y_ctb, &sao_luma, &sao_chroma);
       }

From 03f2967899735310ba3a93446b4daf682c2366a5 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 4 Nov 2013 17:25:09 +0200
Subject: [PATCH 08/25] Block based sao dead end.

This will never work because the adjacent blocks require the original values
of the boundary pixels to make category decisions.
---
 src/encoder.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 113 insertions(+), 10 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index fbbc869a..574f3591 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -856,7 +856,8 @@ void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
   }
 }
 
-void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao_info *sao, int block_width)
+void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao_info *sao, 
+                           int stride, int new_stride, int block_width, int block_height)
 {
   int y, x;
   vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
@@ -865,13 +866,13 @@ void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao
 
   // Don't sample the edge pixels because this function doesn't have access to
   // their neighbours.
-  for (y = 1; y < block_width - 1; ++y) {
+  for (y = 1; y < block_height - 1; ++y) {
     for (x = 1; x < block_width - 1; ++x) {
-      const pixel *c_data = &rec_data[y * block_width + x];
-      pixel *new_data = &new_rec_data[y * block_width + x];
-      pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
+      const pixel *c_data = &rec_data[y * stride + x];
+      pixel *new_data = &new_rec_data[y * new_stride + x];
+      pixel a = c_data[a_ofs.y * stride + a_ofs.x];
       pixel c = c_data[0];
-      pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
+      pixel b = c_data[b_ofs.y * stride + b_ofs.x];
       
       int eo_idx = EO_IDX(a, b, c);
       int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
@@ -881,18 +882,120 @@ void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao
   }
 }
 
+/**
+ * \brief Calculate dimensions of the buffer used by sao reconstruction.
+ *
+ * This function calculates 4 vectors that can be used to make the temporary
+ * buffers required by sao_reconstruct_color.
+ *
+ * Vector block is the area affected by sao. Vectors tr and br are top-left
+ * margin and bottom-right margin, which contain pixels that are not modified
+ * by the reconstruction of this LCU but are needed by the reconstruction.
+ * Vector rec is the coordinate of the area required by sao reconstruction.
+ *
+ * The margins are always either 0 or 1, depending on the direction of the
+ * edge offset class.
+ *
+ * This also takes into account borders of the picture and non-LCU sized
+ * CU's at the bottom and right of the picture.
+ * 
+ * \ rec
+ *  +------+
+ *  |\ tl  |
+ *  | +--+ |
+ *  | |\ block
+ *  | | \| |
+ *  | +--+ |
+ *  |     \ br
+ *  +------+
+ *
+ * \param pic  Picture.
+ * \param sao  Sao parameters.
+ * \param rec  Top-left corner of the LCU, modified to be top-left corner of 
+ */
+void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec, 
+                         vector2d *tl, vector2d *br, vector2d *block)
+{
+  vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
+  vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
+
+  // Handle top and left.
+  if (rec->y == 0) {
+    tl->y = 0;
+    if (a_ofs.y == -1 || b_ofs.y == -1) {
+      block->y -= 1;
+      tl->y += 1;
+    }
+  }
+  if (rec->x == 0) {
+    tl->x = 0;
+    if (a_ofs.x == -1 || b_ofs.x == -1) {
+      block->x -= 1;
+      tl->x += 1;
+    }
+  }
+
+  // Handle right and bottom, taking care of non-LCU sized CUs.
+  if (rec->y + LCU_WIDTH >= pic->height) {
+    br->y = 0;
+    if (rec->y + LCU_WIDTH >= pic->height) {
+      block->y = pic->height - rec->y;
+    }
+    if (a_ofs.y == 1 || b_ofs.y == 1) {
+      block->y -= 1;
+      br->y += 1;
+    }
+  }
+  if (rec->x + LCU_WIDTH >= pic->width) {
+    br->x = 0;
+    if (rec->x + LCU_WIDTH > pic->width) {
+      block->x = pic->width - rec->x;
+    }
+    if (a_ofs.x == 1 || b_ofs.y == 1) {
+      block->x -= 1;
+      br->x += 1;
+    }
+  }
+
+  if (rec->y != 0) {
+    rec->y -= 1;
+  }
+  if (rec->x != 0) {
+    rec->x -= 1;
+  }
+}
+
 void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
                      const sao_info *sao_luma, const sao_info *sao_chroma)
 {
-  pixel rec_y[LCU_LUMA_SIZE];
+  pixel rec_y[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
   pixel new_rec_y[LCU_LUMA_SIZE];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
-  // TODO: sao chroma reconstruct
+
+  int x = x_ctb * LCU_WIDTH, y = y_ctb * LCU_WIDTH;
+  
+  vector2d rec;
+  vector2d tl = { 1, 1 };
+  vector2d br = { 1, 1 };
+  vector2d block = { LCU_WIDTH, LCU_WIDTH };
+
+  rec.x = x_ctb * LCU_WIDTH;
+  rec.y = y_ctb * LCU_WIDTH;
+
+  sao_calc_block_dims(pic, sao_luma, &rec, &tl, &br, &block);
 
   // Data to tmp buffer.
-  picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+  picture_blit_pixels(&pic->y_recdata[rec.y * pic->width + rec.x], rec_y,
+                      tl.x + block.x + br.x,
+                      tl.y + block.y + br.y,
+                      pic->width, LCU_WIDTH + 2);
 
-  sao_reconstruct_color(rec_y, new_rec_y, sao_luma, LCU_WIDTH);
+  picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+
+  sao_reconstruct_color(&rec_y[tl.y * (tl.x + block.x + br.x) + tl.x], 
+                        new_rec_y, sao_luma, 
+                        LCU_WIDTH + 2, LCU_WIDTH,
+                        block.x, block.y);
   //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
   //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
   

From a57b938270acb1be276e280bae4854c528c09e31 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 4 Nov 2013 19:27:47 +0200
Subject: [PATCH 09/25] Add new module sao.

- Move sao-stuff not directly related to encoding to sao-module.
- Calculate sao for all LCUs before encoding any of them. This is in
  preparation for doing the reconstruction one line at a time instead of
  one LCU at a time.
---
 src/encoder.c | 369 +++++---------------------------------------------
 src/global.h  |  10 +-
 src/picture.c |   7 +
 src/picture.h |   5 +
 src/sao.c     | 299 ++++++++++++++++++++++++++++++++++++++++
 src/sao.h     |  53 ++++++++
 src/search.c  |   5 -
 7 files changed, 406 insertions(+), 342 deletions(-)
 create mode 100644 src/sao.c
 create mode 100644 src/sao.h

diff --git a/src/encoder.c b/src/encoder.c
index 574f3591..2e08c6dc 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -27,6 +27,7 @@
 #include "inter.h"
 #include "filter.h"
 #include "search.h"
+#include "sao.h"
 
 int16_t g_lambda_cost[55];
 uint32_t* g_sig_last_scan[3][7];
@@ -774,323 +775,6 @@ void encode_slice_header(encoder_control* encoder)
 }
 
 
-  // TODO: move somewhere else (sao.h?)
-#define SAO_ABS_OFFSET_MAX ((1 << (MIN(BIT_DEPTH, 10) - 5)) - 1)
-//#define SAO_ABS_OFFSET_MAX 7
-
-typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2, NUM_COLORS } color_index;
-typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
-typedef enum { SAO_EO0 = 0, SAO_EO1, SAO_EO2, SAO_EO3, SAO_NUM_EO } sao_eo_class;
-typedef enum { SAO_EO_CAT0 = 0, SAO_EO_CAT1, SAO_EO_CAT2, SAO_EO_CAT3, SAO_EO_CAT4, NUM_SAO_EDGE_CATEGORIES } sao_eo_cat;
-
-typedef struct {
-  sao_type type;
-  sao_eo_class eo_class;
-  int ddistortion;
-  int merge_left_flag;
-  int merge_up_flag;
-  int offsets[NUM_SAO_EDGE_CATEGORIES];
-} sao_info;
-
-void init_sao_info(sao_info *sao) {
-  sao->type = SAO_TYPE_NONE;
-  sao->merge_left_flag = 0;
-  sao->merge_up_flag = 0;
-}
-
-//#define SIGN3(x) ((x) > 0) ? +1 : ((x) == 0 ? 0 : -1)
-#define SIGN3(x) (((x) > 0) - ((x) < 0))
-#define NUM_SAO_EDGE_DIRS 4;
-
-typedef struct {
-  int x;
-  int y;
-} vector2d;
-
-// Offsets of a and b in relation to c.
-// dir_offset[dir][a or b]
-// |       |   a   | a     |     a |
-// | a c b |   c   |   c   |   c   |
-// |       |   b   |     b | b     |
-static const vector2d g_sao_edge_offsets[4][2] = { 
-  { { -1, 0 }, { 1, 0 } },
-  { { 0, -1 }, { 0, 1 } },
-  { { -1, -1 }, { 1, 1 } },
-  { { 1, -1 }, { -1, 1 } }
-};
-// Mapping of edge_idx values to eo-classes.
-static const unsigned g_sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
-// Mapping relationships between a, b and c to eo_idx.
-#define EO_IDX(a, b, c) (2 + SIGN3((c) - (a)) + SIGN3((c) - (b)))
-
-/**
- * \param orig_data  Original pixel data. 64x64 for luma, 32x32 for chroma.
- * \param rec_data  Reconstructed pixel data. 64x64 for luma, 32x32 for chroma.
- * \param dir_offsets
- * \param is_chroma  0 for luma, 1 for chroma. Indicates 
- */
-void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
-                       int eo_class, int block_width,
-                       int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
-{
-  int y, x;
-  vector2d a_ofs = g_sao_edge_offsets[eo_class][0];
-  vector2d b_ofs = g_sao_edge_offsets[eo_class][1];
-  // Arrays orig_data and rec_data are quarter size for chroma.
-
-  // Don't sample the edge pixels because this function doesn't have access to
-  // their neighbours.
-  for (y = 1; y < block_width - 1; ++y) {
-    for (x = 1; x < block_width - 1; ++x) {
-      const pixel *c_data = &rec_data[y * block_width + x];
-      pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
-      pixel c = c_data[0];
-      pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
-      
-      int eo_idx = EO_IDX(a, b, c);
-      int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
-
-      cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c;
-      cat_sum_cnt[1][eo_cat] += 1;
-    }
-  }
-}
-
-void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao_info *sao, 
-                           int stride, int new_stride, int block_width, int block_height)
-{
-  int y, x;
-  vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
-  vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
-  // Arrays orig_data and rec_data are quarter size for chroma.
-
-  // Don't sample the edge pixels because this function doesn't have access to
-  // their neighbours.
-  for (y = 1; y < block_height - 1; ++y) {
-    for (x = 1; x < block_width - 1; ++x) {
-      const pixel *c_data = &rec_data[y * stride + x];
-      pixel *new_data = &new_rec_data[y * new_stride + x];
-      pixel a = c_data[a_ofs.y * stride + a_ofs.x];
-      pixel c = c_data[0];
-      pixel b = c_data[b_ofs.y * stride + b_ofs.x];
-      
-      int eo_idx = EO_IDX(a, b, c);
-      int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
-
-      new_data[0] = CLIP(0, (1 << BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat]);
-    }
-  }
-}
-
-/**
- * \brief Calculate dimensions of the buffer used by sao reconstruction.
- *
- * This function calculates 4 vectors that can be used to make the temporary
- * buffers required by sao_reconstruct_color.
- *
- * Vector block is the area affected by sao. Vectors tr and br are top-left
- * margin and bottom-right margin, which contain pixels that are not modified
- * by the reconstruction of this LCU but are needed by the reconstruction.
- * Vector rec is the coordinate of the area required by sao reconstruction.
- *
- * The margins are always either 0 or 1, depending on the direction of the
- * edge offset class.
- *
- * This also takes into account borders of the picture and non-LCU sized
- * CU's at the bottom and right of the picture.
- * 
- * \ rec
- *  +------+
- *  |\ tl  |
- *  | +--+ |
- *  | |\ block
- *  | | \| |
- *  | +--+ |
- *  |     \ br
- *  +------+
- *
- * \param pic  Picture.
- * \param sao  Sao parameters.
- * \param rec  Top-left corner of the LCU, modified to be top-left corner of 
- */
-void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec, 
-                         vector2d *tl, vector2d *br, vector2d *block)
-{
-  vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
-  vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
-
-  // Handle top and left.
-  if (rec->y == 0) {
-    tl->y = 0;
-    if (a_ofs.y == -1 || b_ofs.y == -1) {
-      block->y -= 1;
-      tl->y += 1;
-    }
-  }
-  if (rec->x == 0) {
-    tl->x = 0;
-    if (a_ofs.x == -1 || b_ofs.x == -1) {
-      block->x -= 1;
-      tl->x += 1;
-    }
-  }
-
-  // Handle right and bottom, taking care of non-LCU sized CUs.
-  if (rec->y + LCU_WIDTH >= pic->height) {
-    br->y = 0;
-    if (rec->y + LCU_WIDTH >= pic->height) {
-      block->y = pic->height - rec->y;
-    }
-    if (a_ofs.y == 1 || b_ofs.y == 1) {
-      block->y -= 1;
-      br->y += 1;
-    }
-  }
-  if (rec->x + LCU_WIDTH >= pic->width) {
-    br->x = 0;
-    if (rec->x + LCU_WIDTH > pic->width) {
-      block->x = pic->width - rec->x;
-    }
-    if (a_ofs.x == 1 || b_ofs.y == 1) {
-      block->x -= 1;
-      br->x += 1;
-    }
-  }
-
-  if (rec->y != 0) {
-    rec->y -= 1;
-  }
-  if (rec->x != 0) {
-    rec->x -= 1;
-  }
-}
-
-void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
-                     const sao_info *sao_luma, const sao_info *sao_chroma)
-{
-  pixel rec_y[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
-  pixel new_rec_y[LCU_LUMA_SIZE];
-  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
-
-  int x = x_ctb * LCU_WIDTH, y = y_ctb * LCU_WIDTH;
-  
-  vector2d rec;
-  vector2d tl = { 1, 1 };
-  vector2d br = { 1, 1 };
-  vector2d block = { LCU_WIDTH, LCU_WIDTH };
-
-  rec.x = x_ctb * LCU_WIDTH;
-  rec.y = y_ctb * LCU_WIDTH;
-
-  sao_calc_block_dims(pic, sao_luma, &rec, &tl, &br, &block);
-
-  // Data to tmp buffer.
-  picture_blit_pixels(&pic->y_recdata[rec.y * pic->width + rec.x], rec_y,
-                      tl.x + block.x + br.x,
-                      tl.y + block.y + br.y,
-                      pic->width, LCU_WIDTH + 2);
-
-  picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
-
-  sao_reconstruct_color(&rec_y[tl.y * (tl.x + block.x + br.x) + tl.x], 
-                        new_rec_y, sao_luma, 
-                        LCU_WIDTH + 2, LCU_WIDTH,
-                        block.x, block.y);
-  //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
-  //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
-  
-  // Copy reconstructed block from tmp buffer to rec image.
-  picture_blit_pixels(new_rec_y, y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
-}
-
-void sao_search_best_mode(const pixel *data, const pixel *recdata, 
-                          unsigned block_width, unsigned buf_size, unsigned buf_cnt,
-                          sao_info *sao_out)
-{
-  sao_eo_class edge_class;
-  // This array is used to calculate the mean offset used to minimize distortion.
-  int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES];
-  memset(cat_sum_cnt, 0, sizeof(int) * 2 * NUM_SAO_EDGE_CATEGORIES);
-
-  sao_out->ddistortion = INT_MAX;
-
-  for (edge_class = SAO_EO0; edge_class <= SAO_EO3; ++edge_class) {
-    int edge_offset[NUM_SAO_EDGE_CATEGORIES];
-    int sum_ddistortion = 0;
-    sao_eo_cat edge_cat;
-    unsigned i = 0;
-
-    // Call calc_sao_edge_dir once for luma and twice for chroma.
-    for (i = 0; i < buf_cnt; ++i) {
-      calc_sao_edge_dir(data + i * buf_size, recdata + i * buf_size, edge_class, block_width, cat_sum_cnt);
-    }
-    
-    for (edge_cat = SAO_EO_CAT1; edge_cat <= SAO_EO_CAT4; ++edge_cat) {
-      int cat_sum = cat_sum_cnt[0][edge_cat];
-      int cat_cnt = cat_sum_cnt[1][edge_cat];
-      
-      // The optimum offset can be calculated by getting the minima of the
-      // fast ddistortion estimation formula. The minima is the mean error
-      // and we round that to the nearest integer.
-      int offset = 0;
-      if (cat_cnt != 0) {
-        offset = (cat_sum + (cat_cnt >> 1)) / cat_cnt;
-        offset = CLIP(-SAO_ABS_OFFSET_MAX, SAO_ABS_OFFSET_MAX, offset);
-      }
-      edge_offset[edge_cat] = offset;
-      // The ddistortion is amount by which the SSE of data changes. It should
-      // be negative for all categories, if offset was chosen correctly.
-      // ddistortion = N * h^2 - 2 * h * E, where N is the number of samples 
-      // and E is the sum of errors.
-      // It basically says that all pixels that are not improved by offset
-      // increase increase SSE by h^2 and all pixels that are improved by
-      // offset decrease SSE by h*E.
-      sum_ddistortion += cat_cnt * offset * offset - 2 * offset * cat_sum;
-    }
-    // SAO is not applied for category 0.
-    edge_offset[SAO_EO_CAT0] = 0;
-
-    // Choose the offset class that offers the least error after offset.
-    if (sum_ddistortion < sao_out->ddistortion) {
-      sao_out->eo_class = edge_class;
-      sao_out->ddistortion = sum_ddistortion;
-      memcpy(sao_out->offsets, edge_offset, sizeof(int) * NUM_SAO_EDGE_CATEGORIES);
-    }
-  }
-}
-
- void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
-{
-  
-}
-
-void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
-{
-  // These buffers are needed only until we switch to a LCU based data
-  // structure for pixels. Then we can give pointers directly to that structure
-  // without making copies.
-  // It's 2-dimensional because sao_search_best_mode takes arguments as arrays.
-  pixel orig_y[LCU_LUMA_SIZE];
-  pixel rec_y[LCU_LUMA_SIZE];
-  pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
-  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
-  
-  sao->offsets[SAO_EO_CAT0] = 0;
-  sao->offsets[SAO_EO_CAT1] = 7;
-  sao->offsets[SAO_EO_CAT2] = 7;
-  sao->offsets[SAO_EO_CAT3] = -7;
-  sao->offsets[SAO_EO_CAT4] = -7;
-  sao->eo_class = SAO_EO0;
-  sao->type = SAO_TYPE_EDGE;
-  return;
-
-  // Fill temporary buffers with picture data.
-  picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
-  picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
-
-  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, sao);
-}
-
 void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color_i)
 {
   picture *pic = encoder->in.cur_pic;
@@ -1185,6 +869,34 @@ void encode_slice_data(encoder_control* encoder)
 {
   uint16_t x_ctb, y_ctb;
   
+  if (encoder->sao_enable) {
+    for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
+      for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
+        picture *pic = encoder->in.cur_pic;
+        unsigned stride = encoder->in.height_in_lcu;
+        sao_info *sao_luma = &pic->sao_luma[y_ctb * stride + x_ctb];
+        sao_info *sao_chroma = &pic->sao_chroma[y_ctb * stride + x_ctb];
+        init_sao_info(sao_luma);
+        init_sao_info(sao_chroma);
+
+        // Temporary guards against non-LCU size coding units at the edges,
+        // because they aren't handled yet.
+        if (encoder->in.width_in_lcu * LCU_WIDTH != encoder->in.cur_pic->width
+            && x_ctb == encoder->in.width_in_lcu - 1) {
+
+        } else if (encoder->in.height_in_lcu * LCU_WIDTH != encoder->in.cur_pic->height
+                   && y_ctb == encoder->in.height_in_lcu - 1) {
+
+        } else {
+          sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma);
+          // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+          // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+          sao_reconstruct(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma, sao_chroma);
+        }
+      }
+    }
+  }
+
   init_contexts(encoder,encoder->in.cur_pic->slicetype);
 
   // Loop through every LCU in the slice
@@ -1196,25 +908,10 @@ void encode_slice_data(encoder_control* encoder)
       uint8_t depth = 0;
 
       if (encoder->sao_enable) {
-        sao_info sao_luma;
-        sao_info sao_chroma;
-        init_sao_info(&sao_luma);
-        init_sao_info(&sao_chroma);
-        
-        // Temporary guards against non-LCU size coding units at the edges,
-        // because they aren't handled yet.
-        if (encoder->in.width_in_lcu * LCU_WIDTH != encoder->in.cur_pic->width
-            && x_ctb == encoder->in.width_in_lcu - 1) {
-
-        } else if (encoder->in.height_in_lcu * LCU_WIDTH != encoder->in.cur_pic->height
-                   && y_ctb == encoder->in.height_in_lcu - 1) {
-
-        } else {
-          sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, &sao_luma);
-          // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-          // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-          sao_reconstruct(encoder->in.cur_pic, x_ctb, y_ctb, &sao_luma, &sao_chroma);
-        }
+        picture *pic = encoder->in.cur_pic;
+        unsigned stride = encoder->in.height_in_lcu;
+        sao_info sao_luma = pic->sao_luma[y_ctb * stride + x_ctb];
+        sao_info sao_chroma = pic->sao_chroma[y_ctb * stride + x_ctb];
 
         encode_sao(encoder, x_ctb, y_ctb, &sao_luma, &sao_chroma);
       }
diff --git a/src/global.h b/src/global.h
index 2efe1b17..868b1cdb 100644
--- a/src/global.h
+++ b/src/global.h
@@ -87,13 +87,16 @@ typedef int16_t coefficient;
 // CU_TO_PIXEL = y * lcu_width * pic_width + x * lcu_width
 #define CU_TO_PIXEL(x, y, depth, width) (((y) << (LOG2_LCU_WIDTH - (depth))) * (width) \
                                          + ((x) << (LOG2_LCU_WIDTH - (depth))))
-
+//#define SIGN3(x) ((x) > 0) ? +1 : ((x) == 0 ? 0 : -1)
+#define SIGN3(x) (((x) > 0) - ((x) < 0))
 
 #define VERSION_STRING "0.2               "
 #define VERSION 0.2
 
 //#define VERBOSE 1
 
+#define SAO_ABS_OFFSET_MAX ((1 << (MIN(BIT_DEPTH, 10) - 5)) - 1)
+
 
 #define SIZE_2Nx2N 0
 #define SIZE_2NxN  1
@@ -125,4 +128,9 @@ typedef int16_t coefficient;
 #define FREE_POINTER(pointer) { free(pointer); pointer = NULL; }
 #define MOVE_POINTER(dst_pointer,src_pointer) { dst_pointer = src_pointer; src_pointer = NULL; }
 
+typedef struct {
+  int x;
+  int y;
+} vector2d;
+
 #endif
\ No newline at end of file
diff --git a/src/picture.c b/src/picture.c
index 1df0bca2..d0efd4e1 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -16,6 +16,8 @@
 #include <stdlib.h>
 #include <math.h>
 
+#include "sao.h"
+
 
 #define PSNRMAX (255.0 * 255.0)
 
@@ -306,6 +308,8 @@ picture *picture_init(int32_t width, int32_t height,
 
   pic->slice_sao_luma_flag = 1;
   pic->slice_sao_chroma_flag = 1;
+  pic->sao_luma = MALLOC(sao_info, width_in_lcu * height_in_lcu);
+  pic->sao_chroma = MALLOC(sao_info, width_in_lcu * height_in_lcu);
 
   return pic;
 }
@@ -346,6 +350,9 @@ int picture_destroy(picture *pic)
   FREE_POINTER(pic->pred_u);
   FREE_POINTER(pic->pred_v);
 
+  FREE_POINTER(pic->sao_luma);
+  FREE_POINTER(pic->sao_chroma);
+
   return 1;
 }
 
diff --git a/src/picture.h b/src/picture.h
index 619906a1..ea11cae4 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -14,6 +14,9 @@
 
 #include "global.h"
 
+//#include "sao.h"
+struct sao_info_struct;
+
 
 //////////////////////////////////////////////////////////////////////////
 // CONSTANTS
@@ -105,6 +108,8 @@ typedef struct
   uint8_t slicetype;
   uint8_t slice_sao_luma_flag;
   uint8_t slice_sao_chroma_flag;
+  struct sao_info_struct *sao_luma;   //!< \brief Array of sao parameters for every LCU.
+  struct sao_info_struct *sao_chroma;   //!< \brief Array of sao parameters for every LCU.
 } picture;
 
 /**
diff --git a/src/sao.c b/src/sao.c
new file mode 100644
index 00000000..31e45876
--- /dev/null
+++ b/src/sao.c
@@ -0,0 +1,299 @@
+/**
+ * \file
+ * 
+ * \author Marko Viitanen ( fador@iki.fi ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ * \author Ari Koivula ( ari@koivu.la ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ */
+
+#include "sao.h"
+
+#include <string.h>
+
+#include "picture.h"
+
+
+
+void init_sao_info(sao_info *sao) {
+  sao->type = SAO_TYPE_NONE;
+  sao->merge_left_flag = 0;
+  sao->merge_up_flag = 0;
+}
+
+// Mapping of eo_idx values to edge offset categories.
+static const unsigned g_sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
+// Mapping relationships between a, b and c to eo_idx.
+#define EO_IDX(a, b, c) (2 + SIGN3((c) - (a)) + SIGN3((c) - (b)))
+
+/**
+ * \param orig_data  Original pixel data. 64x64 for luma, 32x32 for chroma.
+ * \param rec_data  Reconstructed pixel data. 64x64 for luma, 32x32 for chroma.
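+ * \param eo_class  Edge offset class; selects the neighbour pair (a, b) compared against c.
+ * \param cat_sum_cnt  Per-category accumulators: [0] sums of (orig - rec), [1] sample counts.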
+ */
+void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
+                       int eo_class, int block_width,
+                       int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
+{
+  int y, x;
+  vector2d a_ofs = g_sao_edge_offsets[eo_class][0];
+  vector2d b_ofs = g_sao_edge_offsets[eo_class][1];
+  // Arrays orig_data and rec_data are quarter size for chroma.
+
+  // Don't sample the edge pixels because this function doesn't have access to
+  // their neighbours.
+  for (y = 1; y < block_width - 1; ++y) {
+    for (x = 1; x < block_width - 1; ++x) {
+      const pixel *c_data = &rec_data[y * block_width + x];
+      pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
+      pixel c = c_data[0];
+      pixel b = c_data[b_ofs.y * block_width + b_ofs.x];
+      
+      int eo_idx = EO_IDX(a, b, c);
+      int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
+
+      cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c;
+      cat_sum_cnt[1][eo_cat] += 1;
+    }
+  }
+}
+
+void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao_info *sao, 
+                           int stride, int new_stride, int block_width, int block_height)
+{
+  int y, x;
+  vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
+  vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
+  // Arrays orig_data and rec_data are quarter size for chroma.
+
+  // Don't sample the edge pixels because this function doesn't have access to
+  // their neighbours.
+  for (y = 1; y < block_height - 1; ++y) {
+    for (x = 1; x < block_width - 1; ++x) {
+      const pixel *c_data = &rec_data[y * stride + x];
+      pixel *new_data = &new_rec_data[y * new_stride + x];
+      pixel a = c_data[a_ofs.y * stride + a_ofs.x];
+      pixel c = c_data[0];
+      pixel b = c_data[b_ofs.y * stride + b_ofs.x];
+      
+      int eo_idx = EO_IDX(a, b, c);
+      int eo_cat = g_sao_eo_idx_to_eo_category[eo_idx];
+
+      new_data[0] = CLIP(0, (1 << BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat]);
+    }
+  }
+}
+
+/**
+ * \brief Calculate dimensions of the buffer used by sao reconstruction.
+ *
+ * This function calculates 4 vectors that can be used to make the temporary
+ * buffers required by sao_reconstruct_color.
+ *
+ * Vector block is the area affected by sao. Vectors tl and br are top-left
+ * margin and bottom-right margin, which contain pixels that are not modified
+ * by the reconstruction of this LCU but are needed by the reconstruction.
+ * Vector rec is the coordinate of the area required by sao reconstruction.
+ *
+ * The margins are always either 0 or 1, depending on the direction of the
+ * edge offset class.
+ *
+ * This also takes into account borders of the picture and non-LCU sized
+ * CU's at the bottom and right of the picture.
+ * 
+ * \ rec
+ *  +------+
+ *  |\ tl  |
+ *  | +--+ |
+ *  | |\ block
+ *  | | \| |
+ *  | +--+ |
+ *  |     \ br
+ *  +------+
+ *
+ * \param pic  Picture.
+ * \param sao  Sao parameters.
+ * \param rec  Top-left corner of the LCU, modified to be top-left corner of the area required by sao reconstruction.
+ */
+void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec, 
+                         vector2d *tl, vector2d *br, vector2d *block)
+{
+  vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
+  vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
+
+  // Handle top and left.
+  if (rec->y == 0) {
+    tl->y = 0;
+    if (a_ofs.y == -1 || b_ofs.y == -1) {
+      block->y -= 1;
+      tl->y += 1;
+    }
+  }
+  if (rec->x == 0) {
+    tl->x = 0;
+    if (a_ofs.x == -1 || b_ofs.x == -1) {
+      block->x -= 1;
+      tl->x += 1;
+    }
+  }
+
+  // Handle right and bottom, taking care of non-LCU sized CUs.
+  if (rec->y + LCU_WIDTH >= pic->height) {
+    br->y = 0;
+    if (rec->y + LCU_WIDTH >= pic->height) {
+      block->y = pic->height - rec->y;
+    }
+    if (a_ofs.y == 1 || b_ofs.y == 1) {
+      block->y -= 1;
+      br->y += 1;
+    }
+  }
+  if (rec->x + LCU_WIDTH >= pic->width) {
+    br->x = 0;
+    if (rec->x + LCU_WIDTH > pic->width) {
+      block->x = pic->width - rec->x;
+    }
+    if (a_ofs.x == 1 || b_ofs.y == 1) {
+      block->x -= 1;
+      br->x += 1;
+    }
+  }
+
+  if (rec->y != 0) {
+    rec->y -= 1;
+  }
+  if (rec->x != 0) {
+    rec->x -= 1;
+  }
+}
+
+void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
+                     const sao_info *sao_luma, const sao_info *sao_chroma)
+{
+  pixel rec_y[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
+  pixel new_rec_y[LCU_LUMA_SIZE];
+  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+
+  int x = x_ctb * LCU_WIDTH, y = y_ctb * LCU_WIDTH;
+  
+  vector2d rec;
+  vector2d tl = { 1, 1 };
+  vector2d br = { 1, 1 };
+  vector2d block = { LCU_WIDTH, LCU_WIDTH };
+
+  rec.x = x_ctb * LCU_WIDTH;
+  rec.y = y_ctb * LCU_WIDTH;
+
+  sao_calc_block_dims(pic, sao_luma, &rec, &tl, &br, &block);
+
+  // Data to tmp buffer.
+  picture_blit_pixels(&pic->y_recdata[rec.y * pic->width + rec.x], rec_y,
+                      tl.x + block.x + br.x,
+                      tl.y + block.y + br.y,
+                      pic->width, LCU_WIDTH + 2);
+
+  picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+
+  sao_reconstruct_color(&rec_y[tl.y * (tl.x + block.x + br.x) + tl.x], 
+                        new_rec_y, sao_luma, 
+                        LCU_WIDTH + 2, LCU_WIDTH,
+                        block.x, block.y);
+  //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
+  //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
+  
+  // Copy reconstructed block from tmp buffer to rec image.
+  picture_blit_pixels(new_rec_y, y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
+}
+
+
+
+void sao_search_best_mode(const pixel *data, const pixel *recdata, 
+                          unsigned block_width, unsigned buf_size, unsigned buf_cnt,
+                          sao_info *sao_out)
+{
+  sao_eo_class edge_class;
+  // This array is used to calculate the mean offset used to minimize distortion.
+  int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES];
+  memset(cat_sum_cnt, 0, sizeof(int) * 2 * NUM_SAO_EDGE_CATEGORIES);
+
+  sao_out->ddistortion = INT_MAX;
+
+  for (edge_class = SAO_EO0; edge_class <= SAO_EO3; ++edge_class) {
+    int edge_offset[NUM_SAO_EDGE_CATEGORIES];
+    int sum_ddistortion = 0;
+    sao_eo_cat edge_cat;
+    unsigned i = 0;
+
+    // Call calc_sao_edge_dir once for luma and twice for chroma.
+    for (i = 0; i < buf_cnt; ++i) {
+      calc_sao_edge_dir(data + i * buf_size, recdata + i * buf_size, edge_class, block_width, cat_sum_cnt);
+    }
+    
+    for (edge_cat = SAO_EO_CAT1; edge_cat <= SAO_EO_CAT4; ++edge_cat) {
+      int cat_sum = cat_sum_cnt[0][edge_cat];
+      int cat_cnt = cat_sum_cnt[1][edge_cat];
+      
+      // The optimum offset can be calculated by finding the minimum of the
+      // fast ddistortion estimation formula. The minimum is at the mean error
+      // and we round that to the nearest integer.
+      int offset = 0;
+      if (cat_cnt != 0) {
+        offset = (cat_sum + (cat_cnt >> 1)) / cat_cnt;
+        offset = CLIP(-SAO_ABS_OFFSET_MAX, SAO_ABS_OFFSET_MAX, offset);
+      }
+      edge_offset[edge_cat] = offset;
+      // The ddistortion is amount by which the SSE of data changes. It should
+      // be negative for all categories, if offset was chosen correctly.
+      // ddistortion = N * h^2 - 2 * h * E, where N is the number of samples 
+      // and E is the sum of errors.
+      // It basically says that all pixels that are not improved by offset
+      // increase SSE by h^2 and all pixels that are improved by
+      // offset decrease SSE by h*E.
+      sum_ddistortion += cat_cnt * offset * offset - 2 * offset * cat_sum;
+    }
+    // SAO is not applied for category 0.
+    edge_offset[SAO_EO_CAT0] = 0;
+
+    // Choose the offset class that offers the least error after offset.
+    if (sum_ddistortion < sao_out->ddistortion) {
+      sao_out->eo_class = edge_class;
+      sao_out->ddistortion = sum_ddistortion;
+      memcpy(sao_out->offsets, edge_offset, sizeof(int) * NUM_SAO_EDGE_CATEGORIES);
+    }
+  }
+}
+
+ void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
+{
+  
+}
+
+void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
+{
+  // These buffers are needed only until we switch to a LCU based data
+  // structure for pixels. Then we can give pointers directly to that structure
+  // without making copies.
+  // It's 2-dimensional because sao_search_best_mode takes arguments as arrays.
+  pixel orig_y[LCU_LUMA_SIZE];
+  pixel rec_y[LCU_LUMA_SIZE];
+  pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  
+  sao->offsets[SAO_EO_CAT0] = 0;
+  sao->offsets[SAO_EO_CAT1] = 7;
+  sao->offsets[SAO_EO_CAT2] = 7;
+  sao->offsets[SAO_EO_CAT3] = -7;
+  sao->offsets[SAO_EO_CAT4] = -7;
+  sao->eo_class = SAO_EO0;
+  sao->type = SAO_TYPE_EDGE;
+  return;
+
+  // Fill temporary buffers with picture data.
+  picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+  picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+
+  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, sao);
+}
diff --git a/src/sao.h b/src/sao.h
new file mode 100644
index 00000000..8fabd2ae
--- /dev/null
+++ b/src/sao.h
@@ -0,0 +1,53 @@
+#ifndef SAO_H_
+#define SAO_H_
+/**
+ * \file
+ * \brief Sample Adaptive Offset (SAO) related functions.
+ * 
+ * \author Marko Viitanen ( fador@iki.fi ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ * \author Ari Koivula ( ari@koivu.la ), 
+ *         Tampere University of Technology,
+ *         Department of Pervasive Computing.
+ */
+
+#include "global.h"
+#include "picture.h"
+
+
+typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2, NUM_COLORS } color_index;
+typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
+typedef enum { SAO_EO0 = 0, SAO_EO1, SAO_EO2, SAO_EO3, SAO_NUM_EO } sao_eo_class;
+typedef enum { SAO_EO_CAT0 = 0, SAO_EO_CAT1, SAO_EO_CAT2, SAO_EO_CAT3, SAO_EO_CAT4, NUM_SAO_EDGE_CATEGORIES } sao_eo_cat;
+
+// Offsets of a and b in relation to c.
+// g_sao_edge_offsets[eo_class][a or b]
+// |       |   a   | a     |     a |
+// | a c b |   c   |   c   |   c   |
+// |       |   b   |     b | b     |
+static const vector2d g_sao_edge_offsets[SAO_NUM_EO][2] = { 
+  { { -1, 0 }, { 1, 0 } },
+  { { 0, -1 }, { 0, 1 } },
+  { { -1, -1 }, { 1, 1 } },
+  { { 1, -1 }, { -1, 1 } }
+};
+
+
+typedef struct sao_info_struct {
+  sao_type type;
+  sao_eo_class eo_class;
+  int ddistortion;
+  int merge_left_flag;
+  int merge_up_flag;
+  int offsets[NUM_SAO_EDGE_CATEGORIES];
+} sao_info;
+
+
+void init_sao_info(sao_info *sao);
+void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao);
+void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao);
+void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
+                     const sao_info *sao_luma, const sao_info *sao_chroma);
+
+#endif
\ No newline at end of file
diff --git a/src/search.c b/src/search.c
index eb7ac666..87f3d6b8 100644
--- a/src/search.c
+++ b/src/search.c
@@ -35,11 +35,6 @@
   && (x) + (block_width) <= (width) \
   && (y) + (block_height) <= (height))
 
-typedef struct {
-  int x;
-  int y;
-} vector2d;
-
 /** 
  * This is used in the hexagon_search to select 3 points to search.
  * 

From f21df00386cd473756b8f89fbddf23ff21be7a82 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 4 Nov 2013 20:33:29 +0200
Subject: [PATCH 10/25] Save a copy of luma for sao.

---
 src/encoder.c | 8 ++++++--
 src/sao.c     | 5 +++--
 src/sao.h     | 2 +-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 2e08c6dc..bbe6b8b7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -868,11 +868,12 @@ void encode_sao(encoder_control *encoder, unsigned x_lcu, uint16_t y_lcu,
 void encode_slice_data(encoder_control* encoder)
 {
   uint16_t x_ctb, y_ctb;
+  picture *pic = encoder->in.cur_pic;
+  pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
   
   if (encoder->sao_enable) {
     for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
       for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
-        picture *pic = encoder->in.cur_pic;
         unsigned stride = encoder->in.height_in_lcu;
         sao_info *sao_luma = &pic->sao_luma[y_ctb * stride + x_ctb];
         sao_info *sao_chroma = &pic->sao_chroma[y_ctb * stride + x_ctb];
@@ -891,12 +892,15 @@ void encode_slice_data(encoder_control* encoder)
           sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma);
           // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
           // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-          sao_reconstruct(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma, sao_chroma);
+          sao_reconstruct(encoder->in.cur_pic, new_y_data, x_ctb, y_ctb, sao_luma, sao_chroma);
         }
       }
     }
   }
 
+  memcpy(pic->y_recdata, new_y_data, sizeof(pixel) * pic->width * pic->height);
+  free(new_y_data);
+
   init_contexts(encoder,encoder->in.cur_pic->slicetype);
 
   // Loop through every LCU in the slice
diff --git a/src/sao.c b/src/sao.c
index 31e45876..952fb03b 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -170,12 +170,13 @@ void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec,
   }
 }
 
-void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
+void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y_ctb, 
                      const sao_info *sao_luma, const sao_info *sao_chroma)
 {
   pixel rec_y[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
   pixel new_rec_y[LCU_LUMA_SIZE];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  pixel *new_y_recdata = &new_y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
 
   int x = x_ctb * LCU_WIDTH, y = y_ctb * LCU_WIDTH;
   
@@ -205,7 +206,7 @@ void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb,
   //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
   
   // Copy reconstructed block from tmp buffer to rec image.
-  picture_blit_pixels(new_rec_y, y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
+  picture_blit_pixels(new_rec_y, new_y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
 }
 
 
diff --git a/src/sao.h b/src/sao.h
index 8fabd2ae..e32010fe 100644
--- a/src/sao.h
+++ b/src/sao.h
@@ -47,7 +47,7 @@ typedef struct sao_info_struct {
 void init_sao_info(sao_info *sao);
 void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao);
 void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao);
-void sao_reconstruct(picture *pic, unsigned x_ctb, unsigned y_ctb, 
+void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y_ctb, 
                      const sao_info *sao_luma, const sao_info *sao_chroma);
 
 #endif
\ No newline at end of file

From 5f3ee9e0964c1e05595064b603dcae941c6d4c6a Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 4 Nov 2013 20:35:48 +0200
Subject: [PATCH 11/25] Sao vcxproj files.

---
 build/VS2010/HEVC_encoder.vcxproj         | 6 ++----
 build/VS2010/HEVC_encoder.vcxproj.filters | 6 ++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/build/VS2010/HEVC_encoder.vcxproj b/build/VS2010/HEVC_encoder.vcxproj
index 4aa42f54..f9cef408 100644
--- a/build/VS2010/HEVC_encoder.vcxproj
+++ b/build/VS2010/HEVC_encoder.vcxproj
@@ -23,9 +23,7 @@
     <Keyword>Win32Proj</Keyword>
     <RootNamespace>HEVC_encoder</RootNamespace>
   </PropertyGroup>
-
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
@@ -42,9 +40,7 @@
     <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
   </PropertyGroup>
-
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-
   <ImportGroup Label="ExtensionSettings">
     <Import Project="..\..\yasm\vsyasm.props" />
   </ImportGroup>
@@ -88,6 +84,7 @@
     <ClCompile Include="..\..\src\intra.c" />
     <ClCompile Include="..\..\src\nal.c" />
     <ClCompile Include="..\..\src\picture.c" />
+    <ClCompile Include="..\..\src\sao.c" />
     <ClCompile Include="..\..\src\search.c" />
     <ClCompile Include="..\..\src\transform.c" />
   </ItemGroup>
@@ -104,6 +101,7 @@
     <ClInclude Include="..\..\src\intra.h" />
     <ClInclude Include="..\..\src\nal.h" />
     <ClInclude Include="..\..\src\picture.h" />
+    <ClInclude Include="..\..\src\sao.h" />
     <ClInclude Include="..\..\src\search.h" />
     <ClInclude Include="..\..\src\transform.h" />
     <ClInclude Include="..\..\src\x64\test64.h" />
diff --git a/build/VS2010/HEVC_encoder.vcxproj.filters b/build/VS2010/HEVC_encoder.vcxproj.filters
index fafb8952..560b3c4a 100644
--- a/build/VS2010/HEVC_encoder.vcxproj.filters
+++ b/build/VS2010/HEVC_encoder.vcxproj.filters
@@ -69,6 +69,9 @@
     <ClCompile Include="..\..\src\debug.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\sao.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\global.h">
@@ -119,6 +122,9 @@
     <ClInclude Include="..\..\src\debug.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\sao.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\x86\test.asm">

From 91024a2095652541e3811c273b0d618755e21dc3 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 4 Nov 2013 20:51:51 +0200
Subject: [PATCH 12/25] Disable deblocking because it breaks sao.

---
 src/encmain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/encmain.c b/src/encmain.c
index 34938997..c6d041e4 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -146,7 +146,7 @@ int main(int argc, char *argv[])
   encoder->QP       = 32;
   encoder->in.video_format = FORMAT_420;
   // deblocking filter
-  encoder->deblock_enable  = 1;
+  encoder->deblock_enable  = 0;
   encoder->beta_offset_div2  = 0;
   encoder->tc_offset_div2    = 0;
   // SAO

From 5791301c2b0cd8598173882cae3fc4ee2614c9e9 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 4 Nov 2013 21:37:04 +0200
Subject: [PATCH 13/25] Fix moving of rightmost LCUs during sao reconstruction.

Now the leftmost LCUs move though.
---
 src/sao.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sao.c b/src/sao.c
index 952fb03b..8e9eb5f9 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -198,7 +198,7 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
 
   picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
-  sao_reconstruct_color(&rec_y[tl.y * (tl.x + block.x + br.x) + tl.x], 
+  sao_reconstruct_color(&rec_y[tl.y * (LCU_WIDTH + 2) + tl.x], 
                         new_rec_y, sao_luma, 
                         LCU_WIDTH + 2, LCU_WIDTH,
                         block.x, block.y);

From 98f2a1aedc5f4933c2729ae15412549dea9e5549 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 5 Nov 2013 10:49:42 +0200
Subject: [PATCH 14/25] Fix LCU borders in sao reconstruction.

---
 src/encoder.c | 1 +
 src/sao.c     | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index bbe6b8b7..6c9190b7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -870,6 +870,7 @@ void encode_slice_data(encoder_control* encoder)
   uint16_t x_ctb, y_ctb;
   picture *pic = encoder->in.cur_pic;
   pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
+  //memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
   
   if (encoder->sao_enable) {
     for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
diff --git a/src/sao.c b/src/sao.c
index 8e9eb5f9..95a614ed 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -71,8 +71,8 @@ void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao
 
   // Don't sample the edge pixels because this function doesn't have access to
   // their neighbours.
-  for (y = 1; y < block_height - 1; ++y) {
-    for (x = 1; x < block_width - 1; ++x) {
+  for (y = 0; y < block_height; ++y) {
+    for (x = 0; x < block_width; ++x) {
       const pixel *c_data = &rec_data[y * stride + x];
       pixel *new_data = &new_rec_data[y * new_stride + x];
       pixel a = c_data[a_ofs.y * stride + a_ofs.x];
@@ -196,7 +196,7 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
                       tl.y + block.y + br.y,
                       pic->width, LCU_WIDTH + 2);
 
-  picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+  //picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
   sao_reconstruct_color(&rec_y[tl.y * (LCU_WIDTH + 2) + tl.x], 
                         new_rec_y, sao_luma, 

From 1c03471d579adf9d63c58a97a08822c67068de09 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 6 Nov 2013 17:23:38 +0200
Subject: [PATCH 15/25] Fix for leftmost LCUs being offset by one pixel during
 SAO reconstruction.

---
 src/sao.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/sao.c b/src/sao.c
index 95a614ed..c3e48483 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -185,8 +185,8 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   vector2d br = { 1, 1 };
   vector2d block = { LCU_WIDTH, LCU_WIDTH };
 
-  rec.x = x_ctb * LCU_WIDTH;
-  rec.y = y_ctb * LCU_WIDTH;
+  rec.x = x;
+  rec.y = y;
 
   sao_calc_block_dims(pic, sao_luma, &rec, &tl, &br, &block);
 
@@ -199,7 +199,8 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   //picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
   sao_reconstruct_color(&rec_y[tl.y * (LCU_WIDTH + 2) + tl.x], 
-                        new_rec_y, sao_luma, 
+                        &new_rec_y[(tl.y + rec.y - y) * (LCU_WIDTH) + (tl.x + rec.x - x)],
+                        sao_luma, 
                         LCU_WIDTH + 2, LCU_WIDTH,
                         block.x, block.y);
   //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);

From 3eccdc0d7b2ec0083497eda684bd9b60967670e5 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 6 Nov 2013 17:27:16 +0200
Subject: [PATCH 16/25] Move deblocking filtering to happen before SAO
 reconstruction.

-Re-enable deblocking.
---
 src/encmain.c | 2 +-
 src/encoder.c | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/encmain.c b/src/encmain.c
index c6d041e4..34938997 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -146,7 +146,7 @@ int main(int argc, char *argv[])
   encoder->QP       = 32;
   encoder->in.video_format = FORMAT_420;
   // deblocking filter
-  encoder->deblock_enable  = 0;
+  encoder->deblock_enable  = 1;
   encoder->beta_offset_div2  = 0;
   encoder->tc_offset_div2    = 0;
   // SAO
diff --git a/src/encoder.c b/src/encoder.c
index 6c9190b7..dd7c8b15 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -321,10 +321,6 @@ void encode_one_frame(encoder_control* encoder)
     bitstream_clear_buffer(encoder->stream);
   }  
   
-  // Filtering
-  if(encoder->deblock_enable) {
-    filter_deblock(encoder);
-  }
   // Calculate checksum
   add_checksum(encoder);
 }
@@ -872,6 +868,11 @@ void encode_slice_data(encoder_control* encoder)
   pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
   //memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
   
+  // Filtering
+  if(encoder->deblock_enable) {
+    filter_deblock(encoder);
+  }
+
   if (encoder->sao_enable) {
     for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
       for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {

From 7a20e797be2b3b260839ada94a5396a82e7335c7 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 6 Nov 2013 17:46:30 +0200
Subject: [PATCH 17/25] Fix incorrect SAO encoding.

---
 src/encoder.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index dd7c8b15..ad643173 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -876,7 +876,7 @@ void encode_slice_data(encoder_control* encoder)
   if (encoder->sao_enable) {
     for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
       for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
-        unsigned stride = encoder->in.height_in_lcu;
+        unsigned stride = encoder->in.width_in_lcu;
         sao_info *sao_luma = &pic->sao_luma[y_ctb * stride + x_ctb];
         sao_info *sao_chroma = &pic->sao_chroma[y_ctb * stride + x_ctb];
         init_sao_info(sao_luma);
@@ -915,7 +915,7 @@ void encode_slice_data(encoder_control* encoder)
 
       if (encoder->sao_enable) {
         picture *pic = encoder->in.cur_pic;
-        unsigned stride = encoder->in.height_in_lcu;
+        unsigned stride = encoder->in.width_in_lcu;
         sao_info sao_luma = pic->sao_luma[y_ctb * stride + x_ctb];
         sao_info sao_chroma = pic->sao_chroma[y_ctb * stride + x_ctb];
 

From f9061d322ad127f2dc09656881fb7862cdc8de8e Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 6 Nov 2013 18:08:24 +0200
Subject: [PATCH 18/25] Fix handling of incomplete LCUs in SAO reconstruction.

---
 src/encoder.c | 18 ++++--------------
 src/sao.c     |  7 +++++--
 2 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index ad643173..55e73f1d 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -882,20 +882,10 @@ void encode_slice_data(encoder_control* encoder)
         init_sao_info(sao_luma);
         init_sao_info(sao_chroma);
 
-        // Temporary guards against non-LCU size coding units at the edges,
-        // because they aren't handled yet.
-        if (encoder->in.width_in_lcu * LCU_WIDTH != encoder->in.cur_pic->width
-            && x_ctb == encoder->in.width_in_lcu - 1) {
-
-        } else if (encoder->in.height_in_lcu * LCU_WIDTH != encoder->in.cur_pic->height
-                   && y_ctb == encoder->in.height_in_lcu - 1) {
-
-        } else {
-          sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma);
-          // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-          // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-          sao_reconstruct(encoder->in.cur_pic, new_y_data, x_ctb, y_ctb, sao_luma, sao_chroma);
-        }
+        sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma);
+        // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+        // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
+        sao_reconstruct(encoder->in.cur_pic, new_y_data, x_ctb, y_ctb, sao_luma, sao_chroma);
       }
     }
   }
diff --git a/src/sao.c b/src/sao.c
index c3e48483..b5939336 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -199,7 +199,7 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   //picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
   sao_reconstruct_color(&rec_y[tl.y * (LCU_WIDTH + 2) + tl.x], 
-                        &new_rec_y[(tl.y + rec.y - y) * (LCU_WIDTH) + (tl.x + rec.x - x)],
+                        &new_rec_y,
                         sao_luma, 
                         LCU_WIDTH + 2, LCU_WIDTH,
                         block.x, block.y);
@@ -207,7 +207,10 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
   
   // Copy reconstructed block from tmp buffer to rec image.
-  picture_blit_pixels(new_rec_y, new_y_recdata, LCU_WIDTH, LCU_WIDTH, LCU_WIDTH, pic->width);
+  // 
+  picture_blit_pixels(new_rec_y, 
+                      &new_y_recdata[(tl.y + rec.y - y) * (LCU_WIDTH) + (tl.x + rec.x - x)],
+                      block.x, block.y, LCU_WIDTH, pic->width);
 }
 
 

From ef8a984d4fba01d7b53b9882869b732f492c2ab5 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 8 Nov 2013 11:02:18 +0200
Subject: [PATCH 19/25] Fix incorrect blitting of top row during SAO
 reconstruction.

---
 src/sao.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sao.c b/src/sao.c
index b5939336..b7725145 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -209,7 +209,7 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   // Copy reconstructed block from tmp buffer to rec image.
   // 
   picture_blit_pixels(new_rec_y, 
-                      &new_y_recdata[(tl.y + rec.y - y) * (LCU_WIDTH) + (tl.x + rec.x - x)],
+                      &new_y_recdata[(tl.y + rec.y - y) * (pic->width) + (tl.x + rec.x - x)],
                       block.x, block.y, LCU_WIDTH, pic->width);
 }
 

From 43ae719ddbd25ff44f32baee4c481ee655d50621 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 8 Nov 2013 15:04:06 +0200
Subject: [PATCH 20/25] Tweak implementation of SAO reconstruction to be a bit
 more clear.

---
 src/encoder.c | 10 +++++-----
 src/sao.c     | 20 ++++++++------------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 55e73f1d..624f647e 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -865,8 +865,6 @@ void encode_slice_data(encoder_control* encoder)
 {
   uint16_t x_ctb, y_ctb;
   picture *pic = encoder->in.cur_pic;
-  pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
-  //memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
   
   // Filtering
   if(encoder->deblock_enable) {
@@ -874,6 +872,9 @@ void encode_slice_data(encoder_control* encoder)
   }
 
   if (encoder->sao_enable) {
+    pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
+    memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
+
     for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
       for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
         unsigned stride = encoder->in.width_in_lcu;
@@ -888,10 +889,9 @@ void encode_slice_data(encoder_control* encoder)
         sao_reconstruct(encoder->in.cur_pic, new_y_data, x_ctb, y_ctb, sao_luma, sao_chroma);
       }
     }
-  }
 
-  memcpy(pic->y_recdata, new_y_data, sizeof(pixel) * pic->width * pic->height);
-  free(new_y_data);
+    free(new_y_data);
+  }
 
   init_contexts(encoder,encoder->in.cur_pic->slicetype);
 
diff --git a/src/sao.c b/src/sao.c
index b7725145..6049047e 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -96,7 +96,7 @@ void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao
  * Vector block is the area affected by sao. Vectors tr and br are top-left
  * margin and bottom-right margin, which contain pixels that are not modified
  * by the reconstruction of this LCU but are needed by the reconstruction.
- * Vector rec is the coordinate of the area required by sao reconstruction.
+ * Vector rec is the offset from the CU to the required pixel area.
  *
  * The margins are always either 0 or 1, depending on the direction of the
  * edge offset class.
@@ -104,7 +104,7 @@ void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao
  * This also takes into account borders of the picture and non-LCU sized
  * CU's at the bottom and right of the picture.
  * 
- * \ rec
+ * \ CU + rec
  *  +------+
  *  |\ tl  |
  *  | +--+ |
@@ -162,12 +162,8 @@ void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec,
     }
   }
 
-  if (rec->y != 0) {
-    rec->y -= 1;
-  }
-  if (rec->x != 0) {
-    rec->x -= 1;
-  }
+  rec->y = (rec->y == 0 ? 0 : -1);
+  rec->x = (rec->x == 0 ? 0 : -1);
 }
 
 void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y_ctb, 
@@ -191,7 +187,7 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   sao_calc_block_dims(pic, sao_luma, &rec, &tl, &br, &block);
 
   // Data to tmp buffer.
-  picture_blit_pixels(&pic->y_recdata[rec.y * pic->width + rec.x], rec_y,
+  picture_blit_pixels(&new_y_data[(y + rec.y) * pic->width + x + rec.x], rec_y,
                       tl.x + block.x + br.x,
                       tl.y + block.y + br.y,
                       pic->width, LCU_WIDTH + 2);
@@ -199,7 +195,7 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   //picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
 
   sao_reconstruct_color(&rec_y[tl.y * (LCU_WIDTH + 2) + tl.x], 
-                        &new_rec_y,
+                        &new_rec_y[(rec.y + tl.y) * LCU_WIDTH + rec.x + tl.x],
                         sao_luma, 
                         LCU_WIDTH + 2, LCU_WIDTH,
                         block.x, block.y);
@@ -208,8 +204,8 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   
   // Copy reconstructed block from tmp buffer to rec image.
   // 
-  picture_blit_pixels(new_rec_y, 
-                      &new_y_recdata[(tl.y + rec.y - y) * (pic->width) + (tl.x + rec.x - x)],
+  picture_blit_pixels(&new_rec_y[(tl.y + rec.y) * LCU_WIDTH + (tl.x + rec.x)], 
+                      &y_recdata[(tl.y + rec.y) * (pic->width) + (tl.x + rec.x)],
                       block.x, block.y, LCU_WIDTH, pic->width);
 }
 

From 8b0eb665558bf0dbe5735fe1561e06f137a9e3c5 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 8 Nov 2013 15:04:53 +0200
Subject: [PATCH 21/25] Fix bug in SAO reconstruction.

---
 src/sao.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sao.c b/src/sao.c
index 6049047e..24b71201 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -156,7 +156,7 @@ void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec,
     if (rec->x + LCU_WIDTH > pic->width) {
       block->x = pic->width - rec->x;
     }
-    if (a_ofs.x == 1 || b_ofs.y == 1) {
+    if (a_ofs.x == 1 || b_ofs.x == 1) {
       block->x -= 1;
       br->x += 1;
     }

From 22d21ffac260f9916cd579dc02de1c84ff95e7b9 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 8 Nov 2013 15:39:01 +0200
Subject: [PATCH 22/25] Enable SAO search.

-Add guard to reconstruction to avoid reconstructing LCUs with no sao type.
-Add temporary guard to SAO search to skip LCUs that can't be handled yet.
---
 src/sao.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/sao.c b/src/sao.c
index 24b71201..bb9b129b 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -181,6 +181,10 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
   vector2d br = { 1, 1 };
   vector2d block = { LCU_WIDTH, LCU_WIDTH };
 
+  if (sao_luma->type == SAO_TYPE_NONE) {
+    return;
+  }
+
   rec.x = x;
   rec.y = y;
 
@@ -282,15 +286,22 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf
   pixel rec_y[LCU_LUMA_SIZE];
   pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+
+  // TODO: Fix searching of SAO in <LCU blocks. (reconstruction works)
+  if (x_ctb == pic->width_in_lcu - 1 || y_ctb == pic->height_in_lcu - 1) {
+    return;
+  }
   
-  sao->offsets[SAO_EO_CAT0] = 0;
+  /*sao->offsets[SAO_EO_CAT0] = 0;
   sao->offsets[SAO_EO_CAT1] = 7;
   sao->offsets[SAO_EO_CAT2] = 7;
   sao->offsets[SAO_EO_CAT3] = -7;
   sao->offsets[SAO_EO_CAT4] = -7;
   sao->eo_class = SAO_EO0;
   sao->type = SAO_TYPE_EDGE;
-  return;
+  return;*/
+
+  sao->type = SAO_TYPE_EDGE;
 
   // Fill temporary buffers with picture data.
   picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);

From 692ef3e9d9d0839dc29d368b5332e4840df60374 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Fri, 8 Nov 2013 16:13:48 +0200
Subject: [PATCH 23/25] Add guard against illegal SAO edge offsets.

---
 src/encoder.c | 13 ++++++++-----
 src/sao.c     |  9 +++++++++
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 624f647e..15e2ce63 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -799,11 +799,14 @@ void encode_sao_color(encoder_control *encoder, sao_info *sao, color_index color
   if (sao->type != SAO_TYPE_NONE) {
     sao_eo_cat i;
   
-    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
-      //CABAC_BIN_EP(&cabac, abs(sao->offsets[i]), "sao_offset_abs");
-      // TR cMax=7 (for 8bit), cRiseParam=0
-      cabac_write_unary_max_symbol_ep(&cabac, abs(sao->offsets[i]), 
-                                      SAO_ABS_OFFSET_MAX);
+    // TR cMax=7 (for 8bit), cRiseParam=0
+    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT2; ++i) {
+      assert(sao->offsets[i] >= 0);
+      cabac_write_unary_max_symbol_ep(&cabac, sao->offsets[i], SAO_ABS_OFFSET_MAX);
+    }
+    for (i = SAO_EO_CAT3; i <= SAO_EO_CAT4; ++i) {
+      assert(sao->offsets[i] <= 0);
+      cabac_write_unary_max_symbol_ep(&cabac, -sao->offsets[i], SAO_ABS_OFFSET_MAX);
     }
 
     if (sao->type == SAO_TYPE_BAND) {
diff --git a/src/sao.c b/src/sao.c
index bb9b129b..51875567 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -249,6 +249,15 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
         offset = (cat_sum + (cat_cnt >> 1)) / cat_cnt;
         offset = CLIP(-SAO_ABS_OFFSET_MAX, SAO_ABS_OFFSET_MAX, offset);
       }
+
+      // Sharpening edge offsets can't be encoded, so set them to 0 here.
+      if (edge_cat >= SAO_EO_CAT1 && edge_cat <= SAO_EO_CAT2 && offset < 0) {
+        offset = 0;
+      }
+      if (edge_cat >= SAO_EO_CAT3 && edge_cat <= SAO_EO_CAT4 && offset > 0) {
+        offset = 0;
+      }
+
       edge_offset[edge_cat] = offset;
       // The ddistortion is amount by which the SSE of data changes. It should
       // be negative for all categories, if offset was chosen correctly.

From 84cd618dafe69ebfa2bf33cb3eb51c238ec1dca8 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Mon, 11 Nov 2013 09:30:12 +0200
Subject: [PATCH 24/25] Add calculation of SAO-offsets for non-LCU sized CUs.
 Luma SAO works now.

---
 src/sao.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/sao.c b/src/sao.c
index 51875567..c92e7c39 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -35,7 +35,7 @@ static const unsigned g_sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 };
  * \param is_chroma  0 for luma, 1 for chroma. Indicates 
  */
 void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
-                       int eo_class, int block_width,
+                       int eo_class, int block_width, int block_height,
                        int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES])
 {
   int y, x;
@@ -45,7 +45,7 @@ void calc_sao_edge_dir(const pixel *orig_data, const pixel *rec_data,
 
   // Don't sample the edge pixels because this function doesn't have access to
   // their neighbours.
-  for (y = 1; y < block_width - 1; ++y) {
+  for (y = 1; y < block_height - 1; ++y) {
     for (x = 1; x < block_width - 1; ++x) {
       const pixel *c_data = &rec_data[y * block_width + x];
       pixel a = c_data[a_ofs.y * block_width + a_ofs.x];
@@ -216,7 +216,8 @@ void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y
 
 
 void sao_search_best_mode(const pixel *data, const pixel *recdata, 
-                          unsigned block_width, unsigned buf_size, unsigned buf_cnt,
+                          int block_width, int block_height,
+                          unsigned buf_size, unsigned buf_cnt,
                           sao_info *sao_out)
 {
   sao_eo_class edge_class;
@@ -234,7 +235,8 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
 
     // Call calc_sao_edge_dir once for luma and twice for chroma.
     for (i = 0; i < buf_cnt; ++i) {
-      calc_sao_edge_dir(data + i * buf_size, recdata + i * buf_size, edge_class, block_width, cat_sum_cnt);
+      calc_sao_edge_dir(data + i * buf_size, recdata + i * buf_size, edge_class,
+                        block_width, block_height, cat_sum_cnt);
     }
     
     for (edge_cat = SAO_EO_CAT1; edge_cat <= SAO_EO_CAT4; ++edge_cat) {
@@ -295,10 +297,14 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf
   pixel rec_y[LCU_LUMA_SIZE];
   pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  int block_width = LCU_WIDTH;
+  int block_height = LCU_WIDTH;
 
-  // TODO: Fix searching of SAO in <LCU blocks. (reconstruction works)
-  if (x_ctb == pic->width_in_lcu - 1 || y_ctb == pic->height_in_lcu - 1) {
-    return;
+  if (x_ctb * LCU_WIDTH + LCU_WIDTH >= (unsigned)pic->width) {
+    block_width = pic->width - x_ctb * LCU_WIDTH;
+  }
+  if (y_ctb * LCU_WIDTH + LCU_WIDTH >= (unsigned)pic->height) {
+    block_height = pic->height - y_ctb * LCU_WIDTH;
   }
   
   /*sao->offsets[SAO_EO_CAT0] = 0;
@@ -313,8 +319,8 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf
   sao->type = SAO_TYPE_EDGE;
 
   // Fill temporary buffers with picture data.
-  picture_blit_pixels(y_data, orig_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
-  picture_blit_pixels(y_recdata, rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
+  picture_blit_pixels(y_data, orig_y, block_width, block_height, pic->width, LCU_WIDTH);
+  picture_blit_pixels(y_recdata, rec_y, block_width, block_height, pic->width, LCU_WIDTH);
 
-  sao_search_best_mode(orig_y, rec_y, LCU_WIDTH, LCU_LUMA_SIZE, 1, sao);
+  sao_search_best_mode(orig_y, rec_y, block_width, block_height, LCU_LUMA_SIZE, 1, sao);
 }

From 3af65b84778105ad2d30fc26cee592e0b68aceec Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Tue, 12 Nov 2013 11:55:39 +0200
Subject: [PATCH 25/25] Add SAO searching and reconstruction for chroma.

One I frame and 99 P frames encoded with SAO off (first line) and on (second line):
Processed 100 frames,    6693224 bits AVG PSNR: 30.7248 37.8978 37.8287
Processed 100 frames,    6295072 bits AVG PSNR: 32.2511 38.9373 38.9818
---
 src/encoder.c |  11 +++-
 src/sao.c     | 144 +++++++++++++++++++++++++++++++-------------------
 src/sao.h     |   5 +-
 3 files changed, 103 insertions(+), 57 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 15e2ce63..45e33f9a 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -876,7 +876,11 @@ void encode_slice_data(encoder_control* encoder)
 
   if (encoder->sao_enable) {
     pixel *new_y_data = MALLOC(pixel, pic->width * pic->height);
+    pixel *new_u_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
+    pixel *new_v_data = MALLOC(pixel, (pic->width * pic->height) >> 2);
     memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height);
+    memcpy(new_u_data, pic->u_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2);
+    memcpy(new_v_data, pic->v_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2);
 
     for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) {
       for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) {
@@ -887,13 +891,18 @@ void encode_slice_data(encoder_control* encoder)
         init_sao_info(sao_chroma);
 
         sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma);
+        sao_search_chroma(encoder->in.cur_pic, x_ctb, y_ctb, sao_chroma);
         // sao_do_merge(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
         // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma);
-        sao_reconstruct(encoder->in.cur_pic, new_y_data, x_ctb, y_ctb, sao_luma, sao_chroma);
+        sao_reconstruct(pic, new_y_data, x_ctb, y_ctb, sao_luma, COLOR_Y);
+        sao_reconstruct(pic, new_u_data, x_ctb, y_ctb, sao_chroma, COLOR_U);
+        sao_reconstruct(pic, new_v_data, x_ctb, y_ctb, sao_chroma, COLOR_V);
       }
     }
 
     free(new_y_data);
+    free(new_u_data);
+    free(new_v_data);
   }
 
   init_contexts(encoder,encoder->in.cur_pic->slicetype);
diff --git a/src/sao.c b/src/sao.c
index c92e7c39..ff50e0ba 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -118,11 +118,16 @@ void sao_reconstruct_color(const pixel *rec_data, pixel *new_rec_data, const sao
  * \param sao  Sao parameters.
  * \param rec  Top-left corner of the LCU, modified to be top-left corner of 
  */
-void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec, 
+void sao_calc_block_dims(const picture *pic, color_index color_i, 
+                         const sao_info *sao, vector2d *rec, 
                          vector2d *tl, vector2d *br, vector2d *block)
 {
   vector2d a_ofs = g_sao_edge_offsets[sao->eo_class][0];
   vector2d b_ofs = g_sao_edge_offsets[sao->eo_class][1];
+  const int is_chroma = (color_i != COLOR_Y ? 1 : 0);
+  int width = pic->width >> is_chroma;
+  int height = pic->height >> is_chroma;
+  int block_width = LCU_WIDTH >> is_chroma;
 
   // Handle top and left.
   if (rec->y == 0) {
@@ -141,20 +146,20 @@ void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec,
   }
 
   // Handle right and bottom, taking care of non-LCU sized CUs.
-  if (rec->y + LCU_WIDTH >= pic->height) {
+  if (rec->y + block_width >= height) {
     br->y = 0;
-    if (rec->y + LCU_WIDTH >= pic->height) {
-      block->y = pic->height - rec->y;
+    if (rec->y + block_width >= height) {
+      block->y = height - rec->y;
     }
     if (a_ofs.y == 1 || b_ofs.y == 1) {
       block->y -= 1;
       br->y += 1;
     }
   }
-  if (rec->x + LCU_WIDTH >= pic->width) {
+  if (rec->x + block_width >= width) {
     br->x = 0;
-    if (rec->x + LCU_WIDTH > pic->width) {
-      block->x = pic->width - rec->x;
+    if (rec->x + block_width > width) {
+      block->x = width - rec->x;
     }
     if (a_ofs.x == 1 || b_ofs.x == 1) {
       block->x -= 1;
@@ -166,58 +171,62 @@ void sao_calc_block_dims(const picture *pic, const sao_info *sao, vector2d *rec,
   rec->x = (rec->x == 0 ? 0 : -1);
 }
 
-void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y_ctb, 
-                     const sao_info *sao_luma, const sao_info *sao_chroma)
+void sao_reconstruct(picture *pic, const pixel *old_rec, 
+                     unsigned x_ctb, unsigned y_ctb, 
+                     const sao_info *sao, color_index color_i)
 {
-  pixel rec_y[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
-  pixel new_rec_y[LCU_LUMA_SIZE];
-  pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
-  pixel *new_y_recdata = &new_y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
+  const int is_chroma = (color_i != COLOR_Y ? 1 : 0);
+  const int pic_stride = pic->width >> is_chroma;
+  const int lcu_stride = LCU_WIDTH >> is_chroma;
+  const int buf_stride = lcu_stride + 2;
 
-  int x = x_ctb * LCU_WIDTH, y = y_ctb * LCU_WIDTH;
-  
-  vector2d rec;
+  pixel *recdata = (color_i == COLOR_Y ? pic->y_recdata : 
+                    (color_i == COLOR_U ? pic->u_recdata : pic->v_recdata));
+  pixel buf_rec[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
+  pixel new_rec[LCU_WIDTH * LCU_WIDTH];
+  // Calling CU_TO_PIXEL with depth 1 is the same as using block size of 32.
+  pixel *lcu_rec = &recdata[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, pic_stride)];
+  const pixel *old_lcu_rec = &old_rec[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, pic_stride)];
+
+  vector2d ofs;
   vector2d tl = { 1, 1 };
   vector2d br = { 1, 1 };
   vector2d block = { LCU_WIDTH, LCU_WIDTH };
 
-  if (sao_luma->type == SAO_TYPE_NONE) {
+  if (sao->type == SAO_TYPE_NONE) {
     return;
   }
 
-  rec.x = x;
-  rec.y = y;
-
-  sao_calc_block_dims(pic, sao_luma, &rec, &tl, &br, &block);
+  ofs.x = x_ctb * lcu_stride;
+  ofs.y = y_ctb * lcu_stride;
+  block.x = lcu_stride;
+  block.y = lcu_stride;
+  sao_calc_block_dims(pic, color_i, sao, &ofs, &tl, &br, &block);
 
   // Data to tmp buffer.
-  picture_blit_pixels(&new_y_data[(y + rec.y) * pic->width + x + rec.x], rec_y,
+  picture_blit_pixels(&old_lcu_rec[ofs.y * pic_stride + ofs.x], 
+                      buf_rec,
                       tl.x + block.x + br.x,
                       tl.y + block.y + br.y,
-                      pic->width, LCU_WIDTH + 2);
+                      pic_stride, buf_stride);
 
-  //picture_blit_pixels(y_recdata, new_rec_y, LCU_WIDTH, LCU_WIDTH, pic->width, LCU_WIDTH);
-
-  sao_reconstruct_color(&rec_y[tl.y * (LCU_WIDTH + 2) + tl.x], 
-                        &new_rec_y[(rec.y + tl.y) * LCU_WIDTH + rec.x + tl.x],
-                        sao_luma, 
-                        LCU_WIDTH + 2, LCU_WIDTH,
+  sao_reconstruct_color(&buf_rec[tl.y * buf_stride + tl.x], 
+                        &new_rec[(ofs.y + tl.y) * lcu_stride + ofs.x + tl.x],
+                        sao, 
+                        buf_stride, lcu_stride,
                         block.x, block.y);
-  //sao_reconstruct_color(rec_u, sao_chroma, COLOR_U);
-  //sao_reconstruct_color(rec_v, sao_chroma, COLOR_V);
-  
+
   // Copy reconstructed block from tmp buffer to rec image.
-  // 
-  picture_blit_pixels(&new_rec_y[(tl.y + rec.y) * LCU_WIDTH + (tl.x + rec.x)], 
-                      &y_recdata[(tl.y + rec.y) * (pic->width) + (tl.x + rec.x)],
-                      block.x, block.y, LCU_WIDTH, pic->width);
+  picture_blit_pixels(&new_rec[(tl.y + ofs.y) * lcu_stride + (tl.x + ofs.x)], 
+                      &lcu_rec[(tl.y + ofs.y) * pic_stride + (tl.x + ofs.x)],
+                      block.x, block.y, lcu_stride, pic_stride);
 }
 
 
 
-void sao_search_best_mode(const pixel *data, const pixel *recdata, 
+void sao_search_best_mode(const pixel *data[], const pixel *recdata[], 
                           int block_width, int block_height,
-                          unsigned buf_size, unsigned buf_cnt,
+                          unsigned buf_cnt,
                           sao_info *sao_out)
 {
   sao_eo_class edge_class;
@@ -235,7 +244,7 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
 
     // Call calc_sao_edge_dir once for luma and twice for chroma.
     for (i = 0; i < buf_cnt; ++i) {
-      calc_sao_edge_dir(data + i * buf_size, recdata + i * buf_size, edge_class,
+      calc_sao_edge_dir(data[i], recdata[i], edge_class,
                         block_width, block_height, cat_sum_cnt);
     }
     
@@ -284,17 +293,50 @@ void sao_search_best_mode(const pixel *data, const pixel *recdata,
 
  void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
 {
-  
+  pixel orig_u[LCU_CHROMA_SIZE];
+  pixel rec_u[LCU_CHROMA_SIZE];
+  pixel orig_v[LCU_CHROMA_SIZE];
+  pixel rec_v[LCU_CHROMA_SIZE];
+  pixel *orig[2] = { orig_u, orig_v };
+  pixel *rec[2] = { rec_u, rec_v };
+  pixel *u_data = &pic->u_data[CU_TO_PIXEL(x_ctb, y_ctb, 1, pic->width / 2)];
+  pixel *u_recdata = &pic->u_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 1, pic->width / 2)];
+  pixel *v_data = &pic->v_data[CU_TO_PIXEL(x_ctb, y_ctb, 1, pic->width / 2)];
+  pixel *v_recdata = &pic->v_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 1, pic->width / 2)];
+  int block_width  = (LCU_WIDTH / 2);
+  int block_height = (LCU_WIDTH / 2);
+
+  if (x_ctb * (LCU_WIDTH / 2) + (LCU_WIDTH / 2) >= (unsigned)pic->width / 2) {
+    block_width = (pic->width - x_ctb * LCU_WIDTH) / 2;
+  }
+  if (y_ctb * (LCU_WIDTH / 2) + (LCU_WIDTH / 2) >= (unsigned)pic->height / 2) {
+    block_height = (pic->height - y_ctb * LCU_WIDTH) / 2;
+  }
+
+  sao->type = SAO_TYPE_EDGE;
+
+  // Fill temporary buffers with picture data.
+  // These buffers are needed only until we switch to a LCU based data
+  // structure for pixels. Then we can give pointers directly to that structure
+  // without making copies.
+  picture_blit_pixels(u_data, orig_u, block_width, block_height,
+                      pic->width / 2, LCU_WIDTH / 2);
+  picture_blit_pixels(v_data, orig_v, block_width, block_height, 
+                      pic->width / 2, LCU_WIDTH / 2);
+  picture_blit_pixels(u_recdata, rec_u, block_width, block_height,
+                      pic->width / 2, LCU_WIDTH / 2);
+  picture_blit_pixels(v_recdata, rec_v, block_width, block_height,
+                      pic->width / 2, LCU_WIDTH / 2);
+
+  sao_search_best_mode(orig, rec, block_width, block_height, 2, sao);
 }
 
 void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao)
 {
-  // These buffers are needed only until we switch to a LCU based data
-  // structure for pixels. Then we can give pointers directly to that structure
-  // without making copies.
-  // It's 2-dimensional because sao_search_best_mode takes arguments as arrays.
   pixel orig_y[LCU_LUMA_SIZE];
   pixel rec_y[LCU_LUMA_SIZE];
+  pixel *orig[1] = { orig_y };
+  pixel *rec[1] = { rec_y };
   pixel *y_data = &pic->y_data[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   pixel *y_recdata = &pic->y_recdata[CU_TO_PIXEL(x_ctb, y_ctb, 0, pic->width)];
   int block_width = LCU_WIDTH;
@@ -306,21 +348,15 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf
   if (y_ctb * LCU_WIDTH + LCU_WIDTH >= (unsigned)pic->height) {
     block_height = pic->height - y_ctb * LCU_WIDTH;
   }
-  
-  /*sao->offsets[SAO_EO_CAT0] = 0;
-  sao->offsets[SAO_EO_CAT1] = 7;
-  sao->offsets[SAO_EO_CAT2] = 7;
-  sao->offsets[SAO_EO_CAT3] = -7;
-  sao->offsets[SAO_EO_CAT4] = -7;
-  sao->eo_class = SAO_EO0;
-  sao->type = SAO_TYPE_EDGE;
-  return;*/
 
   sao->type = SAO_TYPE_EDGE;
 
   // Fill temporary buffers with picture data.
+  // These buffers are needed only until we switch to a LCU based data
+  // structure for pixels. Then we can give pointers directly to that structure
+  // without making copies.
   picture_blit_pixels(y_data, orig_y, block_width, block_height, pic->width, LCU_WIDTH);
   picture_blit_pixels(y_recdata, rec_y, block_width, block_height, pic->width, LCU_WIDTH);
 
-  sao_search_best_mode(orig_y, rec_y, block_width, block_height, LCU_LUMA_SIZE, 1, sao);
+  sao_search_best_mode(orig, rec, block_width, block_height, 1, sao);
 }
diff --git a/src/sao.h b/src/sao.h
index e32010fe..0be99eea 100644
--- a/src/sao.h
+++ b/src/sao.h
@@ -47,7 +47,8 @@ typedef struct sao_info_struct {
 void init_sao_info(sao_info *sao);
 void sao_search_chroma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao);
 void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_info *sao);
-void sao_reconstruct(picture *pic, pixel *new_y_data, unsigned x_ctb, unsigned y_ctb, 
-                     const sao_info *sao_luma, const sao_info *sao_chroma);
+void sao_reconstruct(picture *pic, const pixel *old_rec, 
+                     unsigned x_ctb, unsigned y_ctb, 
+                     const sao_info *sao, color_index color_i);
 
 #endif
\ No newline at end of file