From f30b9c2a110a1d38117b0770dadd9a90d23e4bb6 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Mon, 5 May 2014 15:17:52 +0200
Subject: [PATCH 01/21] Fix a buffer overflow in parse_tiles_specification

---
 src/config.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/config.c b/src/config.c
index b74ab843..ec146a8a 100644
--- a/src/config.c
+++ b/src/config.c
@@ -155,7 +155,7 @@ static int parse_enum(const char *arg, const char * const *names, int8_t *dst)
 static int parse_tiles_specification(const char* const arg, int32_t * const ntiles, int32_t** const array) {
   const char* current_arg = NULL;
   int32_t current_value;
-  int32_t values[256];
+  int32_t values[MAX_TILES_PER_DIM];
   
   int i;
   
@@ -189,6 +189,7 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil
     if (current_arg) ++current_arg;
     values[*ntiles] = current_value;
     ++(*ntiles);
+    if (MAX_TILES_PER_DIM <= *ntiles) break;
   } while (current_arg);
   
   if (MAX_TILES_PER_DIM <= *ntiles || 0 >= *ntiles) {

From f0b076876fd6945a515ded0f1dc66621be23fc9e Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Mon, 5 May 2014 11:19:56 +0200
Subject: [PATCH 02/21] Moved all the stream related stuff into
 substream_write_bitstream

---
 src/encoder.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index c9998a74..e76f2130 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -499,7 +499,7 @@ static void write_aud(encoder_state * const encoder_state)
   bitstream_align(stream);
 }
 
-static void substream_write_bitstream(encoder_state * const encoder_state, const int last_part) {
+static void substream_write_bitstream(encoder_state * const encoder_state, const int end_of_sub_stream) {
   const encoder_control * const encoder = encoder_state->encoder_control;
   const picture* const cur_pic = encoder_state->cur_pic;
   const int lcu_count = cur_pic->width_in_lcu * cur_pic->height_in_lcu;
@@ -517,11 +517,18 @@ static void substream_write_bitstream(encoder_state * const encoder_state, const
     
     encode_coding_tree(encoder_state, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0);
 
-    cabac_encode_bin_trm(&encoder_state->cabac, ((lcu_id == lcu_count - 1) && last_part) ? 1 : 0);  // end_of_slice_segment_flag
+    cabac_encode_bin_trm(&encoder_state->cabac, ((lcu_id == lcu_count - 1) && !end_of_sub_stream) ? 1 : 0);  // end_of_slice_segment_flag
+  }
+  if (end_of_sub_stream) {
+    cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
+    cabac_flush(&encoder_state->cabac);
+  } else {
+    cabac_flush(&encoder_state->cabac);
+    bitstream_align(&encoder_state->stream);
   }
 }
 
-static void substream_encode(encoder_state * const encoder_state, const int last_part) {
+static void substream_encode(encoder_state * const encoder_state) {
   const encoder_control * const encoder = encoder_state->encoder_control;
 #ifndef NDEBUG
   const unsigned long long int debug_bitstream_position = bitstream_tell(&(encoder_state->stream));
@@ -630,9 +637,6 @@ static void substream_encode(encoder_state * const encoder_state, const int last
   
   //We should not have written to bitstream!
   assert(debug_bitstream_position == bitstream_tell(&(encoder_state->stream)));
-  
-  //Now, write bitstream
-  substream_write_bitstream(encoder_state, last_part);
 
   yuv_t_free(hor_buf);
   yuv_t_free(ver_buf);
@@ -775,7 +779,8 @@ void encode_one_frame(encoder_state * const main_state)
       subencoder->cur_pic->slicetype = main_state->cur_pic->slicetype;
       subencoder->cur_pic->type = main_state->cur_pic->type;
       
-      substream_encode(subencoder, !(main_state->children[i+1].encoder_control));
+      substream_encode(subencoder);
+      substream_write_bitstream(subencoder, (main_state->children[i+1].encoder_control) != NULL);
       
       subencoder_blit_pixels(main_state, main_state->cur_pic->y_recdata, subencoder, subencoder->cur_pic->y_recdata, 1);
       subencoder_blit_pixels(main_state, main_state->cur_pic->u_recdata, subencoder, subencoder->cur_pic->u_recdata, 0);
@@ -785,15 +790,6 @@ void encode_one_frame(encoder_state * const main_state)
     //This has to be serial
     i = 0;
     do {
-      if (!main_state->children[i+1].encoder_control) {
-        //last tile
-        cabac_flush(&main_state->children[i].cabac);
-        bitstream_align(&main_state->children[i].stream);
-      } else {
-        //Other tiles
-        cabac_encode_bin_trm(&main_state->children[i].cabac, 1); // end_of_sub_stream_one_bit == 1
-        cabac_flush(&main_state->children[i].cabac);
-      }
       //Append bitstream to main stream
       bitstream_append(&main_state->stream, &main_state->children[i].stream);
       bitstream_clear(&main_state->children[i].stream);
@@ -801,9 +797,8 @@ void encode_one_frame(encoder_state * const main_state)
     
   } else {
     //Encode the whole thing as one stream
-    substream_encode(main_state, 1);
-    cabac_flush(&main_state->cabac);
-    bitstream_align(stream);
+    substream_encode(main_state);
+    substream_write_bitstream(main_state, 0);
   }
   
   // Calculate checksum

From 2d6f1992461684b633a0547a56701697eae18c94 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Mon, 5 May 2014 13:33:41 +0200
Subject: [PATCH 03/21] reorganized encoder_state structure

---
 src/encoder.c |  2 ++
 src/encoder.h | 33 +++++++++++++++++----------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index e76f2130..45b1bf4d 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -787,6 +787,8 @@ void encode_one_frame(encoder_state * const main_state)
       subencoder_blit_pixels(main_state, main_state->cur_pic->v_recdata, subencoder, subencoder->cur_pic->v_recdata, 0);
     }
     
+    //We should do the slice header here, because we can have the entry points
+    
     //This has to be serial
     i = 0;
     do {
diff --git a/src/encoder.h b/src/encoder.h
index 3fee4c6a..43a6de50 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -124,28 +124,29 @@ typedef struct
 
 typedef struct encoder_state {
   const encoder_control *encoder_control;
-  
-  int32_t lcu_offset_x;
-  int32_t lcu_offset_y;
-  
-  picture *cur_pic;
-  int32_t frame;
-  int32_t poc; /*!< \brief picture order count */
-  
-  bitstream stream;
-  
-  picture_list *ref;
-  int8_t ref_list;
-  int8_t ref_idx_num[2];
-  int8_t QP;             // \brief Quantization parameter
-  
   double cur_lambda_cost;
-  
+  bitstream stream;
   cabac_data cabac;
   
   //List of children, the last item of this list is a pseudo-encoder with encoder_control = NULL
   //Use do { } while (encoder_state->children[++i].encoder_control)
   struct encoder_state *children;
+  
+  //Tile: offset in LCU for current encoder_state
+  int32_t lcu_offset_x;
+  int32_t lcu_offset_y;
+  
+  //Current picture to encode
+  picture *cur_pic;
+  int32_t frame;
+  int32_t poc; /*!< \brief picture order count */
+  
+  //Current picture available references
+  picture_list *ref;
+  int8_t ref_list;
+  int8_t ref_idx_num[2];
+  
+  int8_t QP;             //!< \brief Quantization parameter
 } encoder_state;
 
 int encoder_control_init(encoder_control *encoder, const config *cfg);

From c2872bd6b0e424c87fc87d9bce9b14102eaef7a3 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Mon, 5 May 2014 15:17:22 +0200
Subject: [PATCH 04/21] Slices and WPP in command line and encoder

---
 src/config.c  | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/config.h  |  5 ++++
 src/encmain.c | 10 +++++++
 src/encoder.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/encoder.h |  7 +++++
 src/global.h  |  1 +
 6 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/src/config.c b/src/config.c
index ec146a8a..f2efdb34 100644
--- a/src/config.c
+++ b/src/config.c
@@ -85,6 +85,12 @@ int config_init(config *cfg)
   cfg->tiles_height_count         = 0;
   cfg->tiles_width_split          = NULL;
   cfg->tiles_height_split          = NULL;
+  
+  cfg->wpp = 0;
+  cfg->slice_count = 1;
+  cfg->slice_addresses_in_ts = MALLOC(int32_t, 1);
+  cfg->slice_addresses_in_ts[0] = 0;
+  
 
   return 1;
 }
@@ -101,6 +107,7 @@ int config_destroy(config *cfg)
   FREE_POINTER(cfg->cqmfile);
   FREE_POINTER(cfg->tiles_width_split);
   FREE_POINTER(cfg->tiles_height_split);
+  FREE_POINTER(cfg->slice_addresses_in_ts);
   free(cfg);
 
   return 1;
@@ -211,6 +218,67 @@ static int parse_tiles_specification(const char* const arg, int32_t * const ntil
   return 1;
 }
 
+/**
+ * \brief Parse --slice-addresses: either "u<n>" (n uniform slices, no array
+ *        is produced) or a comma-separated list of slice start addresses,
+ *        to which a first slice starting at 0 is always prepended.
+ * \param nslices  out: slice count, has to stay below MAX_SLICES
+ * \param array    in/out: old array is freed; receives the parsed list
+ * \return 1 on success, 0 on error (a message is printed to stderr)
+ */
+static int parse_slice_specification(const char* const arg, int32_t * const nslices, int32_t** const array) {
+  const char* current_arg = arg;
+  int32_t values[MAX_SLICES];
+  int i;
+
+  //Free pointer in any case
+  if (*array) {
+    FREE_POINTER(*array);
+  }
+
+  //If the arg starts with u, we want a uniform split
+  if (arg[0]=='u') {
+    *nslices = atoi(arg+1);
+    if (MAX_SLICES <= *nslices || 0 >= *nslices) {
+      fprintf(stderr, "Invalid number of slices (0 < %d < %d = MAX_SLICES)!\n", *nslices, MAX_SLICES);
+      return 0;
+    }
+    return 1;
+  }
+
+  //Comma-separated list of int; we always have a slice starting at 0
+  values[0] = 0;
+  *nslices = 1;
+  do {
+    //Scan into a plain int: %d is not guaranteed to match int32_t
+    int current_value;
+    if (sscanf(current_arg, "%d", &current_value) != 1) {
+      fprintf(stderr, "Could not parse integer \"%s\"!\n", current_arg);
+      return 0;
+    }
+    current_arg = strchr(current_arg, ',');
+    //Skip the , if we found one
+    if (current_arg) ++current_arg;
+    values[(*nslices)++] = current_value;
+    //Stop before overflowing values[]; reported as an error just below
+    if (MAX_SLICES <= *nslices) break;
+  } while (current_arg);
+
+  if (MAX_SLICES <= *nslices) {
+    fprintf(stderr, "Invalid number of slices (0 < %d < %d = MAX_SLICES)!\n", *nslices, MAX_SLICES);
+    return 0;
+  }
+  *array = MALLOC(int32_t, *nslices);
+  if (!*array) {
+    fprintf(stderr, "Could not allocate array for slices\n");
+    return 0;
+  }
+  for (i = 0; i < *nslices; ++i) {
+    (*array)[i] = values[i];
+  }
+  return 1;
+}
+
 static int config_parse(config *cfg, const char *name, const char *value)
 {
   static const char * const overscan_names[]    = { "undef", "show", "crop", NULL };
@@ -341,6 +409,10 @@ static int config_parse(config *cfg, const char *name, const char *value)
     error = !parse_tiles_specification(value, &cfg->tiles_width_count, &cfg->tiles_width_split);
   else if OPT("tiles-height-split")
     error = !parse_tiles_specification(value, &cfg->tiles_height_count, &cfg->tiles_height_split);
+  else if OPT("wpp")
+    cfg->wpp = atobool(value);
+  else if OPT("slice-addresses")
+    error = !parse_slice_specification(value, &cfg->slice_count, &cfg->slice_addresses_in_ts);
   else
     return 0;
 #undef OPT
@@ -389,6 +461,8 @@ int config_read(config *cfg,int argc, char *argv[])
     { "seek",               required_argument, NULL, 0 },
     { "tiles-width-split",  required_argument, NULL, 0 },
     { "tiles-height-split", required_argument, NULL, 0 },
+    { "wpp",                      no_argument, NULL, 0 },
+    { "slice-addresses",    required_argument, NULL, 0 },
     {0, 0, 0, 0}
   };
 
diff --git a/src/config.h b/src/config.h
index 8c424655..8886988d 100644
--- a/src/config.h
+++ b/src/config.h
@@ -69,6 +69,11 @@ typedef struct
   int32_t tiles_height_count;      /*!< \brief number of tiles separation in y direction */
   int32_t* tiles_width_split;      /*!< \brief tiles split x coordinates (dimension: tiles_width_count) */
   int32_t* tiles_height_split;      /*!< \brief tiles split y coordinates (dimension: tiles_height_count) */
+  
+  int wpp;
+  
+  int32_t slice_count;
+  int32_t* slice_addresses_in_ts;
 } config;
 
 /* Function definitions */
diff --git a/src/encmain.c b/src/encmain.c
index c8035b43..e8e98b32 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -151,6 +151,16 @@ int main(int argc, char *argv[])
             "                                   Can also be u followed by and a single int n,\n"
             "                                   in which case it produces rows of uniform height.\n"
             "\n"
+            "  Wpp:\n"
+            "          --wpp:                   Enable wavefront parallel processing\n"
+            "\n"
+            "  Slices:\n"
+            "          --slice-addresses <string>|u<int>: \n"
+            "                                   Specifies a comma separated list of LCU\n"
+            "                                   positions in tile scan order of tile separations.\n"
+            "                                   Can also be u followed by and a single int n,\n"
+            "                                   in which case it produces uniform slice length.\n"
+            "\n"
             "  Deprecated parameters: (might be removed at some point)\n"
             "     Use --input-res:\n"
             "       -w, --width               : Width of input in pixels\n"
diff --git a/src/encoder.c b/src/encoder.c
index 45b1bf4d..3e50bdfe 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -80,6 +80,26 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
   encoder_state->cur_lambda_cost = lambda;
 }
 
+static int lcu_at_slice_start(encoder_control * const encoder, int lcu_addr_in_rs) {
+  int i;
+  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_rs == 0) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_rs) return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_slice_end(encoder_control * const encoder, int lcu_addr_in_rs) {
+  int i;
+  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_rs == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_rs + 1) return 1;
+  }
+  return 0;
+}
+
 int encoder_control_init(encoder_control * const encoder, const config * const cfg) {
   if (!cfg) {
     fprintf(stderr, "Config object must not be null!\n");
@@ -253,6 +273,41 @@ int encoder_control_init(encoder_control * const encoder, const config * const c
     encoder->tiles_ctb_addr_ts_to_rs = tiles_ctb_addr_ts_to_rs;
     
     encoder->tiles_tile_id = tiles_tile_id;
+    
+    //Slices
+    {
+      int *slice_addresses_in_ts;
+      encoder->slice_count = encoder->cfg->slice_count;
+      if (encoder->slice_count == 0) {
+        encoder->slice_count = 1;
+        slice_addresses_in_ts = MALLOC(int, encoder->slice_count);
+        slice_addresses_in_ts[0] = 0;
+      } else {
+        int i;
+        slice_addresses_in_ts = MALLOC(int, encoder->slice_count);
+        if (!encoder->cfg->slice_addresses_in_ts) {
+          slice_addresses_in_ts[0] = 0;
+          for (i=1; i < encoder->slice_count; ++i) {
+            slice_addresses_in_ts[i] = encoder->in.width_in_lcu * encoder->in.height_in_lcu * i / encoder->slice_count;
+          }
+        } else {
+          for (i=0; i < encoder->slice_count; ++i) {
+            slice_addresses_in_ts[i] = encoder->cfg->slice_addresses_in_ts[i];
+          }
+        }
+      }
+      
+      encoder->slice_addresses_in_ts = slice_addresses_in_ts;
+    }
+    
+    encoder->wpp = encoder->cfg->wpp;
+    
+    //FIXME: remove
+    if (encoder->slice_count) {
+      
+      lcu_at_slice_start(encoder, 0);
+      lcu_at_slice_end(encoder, 0);
+    }
 
 #ifdef _DEBUG
     printf("Tiles columns width:");
@@ -268,18 +323,35 @@ int encoder_control_init(encoder_control * const encoder, const config * const c
     //Print tile index map
     for (y = 0; y < encoder->in.height_in_lcu; ++y) {
       for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        printf("%2d ", encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[y * encoder->in.width_in_lcu + x]]);
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        const char slice_start = lcu_at_slice_start(encoder, lcu_id_ts) ? '|' : ' ';
+        const char slice_end = lcu_at_slice_end(encoder, lcu_id_ts)  ? '|' : ' ';
+        
+        printf("%c%03d%c", slice_start, encoder->tiles_tile_id[lcu_id_ts], slice_end);
       }
       printf("\n");
     }
+    printf("\n");
+    if (encoder->wpp) {
+      printf("Wavefront Parallel Processing: enabled\n");
+    } else {
+      printf("Wavefront Parallel Processing: disabled\n");
+    }
+    printf("\n");
 #endif //_DEBUG
 
+    
+
   }
   
   return 1;
 }
 
 int encoder_control_finalize(encoder_control * const encoder) {
+  //Slices
+  FREE_POINTER(encoder->slice_addresses_in_ts);
+  
   //Tiles
   FREE_POINTER(encoder->tiles_col_width);
   FREE_POINTER(encoder->tiles_row_height);
diff --git a/src/encoder.h b/src/encoder.h
index 43a6de50..a00b616a 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -120,6 +120,13 @@ typedef struct
   
   const int32_t *tiles_tile_id; /*!<spec:  TileId (6.5.1); dimension: PicSizeInCtbsY */
   
+  //WPP
+  int wpp;
+  
+  //Slices
+  int slice_count;
+  const int* slice_addresses_in_ts;
+  
 } encoder_control;
 
 typedef struct encoder_state {
diff --git a/src/global.h b/src/global.h
index 39146ca8..4394d333 100644
--- a/src/global.h
+++ b/src/global.h
@@ -128,6 +128,7 @@ typedef int16_t coefficient;
 #define SIZE_NONE  15
 
 #define MAX_TILES_PER_DIM 16
+#define MAX_SLICES 16
 
 /* Inlining functions */
 #ifdef _MSC_VER /* Visual studio */

From 5ce518a47ab38006ed1810a6e0b7937bfa49f6d6 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 6 May 2014 07:29:16 +0200
Subject: [PATCH 05/21] lcu_at_tile_start and lcu_at_tile_end helper functions

---
 src/encoder.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/encoder.c b/src/encoder.c
index 3e50bdfe..03e6ce60 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -100,6 +100,24 @@ static int lcu_at_slice_end(encoder_control * const encoder, int lcu_addr_in_rs)
   return 0;
 }
 
+static int lcu_at_tile_start(encoder_control * const encoder, int lcu_addr_in_rs) {
+  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_rs == 0) return 1;
+  if (encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs - 1]] != encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs]]) {
+    return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_tile_end(encoder_control * const encoder, int lcu_addr_in_rs) {
+  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_rs == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  if (encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs + 1]] != encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs]]) {
+    return 1;
+  }
+  return 0;
+}
+
 int encoder_control_init(encoder_control * const encoder, const config * const cfg) {
   if (!cfg) {
     fprintf(stderr, "Config object must not be null!\n");
@@ -304,9 +322,10 @@ int encoder_control_init(encoder_control * const encoder, const config * const c
     
     //FIXME: remove
     if (encoder->slice_count) {
-      
       lcu_at_slice_start(encoder, 0);
       lcu_at_slice_end(encoder, 0);
+      lcu_at_tile_start(encoder, 0);
+      lcu_at_tile_end(encoder, 0);
     }
 
 #ifdef _DEBUG

From a23edd0339ed1e4f775b3787d89feac13b7c4592 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 6 May 2014 08:06:10 +0200
Subject: [PATCH 06/21] added parent to encoder_state

---
 src/encoder.c | 4 +++-
 src/encoder.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/encoder.c b/src/encoder.c
index 03e6ce60..8c695d96 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -387,13 +387,15 @@ int encoder_control_finalize(encoder_control * const encoder) {
   return 1;
 }
 
-static int encoder_state_init_one(encoder_state * const state, const encoder_state * const parent_state, const int tile_x, const int tile_y) {
+static int encoder_state_init_one(encoder_state * const state, encoder_state * const parent_state, const int tile_x, const int tile_y) {
   const encoder_control *encoder;
   int width_in_lcu;
   int height_in_lcu;
   int width;
   int height;
   
+  state->parent = parent_state;
+  
   if (!parent_state) {
     //Use encoder_control from current state (has to be initialized)
     encoder = state->encoder_control;
diff --git a/src/encoder.h b/src/encoder.h
index a00b616a..4956fe39 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -138,6 +138,7 @@ typedef struct encoder_state {
   //List of children, the last item of this list is a pseudo-encoder with encoder_control = NULL
   //Use do { } while (encoder_state->children[++i].encoder_control)
   struct encoder_state *children;
+  struct encoder_state *parent;
   
   //Tile: offset in LCU for current encoder_state
   int32_t lcu_offset_x;

From 6c6adf18c7292a30055b962f06c64c74af6df733 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 6 May 2014 10:13:18 +0200
Subject: [PATCH 07/21] Refactor encoder_state

---
 src/encmain.c   |  21 +-
 src/encoder.c   | 607 ++++++++++++++++++++++++++++++------------------
 src/encoder.h   |  82 +++++--
 src/filter.c    |  12 +-
 src/inter.c     |  34 +--
 src/intra.c     |   4 +-
 src/rdo.c       |  32 +--
 src/sao.c       |  12 +-
 src/search.c    |  50 ++--
 src/transform.c |   6 +-
 10 files changed, 535 insertions(+), 325 deletions(-)

diff --git a/src/encmain.c b/src/encmain.c
index e8e98b32..9170fc40 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -266,12 +266,13 @@ int main(int argc, char *argv[])
          encoder.in.width, encoder.in.height,
          encoder.in.real_width, encoder.in.real_height);
   
-  if (!encoder_state_init(&encoder_state, &encoder)) {
+  encoder_state.encoder_control = &encoder;
+  if (!encoder_state_init(&encoder_state, NULL)) {
     goto exit_failure;
   }
   
-  encoder_state.frame    = 0;
-  encoder_state.QP       = (int8_t)encoder.cfg->qp;
+  encoder_state.global->frame    = 0;
+  encoder_state.global->QP       = (int8_t)encoder.cfg->qp;
 
   // Only the code that handles conformance window coding needs to know
   // the real dimensions. As a quick fix for broken non-multiple of 8 videos,
@@ -282,14 +283,14 @@ int main(int argc, char *argv[])
   //cfg->height = encoder.in.height;
 
   // Start coding cycle while data on input and not on the last frame
-  while(!cfg->frames || encoder_state.frame < cfg->frames) {
+  while(!cfg->frames || encoder_state.global->frame < cfg->frames) {
     int32_t diff;
     double temp_psnr[3];
 
     // Skip '--seek' frames before input.
     // This block can be moved outside this while loop when there is a
     // mechanism to skip the while loop on error.
-    if (encoder_state.frame == 0 && cfg->seek > 0) {
+    if (encoder_state.global->frame == 0 && cfg->seek > 0) {
       int frame_bytes = cfg->width * cfg->height * 3 / 2;
       int error = 0;
 
@@ -312,14 +313,14 @@ int main(int argc, char *argv[])
     // Read one frame from the input
     if (!read_one_frame(input, &encoder_state)) {
       if (!feof(input))
-        fprintf(stderr, "Failed to read a frame %d\n", encoder_state.frame);
+        fprintf(stderr, "Failed to read a frame %d\n", encoder_state.global->frame);
       break;
     }
 
     // The actual coding happens here, after this function we have a coded frame
     encode_one_frame(&encoder_state);
     
-    cur_pic = encoder_state.cur_pic;
+    cur_pic = encoder_state.tile->cur_pic;
 
     if (cfg->debug != NULL) {
       // Write reconstructed frame out.
@@ -353,7 +354,7 @@ int main(int argc, char *argv[])
     temp_psnr[1] = image_psnr(cur_pic->u_data, cur_pic->u_recdata, cfg->width>>1, cfg->height>>1);
     temp_psnr[2] = image_psnr(cur_pic->v_data, cur_pic->v_recdata, cfg->width>>1, cfg->height>>1);
 
-    fprintf(stderr, "POC %4d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f\n", encoder_state.frame,
+    fprintf(stderr, "POC %4d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f\n", encoder_state.global->frame,
            "BPI"[cur_pic->slicetype%3], diff<<3,
            temp_psnr[0], temp_psnr[1], temp_psnr[2]);
 
@@ -371,8 +372,8 @@ int main(int argc, char *argv[])
   fgetpos(output,(fpos_t*)&curpos);
 
   // Print statistics of the coding
-  fprintf(stderr, " Processed %d frames, %10llu bits AVG PSNR: %2.4f %2.4f %2.4f\n", encoder_state.frame, (long long unsigned int) curpos<<3,
-         psnr[0] / encoder_state.frame, psnr[1] / encoder_state.frame, psnr[2] / encoder_state.frame);
+  fprintf(stderr, " Processed %d frames, %10llu bits AVG PSNR: %2.4f %2.4f %2.4f\n", encoder_state.global->frame, (long long unsigned int) curpos<<3,
+         psnr[0] / encoder_state.global->frame, psnr[1] / encoder_state.global->frame, psnr[2] / encoder_state.global->frame);
   fprintf(stderr, " Total time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC);
 
   fclose(input);
diff --git a/src/encoder.c b/src/encoder.c
index 8c695d96..14b82907 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -58,8 +58,8 @@ static void encode_sao(encoder_state *encoder,
  */
 void encoder_state_init_lambda(encoder_state * const encoder_state)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
-  double qp = encoder_state->QP;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+  double qp = encoder_state->global->QP;
   double lambda_scale = 1.0;
   double qp_temp      = qp - 12;
   double lambda;
@@ -77,42 +77,42 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
     lambda *= 0.95;
   }
 
-  encoder_state->cur_lambda_cost = lambda;
+  encoder_state->global->cur_lambda_cost = lambda;
 }
 
-static int lcu_at_slice_start(encoder_control * const encoder, int lcu_addr_in_rs) {
+static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
   int i;
-  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_rs == 0) return 1;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
   for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_rs) return 1;
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
   }
   return 0;
 }
 
-static int lcu_at_slice_end(encoder_control * const encoder, int lcu_addr_in_rs) {
+static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
   int i;
-  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_rs == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
   for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_rs + 1) return 1;
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
   }
   return 0;
 }
 
-static int lcu_at_tile_start(encoder_control * const encoder, int lcu_addr_in_rs) {
-  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_rs == 0) return 1;
-  if (encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs - 1]] != encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs]]) {
+static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
+  if (encoder->tiles_tile_id[lcu_addr_in_ts - 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
     return 1;
   }
   return 0;
 }
 
-static int lcu_at_tile_end(encoder_control * const encoder, int lcu_addr_in_rs) {
-  assert(lcu_addr_in_rs >= 0 && lcu_addr_in_rs < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_rs == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
-  if (encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs + 1]] != encoder->tiles_tile_id[encoder->tiles_ctb_addr_rs_to_ts[lcu_addr_in_rs]]) {
+static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  if (encoder->tiles_tile_id[lcu_addr_in_ts + 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
     return 1;
   }
   return 0;
@@ -387,101 +387,279 @@ int encoder_control_finalize(encoder_control * const encoder) {
   return 1;
 }
 
-static int encoder_state_init_one(encoder_state * const state, encoder_state * const parent_state, const int tile_x, const int tile_y) {
-  const encoder_control *encoder;
-  int width_in_lcu;
-  int height_in_lcu;
-  int width;
-  int height;
-  
-  state->parent = parent_state;
-  
-  if (!parent_state) {
-    //Use encoder_control from current state (has to be initialized)
-    encoder = state->encoder_control;
-    assert(encoder);
-    
-    width_in_lcu = encoder->in.width_in_lcu;
-    height_in_lcu = encoder->in.height_in_lcu;
-    width = encoder->in.width;
-    height = encoder->in.height;
-    
-    state->lcu_offset_x = 0;
-    state->lcu_offset_y = 0;
-  } else {
-    //Use parent encoder_control
-    encoder = parent_state->encoder_control;
-    assert(encoder);
-    state->encoder_control = parent_state->encoder_control;
-    
-    state->lcu_offset_x = encoder->tiles_col_bd[tile_x];
-    state->lcu_offset_y = encoder->tiles_row_bd[tile_y];
-    
-    width_in_lcu = encoder->tiles_col_bd[tile_x+1]-encoder->tiles_col_bd[tile_x];
-    height_in_lcu = encoder->tiles_row_bd[tile_y+1]-encoder->tiles_row_bd[tile_y];
-    width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - state->lcu_offset_x * LCU_WIDTH);
-    height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - state->lcu_offset_y * LCU_WIDTH);
+static int encoder_state_config_global_init(encoder_state * const encoder_state) {
+  encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
+  if(!encoder_state->global->ref) {
+    fprintf(stderr, "Failed to allocate the picture list!\n");
+    return 0;
   }
-  
-  //Ok we have all the variables initialized, do the real work now
-  
-  if (parent_state) {
-    if (!bitstream_init(&state->stream, BITSTREAM_TYPE_MEMORY)) {
-      fprintf(stderr, "Could not initialize stream (subencoder)!\n");
-      return 0;
-    }
-    
-    //FIXME: at some point, we may want to have a ref list for each subencoder (would allow overlapping between frames)
-    state->ref = parent_state->ref;
-    state->ref_list = parent_state->ref_list;
-  } else {
-    // Allocate the bitstream struct
-    if (!bitstream_init(&state->stream, BITSTREAM_TYPE_FILE)) {
-      fprintf(stderr, "Could not initialize stream!\n");
-      return 0;
-    }
-    
-    state->ref = picture_list_init(MAX_REF_PIC_COUNT);
-    if(!state->ref) {
-      fprintf(stderr, "Failed to allocate the picture list!\n");
-      return 0;
-    }
-    state->ref_list = REF_PIC_LIST_0;
-  }
-  
-  state->frame = 0;
-  state->poc = 0;
-  
-  state->cur_pic = picture_alloc(width, height, width_in_lcu, height_in_lcu);
+  encoder_state->global->ref_list = REF_PIC_LIST_0;
+  encoder_state->global->frame = 0;
+  encoder_state->global->poc = 0;
+  return 1;
+}
 
-  if (!state->cur_pic) {
+static void encoder_state_config_global_finalize(encoder_state * const encoder_state) {
+  picture_list_destroy(encoder_state->global->ref);
+}
+
+
+
+static int encoder_state_config_tile_init(encoder_state * const encoder_state, 
+                                          const int lcu_offset_x, const int lcu_offset_y,
+                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
+  
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  encoder_state->tile->cur_pic = picture_alloc(width, height, width_in_lcu, height_in_lcu);
+
+  if (!encoder_state->tile->cur_pic) {
     printf("Error allocating picture!\r\n");
     return 0;
   }
   
   // Init coeff data table
-  state->cur_pic->coeff_y = MALLOC(coefficient, width * height);
-  state->cur_pic->coeff_u = MALLOC(coefficient, (width * height) >> 2);
-  state->cur_pic->coeff_v = MALLOC(coefficient, (width * height) >> 2);
-
-  state->children = NULL;
+  //FIXME: move them
+  encoder_state->tile->cur_pic->coeff_y = MALLOC(coefficient, width * height);
+  encoder_state->tile->cur_pic->coeff_u = MALLOC(coefficient, (width * height) >> 2);
+  encoder_state->tile->cur_pic->coeff_v = MALLOC(coefficient, (width * height) >> 2);
   
-  // Set CABAC output bitstream
-  state->cabac.stream = &state->stream;
+  encoder_state->tile->lcu_offset_x = lcu_offset_x;
+  encoder_state->tile->lcu_offset_y = lcu_offset_y;
   
+  encoder_state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
   return 1;
 }
 
-int encoder_state_init(encoder_state * const encoder_state, const encoder_control * const encoder) {
-  encoder_state->encoder_control = encoder;
-  if (!encoder_state_init_one(encoder_state, NULL, 0, 0)) {
-    fprintf(stderr, "Could not initialize main encoder state!\n");
-    return 0;
+static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) {
+  picture_free(encoder_state->tile->cur_pic);
+  encoder_state->tile->cur_pic = NULL;
+}
+
+static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
+                                          const int start_address_in_ts, const int end_address_in_ts) {
+  //Has to be called AFTER initializing encoder_state->tile (tile->lcu_offset_in_ts is read below)
+  encoder_state->slice->start_in_ts = start_address_in_ts - encoder_state->tile->lcu_offset_in_ts;
+  encoder_state->slice->end_in_ts = end_address_in_ts - encoder_state->tile->lcu_offset_in_ts;
+  //start_in_rs/end_in_rs are looked up in tiles_ctb_addr_ts_to_rs using the unshifted arguments
+  encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
+  encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
+  return 1;
+}
+
+static void encoder_state_config_slice_finalize(encoder_state * const encoder_state) {
+  //Nothing to do (yet?)
+}
+
+static int encoder_state_config_wfrow_init(encoder_state * const encoder_state, 
+                                          const int lcu_offset_y) {
+  
+  encoder_state->wfrow->lcu_offset_y = lcu_offset_y;
+  return 1;
+}
+
+static void encoder_state_config_wfrow_finalize(encoder_state * const encoder_state) {
+  //Nothing to do (yet?)
+}
+
+
+int encoder_state_init(encoder_state * const child_state, encoder_state * const parent_state) {
+  //Initializes child_state and, recursively, its sub-encoders; returns 1 on success, 0 on failure.
+  //If parent_state is NULL, child_state->encoder_control has to be set.
+  //If parent_state is not NULL, child_state->type has to be set, and each of child_state->global,
+  //child_state->tile, child_state->slice and child_state->wfrow should either be NULL, in order
+  //to inherit from parent, or point to a valid structure.
+  
+  child_state->parent = parent_state;
+  child_state->children = MALLOC(encoder_state, 1);
+  if (!child_state->children) {
+    fprintf(stderr, "Failed to allocate memory for children...\n");
+    return 0;
+  }
+  child_state->children[0].encoder_control = NULL;
+  
+  if (!parent_state) {
+    const encoder_control * const encoder = child_state->encoder_control;
+    child_state->type = ENCODER_STATE_TYPE_MAIN;
+    assert(child_state->encoder_control);
+    child_state->global = MALLOC(encoder_state_config_global, 1);
+    if (!child_state->global || !encoder_state_config_global_init(child_state)) {
+      fprintf(stderr, "Could not initialize encoder_state->global!\n");
+      return 0;
+    }
+    child_state->tile = MALLOC(encoder_state_config_tile, 1);
+    if (!child_state->tile || !encoder_state_config_tile_init(child_state, 0, 0, encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu)) {
+      fprintf(stderr, "Could not initialize encoder_state->tile!\n");
+      return 0;
+    }
+    child_state->slice = MALLOC(encoder_state_config_slice, 1);
+    if (!child_state->slice || !encoder_state_config_slice_init(child_state, 0, encoder->in.width_in_lcu * encoder->in.height_in_lcu - 1)) {
+      fprintf(stderr, "Could not initialize encoder_state->slice!\n");
+      return 0;
+    }
+    child_state->wfrow = MALLOC(encoder_state_config_wfrow, 1);
+    if (!child_state->wfrow || !encoder_state_config_wfrow_init(child_state, 0)) {
+      fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
+      return 0;
+    }
+  } else {
+    child_state->encoder_control = parent_state->encoder_control;
+    if (!child_state->global) child_state->global = parent_state->global;
+    if (!child_state->tile) child_state->tile = parent_state->tile;
+    if (!child_state->slice) child_state->slice = parent_state->slice;
+    if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow;
   }
   
-  encoder_state->stream.file.output = encoder->out.file;
+  //Allocate bitstream
+  if (child_state->type == ENCODER_STATE_TYPE_MAIN) {
+    //Main encoder outputs to file
+    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_FILE)) {
+      fprintf(stderr, "Could not initialize stream!\n");
+      return 0;
+    }
+    child_state->stream.file.output = child_state->encoder_control->out.file;
+  } else {
+    //Other encoders use a memory bitstream
+    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_MEMORY)) {
+      fprintf(stderr, "Could not initialize stream!\n");
+      return 0;
+    }
+  }
   
-  if (encoder->tiles_enable) {
+  // Set CABAC output bitstream
+  child_state->cabac.stream = &child_state->stream;
+  
+  //Create sub-encoders
+  {
+    const encoder_control * const encoder = child_state->encoder_control;
+    int child_count = 0;
+    //We first check the type of this element.
+    //If it's a MAIN, it can allow both slices or tiles as child
+    //If it's a TILE, it can allow slices as child, if its parent is not a slice, or wavefront rows if there is no other children
+    //If it's a SLICE, it can allow tiles as child, if its parent is not a tile, or wavefront rows if there is no other children
+    //If it's a WAVEFRONT_ROW, it doesn't allow any children
+    int children_allow_wavefront_row = 0;
+    int children_allow_slice = 0;
+    int children_allow_tile = 0;
+    
+    int start_in_ts, end_in_ts;
+    
+    switch(child_state->type) {
+      case ENCODER_STATE_TYPE_MAIN:
+        children_allow_slice = 1;
+        children_allow_tile = 1;
+        break;
+      case ENCODER_STATE_TYPE_SLICE:
+        assert(child_state->parent);
+        if (child_state->parent->type != ENCODER_STATE_TYPE_TILE) children_allow_tile = 1;
+        children_allow_wavefront_row = encoder->wpp;
+        break;
+      case ENCODER_STATE_TYPE_TILE:
+        assert(child_state->parent);
+        if (child_state->parent->type != ENCODER_STATE_TYPE_SLICE) children_allow_slice = 1;
+        children_allow_wavefront_row = encoder->wpp;
+        break;
+      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
+        break;
+      default:
+        fprintf(stderr, "Invalid encoder_state->type %d!\n", child_state->type);
+        assert(0);
+    }
+    
+    //Full span to analyze
+    start_in_ts = child_state->tile->lcu_offset_in_ts + child_state->slice->start_in_ts;
+    end_in_ts = MIN(child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu, child_state->tile->lcu_offset_in_ts + child_state->slice->end_in_ts);
+    while (start_in_ts < end_in_ts) {
+      encoder_state *new_child = NULL;
+      int range_start = start_in_ts;
+      int range_end_slice = start_in_ts; //Will be incremented to get the range of the "thing"
+      int range_end_tile = start_in_ts; //Will be incremented to get the range of the "thing"
+      
+      int tile_allowed = lcu_at_tile_start(encoder, range_start) && children_allow_tile;
+      int slice_allowed = lcu_at_slice_start(encoder, range_start) && children_allow_slice;
+      
+      //Find the smallest structure following the cursor
+      if (slice_allowed) {
+        while(!lcu_at_slice_end(encoder, range_end_slice)) {
+          ++range_end_slice;
+        }
+      }
+      
+      if (tile_allowed) {
+        while(!lcu_at_tile_end(encoder, range_end_tile)) {
+          ++range_end_tile;
+        }
+      }
+      
+      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed);
+      
+      if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
+        //Create a slice
+        
+        printf("%p slice: %d - %d\n", child_state, range_start, range_end_slice);
+        new_child = &child_state->children[child_count];
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_SLICE;
+        new_child->global = child_state->global;
+        new_child->tile = child_state->tile;
+        new_child->wfrow = child_state->wfrow;
+        new_child->slice = MALLOC(encoder_state_config_slice, 1);
+        if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) {
+          fprintf(stderr, "Could not initialize encoder_state->slice!\n");
+          return 0;
+        }
+      }
+      
+      if ((!slice_allowed || (range_end_slice < range_end_tile)) && !new_child && tile_allowed) {
+        //Create a tile
+        int tile_id = encoder->tiles_tile_id[range_start];
+        int tile_x = tile_id % encoder->tiles_num_tile_columns;
+        int tile_y = tile_id / encoder->tiles_num_tile_columns;
+        
+        int lcu_offset_x = encoder->tiles_col_bd[tile_x];
+        int lcu_offset_y = encoder->tiles_row_bd[tile_y];
+        int width_in_lcu = encoder->tiles_col_bd[tile_x+1]-encoder->tiles_col_bd[tile_x];
+        int height_in_lcu = encoder->tiles_row_bd[tile_y+1]-encoder->tiles_row_bd[tile_y];
+        int width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - lcu_offset_x * LCU_WIDTH);
+        int height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - lcu_offset_y * LCU_WIDTH);
+        
+        printf("%p tile: %d - %d (%d)\n", child_state, range_start, range_end_tile, tile_id);
+        new_child = &child_state->children[child_count];
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_TILE;
+        new_child->global = child_state->global;
+        new_child->tile = MALLOC(encoder_state_config_tile, 1);
+        new_child->slice = child_state->slice;
+        new_child->wfrow = child_state->wfrow;
+        
+        if (!new_child->tile || !encoder_state_config_tile_init(new_child, lcu_offset_x, lcu_offset_y, width, height, width_in_lcu, height_in_lcu)) {
+          fprintf(stderr, "Could not initialize encoder_state->tile!\n");
+          return 0;
+        }
+      }
+      
+      if (new_child) {
+        child_state->children = realloc(child_state->children, sizeof(encoder_state) * (2+child_count)); //May move the array: new_child must not be used past this point
+        if (!child_state->children) {
+          fprintf(stderr, "Failed to allocate memory for children...\n");
+          return 0;
+        }
+        child_state->children[1+child_count].encoder_control = NULL; //Terminator; only written once the allocation is known to be valid
+        if (!encoder_state_init(&child_state->children[child_count], child_state)) {
+          fprintf(stderr, "Unable to init child...\n");
+          return 0;
+        }
+        child_count += 1;
+      }
+      
+      start_in_ts = MAX(range_end_slice, range_end_tile) + 1;
+    }
+    
+    if (children_allow_wavefront_row) {
+      printf("Wavefront\n");
+    }
+  }
+  
+/*  if (encoder->tiles_enable) {
     int x,y;
     //Allocate subencoders (valid subencoder have a non null encoder_control field, so we use a null one to mark the end of the list)
     encoder_state->children = MALLOC(struct encoder_state, encoder->tiles_num_tile_columns * encoder->tiles_num_tile_rows + 1);
@@ -496,49 +674,50 @@ int encoder_state_init(encoder_state * const encoder_state, const encoder_contro
           return 0;
         }
       }
-    }
-  }
-  
+  */
   return 1;
 }
 
-static int encoder_state_finalize_one(encoder_state * const encoder_state) {
-  picture_free(encoder_state->cur_pic);
-  encoder_state->cur_pic = NULL;
-  
-  bitstream_finalize(&encoder_state->stream);
-  return 1;
-}
-
-
-int encoder_state_finalize(encoder_state * const encoder_state) {
+void encoder_state_finalize(encoder_state * const encoder_state) {
   if (encoder_state->children) {
     int i=0;
     do {
-      encoder_state_finalize_one(&encoder_state->children[i]);
+      if (encoder_state->children[i].encoder_control) encoder_state_finalize(&encoder_state->children[i]); else break; //children[0] may already be the terminator
     } while (encoder_state->children[++i].encoder_control);
     
     FREE_POINTER(encoder_state->children);
   }
   
-  encoder_state_finalize_one(encoder_state);
-  picture_list_destroy(encoder_state->ref);
-  return 1;
-}
-
-static void encoder_clear_refs(encoder_state *encoder_state) {
-  if (encoder_state->children) {
-    int i=0;
-    do {
-      encoder_state->children[i].poc = 0;
-    } while (encoder_state->children[++i].encoder_control);
+  if (!encoder_state->parent || (encoder_state->parent->wfrow != encoder_state->wfrow)) {
+    encoder_state_config_wfrow_finalize(encoder_state);
+    FREE_POINTER(encoder_state->wfrow);
   }
   
-  while (encoder_state->ref->used_size) {
-    picture_list_rem(encoder_state->ref, encoder_state->ref->used_size - 1);
+  if (!encoder_state->parent || (encoder_state->parent->slice != encoder_state->slice)) {
+    encoder_state_config_slice_finalize(encoder_state);
+    FREE_POINTER(encoder_state->slice);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->tile != encoder_state->tile)) {
+    encoder_state_config_tile_finalize(encoder_state);
+    FREE_POINTER(encoder_state->tile);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->global != encoder_state->global)) {
+    encoder_state_config_global_finalize(encoder_state);
+    FREE_POINTER(encoder_state->global);
+  }
+  
+  bitstream_finalize(&encoder_state->stream);
+}
+
+
+static void encoder_clear_refs(encoder_state *encoder_state) {
+  while (encoder_state->global->ref->used_size) {
+    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
   }
 
-  encoder_state->poc = 0;
+  encoder_state->global->poc = 0;
 }
 
 void encoder_control_input_init(encoder_control * const encoder,
@@ -594,7 +773,7 @@ static void write_aud(encoder_state * const encoder_state)
 
 static void substream_write_bitstream(encoder_state * const encoder_state, const int end_of_sub_stream) {
   const encoder_control * const encoder = encoder_state->encoder_control;
-  const picture* const cur_pic = encoder_state->cur_pic;
+  const picture* const cur_pic = encoder_state->tile->cur_pic;
   const int lcu_count = cur_pic->width_in_lcu * cur_pic->height_in_lcu;
   int lcu_id;
   vector2d lcu;
@@ -627,19 +806,19 @@ static void substream_encode(encoder_state * const encoder_state) {
   const unsigned long long int debug_bitstream_position = bitstream_tell(&(encoder_state->stream));
 #endif
   
-  yuv_t *hor_buf = yuv_t_alloc(encoder_state->cur_pic->width);
+  yuv_t *hor_buf = yuv_t_alloc(encoder_state->tile->cur_pic->width);
   // Allocate 2 extra luma pixels so we get 1 extra chroma pixel for the
   // for the extra pixel on the top right.
   yuv_t *ver_buf = yuv_t_alloc(LCU_WIDTH + 2);
   
   cabac_start(&encoder_state->cabac);
-  init_contexts(encoder_state, encoder_state->QP, encoder_state->cur_pic->slicetype);
+  init_contexts(encoder_state, encoder_state->global->QP, encoder_state->tile->cur_pic->slicetype);
 
   // Initialize lambda value(s) to use in search
   encoder_state_init_lambda(encoder_state);
 
   {
-    picture* const cur_pic = encoder_state->cur_pic;
+    picture* const cur_pic = encoder_state->tile->cur_pic;
     int lcu_id;
     int lcu_count = cur_pic->width_in_lcu * cur_pic->height_in_lcu;
     
@@ -736,14 +915,14 @@ static void substream_encode(encoder_state * const encoder_state) {
 }
 
 static void subencoder_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
-  const int source_offset_x = source_enc->lcu_offset_x * LCU_WIDTH;
-  const int source_offset_y = source_enc->lcu_offset_y * LCU_WIDTH;
+  const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
+  const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
   
-  const int target_offset_x = target_enc->lcu_offset_x * LCU_WIDTH;
-  const int target_offset_y = target_enc->lcu_offset_y * LCU_WIDTH;
+  const int target_offset_x = target_enc->tile->lcu_offset_x * LCU_WIDTH;
+  const int target_offset_y = target_enc->tile->lcu_offset_y * LCU_WIDTH;
   
-  int source_stride = source_enc->cur_pic->width;
-  int target_stride = target_enc->cur_pic->width;
+  int source_stride = source_enc->tile->cur_pic->width;
+  int target_stride = target_enc->tile->cur_pic->width;
   
   int width;
   int height;
@@ -755,21 +934,21 @@ static void subencoder_blit_pixels(const encoder_state * const target_enc, pixel
   assert(target_enc->children || source_enc->children);
 
   if (is_y_channel) {
-    target_offset = source_offset_x + source_offset_y * target_enc->cur_pic->width;
-    source_offset = target_offset_x + target_offset_y * source_enc->cur_pic->width;
+    target_offset = source_offset_x + source_offset_y * target_enc->tile->cur_pic->width;
+    source_offset = target_offset_x + target_offset_y * source_enc->tile->cur_pic->width;
   } else {
-    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->cur_pic->width/2;
-    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->cur_pic->width/2;
+    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->tile->cur_pic->width/2;
+    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->tile->cur_pic->width/2;
   }
   
   if (target_enc->children) {
     //Use information from the source
-    width = MIN(source_enc->cur_pic->width_in_lcu * LCU_WIDTH, target_enc->cur_pic->width - source_offset_x);
-    height = MIN(source_enc->cur_pic->height_in_lcu * LCU_WIDTH, target_enc->cur_pic->height - source_offset_y);
+    width = MIN(source_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->width - source_offset_x);
+    height = MIN(source_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->height - source_offset_y);
   } else {
     //Use information from the target
-    width = MIN(target_enc->cur_pic->width_in_lcu * LCU_WIDTH, source_enc->cur_pic->width - target_offset_x);
-    height = MIN(target_enc->cur_pic->height_in_lcu * LCU_WIDTH, source_enc->cur_pic->height - target_offset_y);
+    width = MIN(target_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->width - target_offset_x);
+    height = MIN(target_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->height - target_offset_y);
   }
   
   if (!is_y_channel) {
@@ -789,9 +968,9 @@ void encode_one_frame(encoder_state * const main_state)
   const encoder_control * const encoder = main_state->encoder_control;
   bitstream * const stream = &main_state->stream;
 
-  const int is_first_frame = (main_state->frame == 0);
-  const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->frame % 2 == 0);
-  const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->frame % encoder->cfg->intra_period) == 0);
+  const int is_first_frame = (main_state->global->frame == 0);
+  const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
+  const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
   const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
 
 
@@ -803,8 +982,8 @@ void encode_one_frame(encoder_state * const main_state)
     // Clear the reference list
     encoder_clear_refs(main_state);
 
-    main_state->cur_pic->slicetype = SLICE_I;
-    main_state->cur_pic->type = NAL_IDR_W_RADL;
+    main_state->tile->cur_pic->slicetype = SLICE_I;
+    main_state->tile->cur_pic->type = NAL_IDR_W_RADL;
 
     // Access Unit Delimiter (AUD)
     if (encoder->aud_enable)
@@ -825,7 +1004,7 @@ void encode_one_frame(encoder_state * const main_state)
     encode_pic_parameter_set(main_state);
     bitstream_align(stream);
 
-    if (main_state->frame == 0) {
+    if (main_state->global->frame == 0) {
       // Prefix SEI
       nal_write(stream, PREFIX_SEI_NUT, 0, 0);
       encode_prefix_sei_version(main_state);
@@ -833,8 +1012,8 @@ void encode_one_frame(encoder_state * const main_state)
     }
   } else {
     // When intra period == 1, all pictures are intra
-    main_state->cur_pic->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
-    main_state->cur_pic->type = NAL_TRAIL_R;
+    main_state->tile->cur_pic->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
+    main_state->tile->cur_pic->type = NAL_TRAIL_R;
 
     // Access Unit Delimiter (AUD)
     if (encoder->aud_enable)
@@ -860,24 +1039,21 @@ void encode_one_frame(encoder_state * const main_state)
     #pragma omp parallel for
     for (i = 0; i < encoder->tiles_num_tile_rows * encoder->tiles_num_tile_columns; ++i) {
       encoder_state *subencoder = &(main_state->children[i]);
-
-      //TODO: ref frames
       
-      subencoder->QP = main_state->QP;
+      subencoder_blit_pixels(subencoder, subencoder->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
+      subencoder_blit_pixels(subencoder, subencoder->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
+      subencoder_blit_pixels(subencoder, subencoder->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
       
-      subencoder_blit_pixels(subencoder, subencoder->cur_pic->y_data, main_state, main_state->cur_pic->y_data, 1);
-      subencoder_blit_pixels(subencoder, subencoder->cur_pic->u_data, main_state, main_state->cur_pic->u_data, 0);
-      subencoder_blit_pixels(subencoder, subencoder->cur_pic->v_data, main_state, main_state->cur_pic->v_data, 0);
-      
-      subencoder->cur_pic->slicetype = main_state->cur_pic->slicetype;
-      subencoder->cur_pic->type = main_state->cur_pic->type;
+      //FIXME: remove this once these are in slice
+      subencoder->tile->cur_pic->slicetype = main_state->tile->cur_pic->slicetype;
+      subencoder->tile->cur_pic->type = main_state->tile->cur_pic->type;
       
       substream_encode(subencoder);
       substream_write_bitstream(subencoder, (main_state->children[i+1].encoder_control) != NULL);
       
-      subencoder_blit_pixels(main_state, main_state->cur_pic->y_recdata, subencoder, subencoder->cur_pic->y_recdata, 1);
-      subencoder_blit_pixels(main_state, main_state->cur_pic->u_recdata, subencoder, subencoder->cur_pic->u_recdata, 0);
-      subencoder_blit_pixels(main_state, main_state->cur_pic->v_recdata, subencoder, subencoder->cur_pic->v_recdata, 0);
+      subencoder_blit_pixels(main_state, main_state->tile->cur_pic->y_recdata, subencoder, subencoder->tile->cur_pic->y_recdata, 1);
+      subencoder_blit_pixels(main_state, main_state->tile->cur_pic->u_recdata, subencoder, subencoder->tile->cur_pic->u_recdata, 0);
+      subencoder_blit_pixels(main_state, main_state->tile->cur_pic->v_recdata, subencoder, subencoder->tile->cur_pic->v_recdata, 0);
     }
     
     //We should do the slice header here, because we can have the entry points
@@ -899,7 +1075,8 @@ void encode_one_frame(encoder_state * const main_state)
   // Calculate checksum
   add_checksum(main_state);
 
-  main_state->cur_pic->poc = main_state->poc;
+  //FIXME: Why is this needed?
+  main_state->tile->cur_pic->poc = main_state->global->poc;
 }
 
 static void fill_after_frame(unsigned height, unsigned array_width,
@@ -945,38 +1122,38 @@ int read_one_frame(FILE* file, const encoder_state * const encoder_state)
 {
   unsigned width = encoder_state->encoder_control->in.real_width;
   unsigned height = encoder_state->encoder_control->in.real_height;
-  unsigned array_width = encoder_state->cur_pic->width;
-  unsigned array_height = encoder_state->cur_pic->height;
+  unsigned array_width = encoder_state->tile->cur_pic->width;
+  unsigned array_height = encoder_state->tile->cur_pic->height;
 
   if (width != array_width) {
     // In the case of frames not being aligned on 8 bit borders, bits need to be copied to fill them in.
     if (!read_and_fill_frame_data(file, width, height, array_width,
-                                  encoder_state->cur_pic->y_data) ||
+                                  encoder_state->tile->cur_pic->y_data) ||
         !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
-                                  encoder_state->cur_pic->u_data) ||
+                                  encoder_state->tile->cur_pic->u_data) ||
         !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
-                                  encoder_state->cur_pic->v_data))
+                                  encoder_state->tile->cur_pic->v_data))
       return 0;
   } else {
     // Otherwise the data can be read directly to the array.
     unsigned y_size = width * height;
     unsigned uv_size = (width >> 1) * (height >> 1);
-    if (y_size  != fread(encoder_state->cur_pic->y_data, sizeof(unsigned char),
+    if (y_size  != fread(encoder_state->tile->cur_pic->y_data, sizeof(unsigned char),
                          y_size, file) ||
-        uv_size != fread(encoder_state->cur_pic->u_data, sizeof(unsigned char),
+        uv_size != fread(encoder_state->tile->cur_pic->u_data, sizeof(unsigned char),
                          uv_size, file) ||
-        uv_size != fread(encoder_state->cur_pic->v_data, sizeof(unsigned char),
+        uv_size != fread(encoder_state->tile->cur_pic->v_data, sizeof(unsigned char),
                          uv_size, file))
       return 0;
   }
 
   if (height != array_height) {
     fill_after_frame(height, array_width, array_height,
-                     encoder_state->cur_pic->y_data);
+                     encoder_state->tile->cur_pic->y_data);
     fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
-                     encoder_state->cur_pic->u_data);
+                     encoder_state->tile->cur_pic->u_data);
     fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
-                     encoder_state->cur_pic->v_data);
+                     encoder_state->tile->cur_pic->v_data);
   }
   return 1;
 }
@@ -989,7 +1166,7 @@ int read_one_frame(FILE* file, const encoder_state * const encoder_state)
 static void add_checksum(encoder_state * const encoder_state)
 {
   bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   unsigned char checksum[3][SEI_HASH_MAX_LENGTH];
   uint32_t checksum_val;
   unsigned int i;
@@ -1016,7 +1193,7 @@ static void add_checksum(encoder_state * const encoder_state)
 void encode_access_unit_delimiter(encoder_state * const encoder_state)
 {
   bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   uint8_t pic_type = cur_pic->slicetype == SLICE_I ? 0
                    : cur_pic->slicetype == SLICE_P ? 1
                    :                                             2;
@@ -1087,7 +1264,7 @@ void encode_pic_parameter_set(encoder_state * const encoder_state)
 
   WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
   WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
-  WRITE_SE(stream, ((int8_t)encoder_state->QP)-26, "pic_init_qp_minus26");
+  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
   WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
   WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
   WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
@@ -1250,7 +1427,8 @@ static void encode_scaling_list(encoder_state * const encoder_state)
 void encode_seq_parameter_set(encoder_state * const encoder_state)
 {
   bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  //FIXME: use encoder_control instead of cur_pic
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
 
 #ifdef _DEBUG
   printf("=========== Sequence Parameter Set ID: 0 ===========\n");
@@ -1493,43 +1671,33 @@ void encoder_next_frame(encoder_state *encoder_state) {
   picture *old_pic;
   
   // Remove the ref pic (if present)
-  if (encoder_state->ref->used_size == (uint32_t)encoder->cfg->ref_frames) {
-    picture_list_rem(encoder_state->ref, encoder_state->ref->used_size-1);
+  if (encoder_state->global->ref->used_size == (uint32_t)encoder->cfg->ref_frames) {
+    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size-1);
   }
   // Add current picture as reference
-  picture_list_add(encoder_state->ref, encoder_state->cur_pic);
+  picture_list_add(encoder_state->global->ref, encoder_state->tile->cur_pic);
   // Allocate new memory to current picture
-  old_pic = encoder_state->cur_pic;
+  old_pic = encoder_state->tile->cur_pic;
   // TODO: reuse memory from old reference
-  encoder_state->cur_pic = picture_alloc(encoder_state->cur_pic->width, encoder_state->cur_pic->height, encoder_state->cur_pic->width_in_lcu, encoder_state->cur_pic->height_in_lcu);
+  encoder_state->tile->cur_pic = picture_alloc(encoder_state->tile->cur_pic->width, encoder_state->tile->cur_pic->height, encoder_state->tile->cur_pic->width_in_lcu, encoder_state->tile->cur_pic->height_in_lcu);
 
+  //FIXME: does the coeff_* really belongs to cur_pic?
   // Copy pointer from the last cur_pic because we don't want to reallocate it
-  MOVE_POINTER(encoder_state->cur_pic->coeff_y,old_pic->coeff_y);
-  MOVE_POINTER(encoder_state->cur_pic->coeff_u,old_pic->coeff_u);
-  MOVE_POINTER(encoder_state->cur_pic->coeff_v,old_pic->coeff_v);
+  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_y,old_pic->coeff_y);
+  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_u,old_pic->coeff_u);
+  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_v,old_pic->coeff_v);
   
   picture_free(old_pic);
 
-  encoder_state->frame++;
-  encoder_state->poc++;
-  
-  if (encoder_state->children) {
-    int x,y;
-    for (y=0; y < encoder->tiles_num_tile_rows; ++y) {
-      for (x=0; x < encoder->tiles_num_tile_columns; ++x) {
-        const int i = y * encoder->tiles_num_tile_columns + x;
-        encoder_state->children[i].frame++;
-        encoder_state->children[i].poc++;
-      }
-    }
-  } 
+  encoder_state->global->frame++;
+  encoder_state->global->poc++;
 }
 
 void encode_slice_header(encoder_state * const encoder_state)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
 
 #ifdef _DEBUG
   printf("=========== Slice ===========\n");
@@ -1557,9 +1725,9 @@ void encode_slice_header(encoder_state * const encoder_state)
   if (cur_pic->type != NAL_IDR_W_RADL
       && cur_pic->type != NAL_IDR_N_LP) {
       int j;
-      int ref_negative = encoder_state->ref->used_size;
+      int ref_negative = encoder_state->global->ref->used_size;
       int ref_positive = 0;
-      WRITE_U(stream, encoder_state->poc&0xf, 4, "pic_order_cnt_lsb");
+      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
       WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
       WRITE_UE(stream, ref_negative, "num_negative_pics");
       WRITE_UE(stream, ref_positive, "num_positive_pics");
@@ -1582,7 +1750,7 @@ void encode_slice_header(encoder_state * const encoder_state)
 
   if (cur_pic->slicetype != SLICE_I) {
       WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
-        WRITE_UE(stream, encoder_state->ref->used_size-1, "num_ref_idx_l0_active_minus1");
+        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
       WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
   }
 
@@ -1606,7 +1774,7 @@ static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
                              color_index color_i)
 {
   cabac_data * const cabac = &encoder_state->cabac;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   sao_eo_cat i;
 
   // Skip colors with no SAO.
@@ -1690,14 +1858,14 @@ void encode_coding_tree(encoder_state * const encoder_state,
                         uint16_t x_ctb, uint16_t y_ctb, uint8_t depth)
 {
   cabac_data * const cabac = &encoder_state->cabac;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   cu_info *cur_cu = &cur_pic->cu_array[x_ctb + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)];
   uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
   uint8_t split_model = 0;
   
   //Absolute ctb
-  uint16_t abs_x_ctb = x_ctb + (encoder_state->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
-  uint16_t abs_y_ctb = y_ctb + (encoder_state->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
+  uint16_t abs_x_ctb = x_ctb + (encoder_state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
+  uint16_t abs_y_ctb = y_ctb + (encoder_state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
 
   // Check for slice border
   uint8_t border_x = ((encoder_state->encoder_control->in.width) < (abs_x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
@@ -1852,7 +2020,7 @@ void encode_coding_tree(encoder_state * const encoder_state,
             //if(encoder_state->ref_idx_num[uiRefListIdx] > 0)
             {
           if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) {
-            if (encoder_state->ref->used_size != 1) { //encoder_state->ref_idx_num[uiRefListIdx] != 1)//NumRefIdx != 1)
+            if (encoder_state->global->ref->used_size != 1) { //encoder_state->ref_idx_num[uiRefListIdx] != 1)//NumRefIdx != 1)
               // parseRefFrmIdx
               int32_t ref_frame = cur_cu->inter.mv_ref;
 
@@ -1861,7 +2029,7 @@ void encode_coding_tree(encoder_state * const encoder_state,
 
               if (ref_frame > 0) {
                 int32_t i;
-                int32_t ref_num = encoder_state->ref->used_size - 2;
+                int32_t ref_num = encoder_state->global->ref->used_size - 2;
 
                 cabac->ctx = &(cabac->ctx_cu_ref_pic_model[1]);
                 ref_frame--;
@@ -1879,7 +2047,7 @@ void encode_coding_tree(encoder_state * const encoder_state,
               }
             }
 
-            if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ encoder_state->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
+            if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ encoder_state->global->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
               const int32_t mvd_hor = cur_cu->inter.mvd[0];
               const int32_t mvd_ver = cur_cu->inter.mvd[1];
               const int8_t hor_abs_gr0 = mvd_hor != 0;
@@ -2393,15 +2561,15 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
           coeffcost += abs((int)temp_coeff[i]);
           coeffcost2 += abs((int)temp_coeff2[i]);
         }
-        cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->cur_lambda_cost+0.5);
-        cost2 += (coeffcost2 + (coeffcost2>>1))*((int)encoder_state->cur_lambda_cost+0.5);
+        cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
+        cost2 += (coeffcost2 + (coeffcost2>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
         // Full RDO
       } else if(encoder->rdo == 2) {
         coeffcost = get_coeff_cost(encoder_state, temp_coeff, 4, 0, scan_idx_luma);
         coeffcost2 = get_coeff_cost(encoder_state, temp_coeff2, 4, 0, scan_idx_luma);
 
-        cost  += coeffcost*((int)encoder_state->cur_lambda_cost+0.5);
-        cost2 += coeffcost2*((int)encoder_state->cur_lambda_cost+0.5);
+        cost  += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5);
+        cost2 += coeffcost2*((int)encoder_state->global->cur_lambda_cost+0.5);
       }
 
       cur_cu->intra[PU_INDEX(x_pu, y_pu)].tr_skip = (cost < cost2);
@@ -2532,7 +2700,7 @@ void encode_transform_tree(encoder_state * const encoder_state, int32_t x, int32
 static void encode_transform_unit(encoder_state * const encoder_state,
                                   int x_pu, int y_pu, int depth, int tr_depth)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   uint8_t width = LCU_WIDTH >> depth;
   uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
 
@@ -2680,7 +2848,7 @@ void encode_transform_coeff(encoder_state * const encoder_state, int32_t x_pu,in
   cabac_data * const cabac = &encoder_state->cabac;
   int32_t x_cu = x_pu / 2;
   int32_t y_cu = y_pu / 2;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   cu_info *cur_cu = &cur_pic->cu_array[x_cu + y_cu * (cur_pic->width_in_lcu << MAX_DEPTH)];
 
   // NxN signifies implicit transform split at the first transform level.
@@ -3038,4 +3206,3 @@ void encode_last_significant_xy(encoder_state * const encoder_state,
 
   // end LastSignificantXY
 }
-
diff --git a/src/encoder.h b/src/encoder.h
index 4956fe39..bb3f2279 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -129,32 +129,74 @@ typedef struct
   
 } encoder_control;
 
+typedef enum {
+  ENCODER_STATE_TYPE_INVALID = 'i',
+  ENCODER_STATE_TYPE_MAIN = 'M',
+  ENCODER_STATE_TYPE_SLICE = 'S',
+  ENCODER_STATE_TYPE_TILE = 'T',
+  ENCODER_STATE_TYPE_WAVEFRONT_ROW = 'W',
+} encoder_state_type;
+
+
+
+typedef struct {
+  double cur_lambda_cost;
+  
+  int32_t frame;
+  int32_t poc; /*!< \brief picture order count */
+  
+  int8_t QP;   //!< \brief Quantization parameter
+  
+  //Current picture available references
+  picture_list *ref;
+  int8_t ref_list;
+  //int8_t ref_idx_num[2];
+  
+} encoder_state_config_global;
+
+typedef struct {
+  //Current picture to encode
+  picture *cur_pic;
+  
+  //Tile: offset in LCU for current encoder_state in global coordinates
+  int32_t lcu_offset_x;
+  int32_t lcu_offset_y;
+  
+  //Position of the first element in tile scan in global coordinates
+  int32_t lcu_offset_in_ts;
+} encoder_state_config_tile;
+
+typedef struct {
+  //Local coordinates, relative to *tile
+  int32_t start_in_ts;
+  int32_t end_in_ts;
+  
+  //Global coordinates
+  int32_t start_in_rs;
+  int32_t end_in_rs;
+} encoder_state_config_slice;
+
+typedef struct {
+  //Row of the wavefront, relative to *tile
+  int32_t lcu_offset_y;
+} encoder_state_config_wfrow;
+
 typedef struct encoder_state {
   const encoder_control *encoder_control;
-  double cur_lambda_cost;
-  bitstream stream;
-  cabac_data cabac;
-  
+  encoder_state_type type;
+
   //List of children, the last item of this list is a pseudo-encoder with encoder_control = NULL
   //Use do { } while (encoder_state->children[++i].encoder_control)
   struct encoder_state *children;
   struct encoder_state *parent;
   
-  //Tile: offset in LCU for current encoder_state
-  int32_t lcu_offset_x;
-  int32_t lcu_offset_y;
+  encoder_state_config_global *global;
+  encoder_state_config_tile   *tile;
+  encoder_state_config_slice  *slice;
+  encoder_state_config_wfrow  *wfrow;
   
-  //Current picture to encode
-  picture *cur_pic;
-  int32_t frame;
-  int32_t poc; /*!< \brief picture order count */
-  
-  //Current picture available references
-  picture_list *ref;
-  int8_t ref_list;
-  int8_t ref_idx_num[2];
-  
-  int8_t QP;             //!< \brief Quantization parameter
+  bitstream stream;
+  cabac_data cabac;
 } encoder_state;
 
 int encoder_control_init(encoder_control *encoder, const config *cfg);
@@ -162,8 +204,8 @@ int encoder_control_finalize(encoder_control *encoder);
 
 void encoder_control_input_init(encoder_control *encoder, int32_t width, int32_t height);
 
-int encoder_state_init(encoder_state *encoder_state, const encoder_control * encoder);
-int encoder_state_finalize(encoder_state *encoder_state);
+int encoder_state_init(encoder_state * child_state, encoder_state * parent_state);
+void encoder_state_finalize(encoder_state *encoder_state);
 void encoder_state_init_lambda(encoder_state *encoder_state);
 
 void encode_one_frame(encoder_state *encoder_state);
diff --git a/src/filter.c b/src/filter.c
index 20ac1c4f..bee1629d 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -167,7 +167,7 @@ void filter_deblock_edge_luma(encoder_state * const encoder_state,
                               int32_t xpos, int32_t ypos,
                               int8_t depth, int8_t dir)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   const encoder_control * const encoder = encoder_state->encoder_control;
   
   cu_info *cu_q = &cur_pic->cu_array[(xpos>>MIN_SIZE) + (ypos>>MIN_SIZE) * (cur_pic->width_in_lcu << MAX_DEPTH)];
@@ -194,7 +194,7 @@ void filter_deblock_edge_luma(encoder_state * const encoder_state,
     int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE;
     int8_t strength = 0;
 
-    int32_t qp              = encoder_state->QP;
+    int32_t qp              = encoder_state->global->QP;
     int32_t bitdepth_scale  = 1 << (encoder->bitdepth - 8);
     int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
     int32_t beta            = g_beta_table_8x8[b_index] * bitdepth_scale;
@@ -295,7 +295,7 @@ void filter_deblock_edge_chroma(encoder_state * const encoder_state,
                                 int8_t depth, int8_t dir)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   cu_info *cu_q = &cur_pic->cu_array[(x>>(MIN_SIZE-1)) + (y>>(MIN_SIZE-1)) * (cur_pic->width_in_lcu << MAX_DEPTH)];
 
   // Chroma edges that do not lay on a 8x8 grid are not deblocked.
@@ -327,7 +327,7 @@ void filter_deblock_edge_chroma(encoder_state * const encoder_state,
     int16_t x_cu = x>>(MIN_SIZE-1),y_cu = y>>(MIN_SIZE-1);
     int8_t strength = 2;
 
-    int32_t QP             = g_chroma_scale[encoder_state->QP];
+    int32_t QP             = g_chroma_scale[encoder_state->global->QP];
     int32_t bitdepth_scale = 1 << (encoder->bitdepth-8);
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = g_tc_table_8x8[TC_index]*bitdepth_scale;
@@ -389,7 +389,7 @@ void filter_deblock_edge_chroma(encoder_state * const encoder_state,
  */
 void filter_deblock_cu(encoder_state * const encoder_state, int32_t x, int32_t y, int8_t depth, int32_t edge)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   cu_info *cur_cu = &cur_pic->cu_array[x + y*(cur_pic->width_in_lcu << MAX_DEPTH)];
   uint8_t split_flag = (cur_cu->depth > depth) ? 1 : 0;
   uint8_t border_x = (cur_pic->width  < x*(LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth)) ? 1 : 0;
@@ -437,7 +437,7 @@ void filter_deblock_cu(encoder_state * const encoder_state, int32_t x, int32_t y
  */
 void filter_deblock(encoder_state * const encoder_state)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   int16_t x, y;
 
   // TODO: Optimization: add thread for each LCU
diff --git a/src/inter.c b/src/inter.c
index 8e2b0e9b..ef19219c 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -83,12 +83,12 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
   int32_t ref_width_c = ref->width>>1; //!< Reference picture width in chroma pixels
 
   // negative overflow flag
-  int8_t overflow_neg_x = (encoder_state->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) < 0)?1:0;
-  int8_t overflow_neg_y = (encoder_state->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) < 0)?1:0;
+  int8_t overflow_neg_x = (encoder_state->tile->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) < 0)?1:0;
+  int8_t overflow_neg_y = (encoder_state->tile->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) < 0)?1:0;
 
   // positive overflow flag
-  int8_t overflow_pos_x = (encoder_state->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) + width > ref->width )?1:0;
-  int8_t overflow_pos_y = (encoder_state->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) + width > ref->height)?1:0;
+  int8_t overflow_pos_x = (encoder_state->tile->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) + width > ref->width )?1:0;
+  int8_t overflow_pos_y = (encoder_state->tile->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) + width > ref->height)?1:0;
 
   // Chroma half-pel
   #define HALFPEL_CHROMA_WIDTH ((LCU_WIDTH>>1) + 8)
@@ -114,7 +114,7 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
     // Fill source blocks with data from reference, -4...width+4
     for (halfpel_y = 0, y = (ypos>>1) - 4; y < ((ypos + width)>>1) + 4; halfpel_y++, y++) {
       // calculate y-pixel offset
-      coord_y = (y + encoder_state->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1);
+      coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1);
 
       // On y-overflow set coord_y accordingly
       overflow_neg_y_temp = (coord_y < 0) ? 1 : 0;
@@ -124,7 +124,7 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
       coord_y *= ref_width_c;
 
       for (halfpel_x = 0, x = (xpos>>1) - 4; x < ((xpos + width)>>1) + 4; halfpel_x++, x++) {
-        coord_x = (x + encoder_state->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1);
+        coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1);
 
         // On x-overflow set coord_x accordingly
         overflow_neg_x_temp = (coord_x < 0) ? 1 : 0;
@@ -161,8 +161,8 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
         int x_in_lcu = (x & ((LCU_WIDTH)-1));
         int y_in_lcu = (y & ((LCU_WIDTH)-1));
 
-        coord_x = (x + encoder_state->lcu_offset_x * LCU_WIDTH) + mv[0];
-        coord_y = (y + encoder_state->lcu_offset_y * LCU_WIDTH) + mv[1];
+        coord_x = (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0];
+        coord_y = (y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1];
         overflow_neg_x = (coord_x < 0)?1:0;
         overflow_neg_y = (coord_y < 0)?1:0;
 
@@ -196,8 +196,8 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
           int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
           int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
 
-          coord_x = (x + encoder_state->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1);
-          coord_y = (y + encoder_state->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1);
+          coord_x = (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1);
+          coord_y = (y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1);
 
           overflow_neg_x = (coord_x < 0)?1:0;
           overflow_neg_y = (y + (mv[1]>>1) < 0)?1:0;
@@ -229,11 +229,11 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
     // Copy Luma
     for (y = ypos; y < ypos + width; y++) {
       int y_in_lcu = (y & ((LCU_WIDTH)-1));
-      coord_y = ((y + encoder_state->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate
+      coord_y = ((y + encoder_state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate
       for (x = xpos; x < xpos + width; x++) {
         int x_in_lcu = (x & ((LCU_WIDTH)-1));
 
-        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y + (x + encoder_state->lcu_offset_x * LCU_WIDTH) + mv[0]];
+        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]];
       }
     }
 
@@ -242,11 +242,11 @@ void inter_recon_lcu(const encoder_state * const encoder_state, const picture *
       // TODO: chroma fractional pixel interpolation
       for (y = ypos>>1; y < (ypos + width)>>1; y++) {
         int y_in_lcu = (y & ((LCU_WIDTH>>1)-1));
-        coord_y = ((y + encoder_state->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate
+        coord_y = ((y + encoder_state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate
         for (x = xpos>>1; x < (xpos + width)>>1; x++) {
           int x_in_lcu = (x & ((LCU_WIDTH>>1)-1));
-          lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y + (x + encoder_state->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
-          lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y + (x + encoder_state->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
+          lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
+          lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v_recdata[coord_y + (x + encoder_state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)];
         }
       }
     }
@@ -332,8 +332,8 @@ void inter_get_mv_cand(const encoder_state * const encoder_state, int32_t x, int
   inter_get_spatial_merge_candidates(x, y, depth, &b0, &b1, &b2, &a0, &a1, lcu);
 
  #define CALCULATE_SCALE(cu,tb,td) ((tb * ((0x4000 + (abs(td)>>1))/td) + 32) >> 6)
-#define APPLY_MV_SCALING(cu, cand) {int td = encoder_state->poc - encoder_state->ref->pics[(cu)->inter.mv_ref]->poc;\
-                                   int tb = encoder_state->poc - encoder_state->ref->pics[cur_cu->inter.mv_ref]->poc;\
+#define APPLY_MV_SCALING(cu, cand) {int td = encoder_state->global->poc - encoder_state->global->ref->pics[(cu)->inter.mv_ref]->poc;\
+                                   int tb = encoder_state->global->poc - encoder_state->global->ref->pics[cur_cu->inter.mv_ref]->poc;\
                                    if (td != tb) { \
                                       int scale = CALCULATE_SCALE(cu,tb,td); \
                                        mv_cand[cand][0] = ((scale * (cu)->inter.mv[0] + 127 + (scale * (cu)->inter.mv[0] < 0)) >> 8 ); \
diff --git a/src/intra.c b/src/intra.c
index c9b5364a..17fcc02c 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -375,7 +375,7 @@ int16_t intra_prediction(encoder_state * const encoder_state, pixel *orig, int32
     intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0);
 
     sad = cost_func(pred, orig_block);
-    sad += mode_cost * (int)(encoder_state->cur_lambda_cost + 0.5);
+    sad += mode_cost * (int)(encoder_state->global->cur_lambda_cost + 0.5);
     // When rdo == 2, store best costs to an array and do full RDO later
     if(rdo == 2) {
       int rdo_mode = intra_rdo_cost_compare(rdo_costs, rdo_modes_to_check, sad);
@@ -419,7 +419,7 @@ int16_t intra_prediction(encoder_state * const encoder_state, pixel *orig, int32
       // Bitcost also calculated again for this mode
       rdo_bitcost = intra_pred_ratecost(rdo_modes[rdo_mode],intra_preds);
       // Add bitcost * lambda
-      rdo_costs[rdo_mode] += rdo_bitcost * (int)(encoder_state->cur_lambda_cost + 0.5);
+      rdo_costs[rdo_mode] += rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5);
 
       if(rdo_costs[rdo_mode] < best_sad) {
         best_sad = rdo_costs[rdo_mode];
diff --git a/src/rdo.c b/src/rdo.c
index d9be91db..7c0fb81d 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -112,12 +112,12 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel
       for (i = 0; i < width*width; i++) {
         coeffcost += abs((int)temp_coeff[i]);
       }
-      cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->cur_lambda_cost+0.5);
+      cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5);
       // Full RDO
     } else if(encoder->rdo == 2) {
       coeffcost = get_coeff_cost(encoder_state, temp_coeff, width, 0, luma_scan_mode);
 
-      cost  += coeffcost*((int)encoder_state->cur_lambda_cost+0.5);
+      cost  += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5);
     }
     return cost;
 }
@@ -299,7 +299,7 @@ uint32_t get_coded_level ( encoder_state * const encoder_state, double *coded_co
   cabac_ctx* base_sig_model = type?(cabac->ctx_cu_sig_model_chroma):(cabac->ctx_cu_sig_model_luma);
 
   if( !last && max_abs_level < 3 ) {
-    *coded_cost_sig = encoder_state->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
+    *coded_cost_sig = encoder_state->global->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
     *coded_cost     = *coded_cost0 + *coded_cost_sig;
     if (max_abs_level == 0) return best_abs_level;
   } else {
@@ -307,13 +307,13 @@ uint32_t get_coded_level ( encoder_state * const encoder_state, double *coded_co
   }
 
   if( !last ) {
-    cur_cost_sig = encoder_state->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
+    cur_cost_sig = encoder_state->global->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
   }
 
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
     double err       = (double)(level_double - ( abs_level << q_bits ) );
-    double cur_cost  = err * err * temp + encoder_state->cur_lambda_cost *
+    double cur_cost  = err * err * temp + encoder_state->global->cur_lambda_cost *
                        get_ic_rate_cost( encoder_state, abs_level, ctx_num_one, ctx_num_abs,
                                          abs_go_rice, c1_idx, c2_idx, type);
     cur_cost        += cur_cost_sig;
@@ -350,7 +350,7 @@ static double get_rate_last(const encoder_state * const encoder_state,
   if( ctx_y > 3 ) {
     uiCost += 32768.0 * ((ctx_y-2)>>1);
   }
-  return encoder_state->cur_lambda_cost*uiCost;
+  return encoder_state->global->cur_lambda_cost*uiCost;
 }
 
 static void calc_last_bits(encoder_state * const encoder_state, int32_t width, int32_t height, int8_t type,
@@ -402,7 +402,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
   uint32_t max_num_coeff   = width * height;
   int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
 
-  int32_t qp_scaled = get_scaled_qp(type, encoder_state->QP, 0);
+  int32_t qp_scaled = get_scaled_qp(type, encoder_state->global->QP, 0);
 
   {
   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
@@ -591,7 +591,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
         if (sig_coeffgroup_flag[ cg_blkpos ] == 0) {
           uint32_t ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
                                                           cg_pos_y, width);
-          cost_coeffgroup_sig[ cg_scanpos ] = encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+          cost_coeffgroup_sig[ cg_scanpos ] = encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
           base_cost += cost_coeffgroup_sig[ cg_scanpos ]  - rd_stats.sig_cost;
         } else {
           if (cg_scanpos < cg_last_scanpos) {//skip the last coefficient group, which will be handled together with last position below.
@@ -608,9 +608,9 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
             ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
                                                             cg_pos_y, width);
             if (cg_scanpos < cg_last_scanpos) {
-              cost_coeffgroup_sig[cg_scanpos] = encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1);
+              cost_coeffgroup_sig[cg_scanpos] = encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1);
               base_cost    += cost_coeffgroup_sig[cg_scanpos];
-              cost_zero_cg += encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+              cost_zero_cg += encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
             }
 
             // try to convert the current coeff group from non-zero to all-zero
@@ -624,7 +624,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
               sig_coeffgroup_flag[ cg_blkpos ] = 0;
               base_cost = cost_zero_cg;
               if (cg_scanpos < cg_last_scanpos) {
-                cost_coeffgroup_sig[ cg_scanpos ] = encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+                cost_coeffgroup_sig[ cg_scanpos ] = encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
               }
               // reset coeffs to 0 in this block
               for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) {
@@ -652,13 +652,13 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
 
 
   if( block_type != CU_INTRA && !type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) {
-    best_cost  = block_uncoded_cost +   encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx_cu_qt_root_cbf_model),0);
-    base_cost +=   encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx_cu_qt_root_cbf_model),1);
+    best_cost  = block_uncoded_cost +   encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx_cu_qt_root_cbf_model),0);
+    base_cost +=   encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx_cu_qt_root_cbf_model),1);
   } else {
     cabac_ctx* base_cbf_model = type?(cabac->ctx_qt_cbf_model_chroma):(cabac->ctx_qt_cbf_model_luma);
     ctx_cbf   = ( type ? tr_depth : !tr_depth);
-    best_cost  = block_uncoded_cost +  encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
-    base_cost +=   encoder_state->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
+    best_cost  = block_uncoded_cost +  encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
+    base_cost +=   encoder_state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
   }
 
   for (cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
@@ -712,7 +712,7 @@ void  rdoq(encoder_state * const encoder_state, coefficient *coef, coefficient *
   if(*abs_sum >= 2) {
     int64_t rd_factor = (int64_t) (
                      g_inv_quant_scales[qp_scaled%6] * g_inv_quant_scales[qp_scaled%6] * (1<<(2*(qp_scaled/6)))
-                   /  encoder_state->cur_lambda_cost / 16 / (1<<(2*(encoder->bitdepth-8)))
+                   /  encoder_state->global->cur_lambda_cost / 16 / (1<<(2*(encoder->bitdepth-8)))
                    + 0.5);
     int32_t lastCG = -1;
     int32_t absSum = 0;
diff --git a/src/sao.c b/src/sao.c
index aff4cf6f..84c05796 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -669,7 +669,7 @@ static void sao_search_edge_sao(const encoder_state * const encoder_state,
 
     {
       int mode_bits = sao_mode_bits_edge(edge_class, edge_offset, sao_top, sao_left);
-      sum_ddistortion += (int)((double)mode_bits*(encoder_state->cur_lambda_cost+0.5));
+      sum_ddistortion += (int)((double)mode_bits*(encoder_state->global->cur_lambda_cost+0.5));
     }
     // SAO is not applied for category 0.
     edge_offset[SAO_EO_CAT0] = 0;
@@ -711,7 +711,7 @@ static void sao_search_band_sao(const encoder_state * const encoder_state, const
     ddistortion = calc_sao_band_offsets(sao_bands, temp_offsets, &sao_out->band_position);
 
     temp_rate = sao_mode_bits_band(sao_out->band_position, temp_offsets, sao_top, sao_left);
-    ddistortion += (int)((double)temp_rate*(encoder_state->cur_lambda_cost+0.5));
+    ddistortion += (int)((double)temp_rate*(encoder_state->global->cur_lambda_cost+0.5));
 
     // Select band sao over edge sao when distortion is lower
     if (ddistortion < sao_out->ddistortion) {
@@ -745,7 +745,7 @@ static void sao_search_best_mode(const encoder_state * const encoder_state, cons
 
   {
     int mode_bits = sao_mode_bits_edge(edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left);
-    int ddistortion = mode_bits * (int)(encoder_state->cur_lambda_cost + 0.5);
+    int ddistortion = mode_bits * (int)(encoder_state->global->cur_lambda_cost + 0.5);
     unsigned buf_i;
     
     for (buf_i = 0; buf_i < buf_cnt; ++buf_i) {
@@ -759,7 +759,7 @@ static void sao_search_best_mode(const encoder_state * const encoder_state, cons
 
   {
     int mode_bits = sao_mode_bits_band(band_sao.band_position, &band_sao.offsets[1], sao_top, sao_left);
-    int ddistortion = mode_bits * (int)(encoder_state->cur_lambda_cost + 0.5);
+    int ddistortion = mode_bits * (int)(encoder_state->global->cur_lambda_cost + 0.5);
     unsigned buf_i;
     
     for (buf_i = 0; buf_i < buf_cnt; ++buf_i) {
@@ -780,7 +780,7 @@ static void sao_search_best_mode(const encoder_state * const encoder_state, cons
   // Choose between SAO and doing nothing, taking into account the
   // rate-distortion cost of coding do nothing.
   {
-    int cost_of_nothing = sao_mode_bits_none(sao_top, sao_left) * (int)(encoder_state->cur_lambda_cost + 0.5);
+    int cost_of_nothing = sao_mode_bits_none(sao_top, sao_left) * (int)(encoder_state->global->cur_lambda_cost + 0.5);
     if (sao_out->ddistortion >= cost_of_nothing) {
       sao_out->type = SAO_TYPE_NONE;
     }
@@ -863,7 +863,7 @@ void sao_search_luma(const encoder_state * const encoder_state, const picture *p
 void sao_reconstruct_frame(encoder_state * const encoder_state)
 {
   vector2d lcu;
-  picture * const cur_pic = encoder_state->cur_pic;
+  picture * const cur_pic = encoder_state->tile->cur_pic;
 
   // These are needed because SAO needs the pre-SAO pixels form left and
   // top LCUs. Single pixel wide buffers, like what search_lcu takes, would
diff --git a/src/search.c b/src/search.c
index c3deae4e..99434380 100644
--- a/src/search.c
+++ b/src/search.c
@@ -159,7 +159,7 @@ static int calc_mvd_cost(const encoder_state * const encoder_state, int x, int y
     temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost;
   }
   *bitcost = temp_bitcost;
-  return temp_bitcost*(int32_t)(encoder_state->cur_lambda_cost+0.5);
+  return temp_bitcost*(int32_t)(encoder_state->global->cur_lambda_cost+0.5);
 }
 
 
@@ -201,8 +201,8 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
   for (i = 0; i < 7; ++i) {
     const vector2d *pattern = &large_hexbs[i];
     unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
-                             (encoder_state->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
-                             (encoder_state->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
+                             (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, 
+                             (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
                              block_width, block_width);
     cost += calc_mvd_cost(encoder_state, mv.x + pattern->x, mv.y + pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
@@ -216,8 +216,8 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
   // Try the 0,0 vector.
   if (!(mv.x == 0 && mv.y == 0)) {
     unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
-                             (encoder_state->lcu_offset_x * LCU_WIDTH) + orig->x, 
-                             (encoder_state->lcu_offset_y * LCU_WIDTH) + orig->y,
+                             (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, 
+                             (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
                              block_width, block_width);
     cost += calc_mvd_cost(encoder_state, 0, 0, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
@@ -232,8 +232,8 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
       for (i = 1; i < 7; ++i) {
         const vector2d *pattern = &large_hexbs[i];
         unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
-                                 (encoder_state->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
-                                 (encoder_state->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
+                                 (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + pattern->x,
+                                 (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + pattern->y,
                                  block_width, block_width);
         cost += calc_mvd_cost(encoder_state, pattern->x, pattern->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
@@ -267,8 +267,8 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
     for (i = 0; i < 3; ++i) {
       const vector2d *offset = &large_hexbs[start + i];
       unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
-                               (encoder_state->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
-                               (encoder_state->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
+                               (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
+                               (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
                                block_width, block_width);
       cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
@@ -290,8 +290,8 @@ static unsigned hexagon_search(const encoder_state * const encoder_state, unsign
   for (i = 1; i < 5; ++i) {
     const vector2d *offset = &small_hexbs[i];
     unsigned cost = calc_sad(pic, ref, orig->x, orig->y,
-                             (encoder_state->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
-                             (encoder_state->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
+                             (encoder_state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
+                             (encoder_state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
                              block_width, block_width);
     cost += calc_mvd_cost(encoder_state, mv.x + offset->x, mv.y + offset->y, mv_cand,merge_cand,num_cand,ref_idx, &bitcost);
 
@@ -374,7 +374,7 @@ static unsigned search_mv_full(unsigned depth,
  */
 static int search_cu_inter(const encoder_state * const encoder_state, int x, int y, int depth, lcu_t *lcu)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   uint32_t ref_idx = 0;
   int x_local = (x&0x3f), y_local = (y&0x3f);
   int x_cu = x>>3;
@@ -394,8 +394,8 @@ static int search_cu_inter(const encoder_state * const encoder_state, int x, int
 
   cur_cu->inter.cost = UINT_MAX;
 
-  for (ref_idx = 0; ref_idx < encoder_state->ref->used_size; ref_idx++) {
-    picture *ref_pic = encoder_state->ref->pics[ref_idx];
+  for (ref_idx = 0; ref_idx < encoder_state->global->ref->used_size; ref_idx++) {
+    picture *ref_pic = encoder_state->global->ref->pics[ref_idx];
     unsigned width_in_scu = NO_SCU_IN_LCU(ref_pic->width_in_lcu);
     cu_info *ref_cu = &ref_pic->cu_array[y_cu * width_in_scu + x_cu];
     uint32_t temp_bitcost = 0;
@@ -670,7 +670,7 @@ static int search_cu_intra(encoder_state * const encoder_state,
                            const int x_px, const int y_px,
                            const int depth, lcu_t *lcu)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f };
   const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
   const int8_t cu_width = (LCU_WIDTH >> (depth));
@@ -776,7 +776,7 @@ static int lcu_get_final_cost(const encoder_state * const encoder_state,
       }
     }
     // Coefficient costs
-    cost += (coeff_cost + (coeff_cost>>1)) * (int32_t)(encoder_state->cur_lambda_cost+0.5);
+    cost += (coeff_cost + (coeff_cost>>1)) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
 
   // Calculate actual bit costs for coding the coeffs
   // RDO
@@ -838,11 +838,11 @@ static int lcu_get_final_cost(const encoder_state * const encoder_state,
       coeff_cost += get_coeff_cost(encoder_state, coeff_temp_v, blockwidth, 2, chroma_scan_mode);
     }
     // Multiply bit count with lambda to get RD-cost
-    cost += coeff_cost * (int32_t)(encoder_state->cur_lambda_cost+0.5);
+    cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost+0.5);
   }
 
   // Bitcost
-  cost += (cur_cu->type == CU_INTER ? cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].bitcost)*(int32_t)(encoder_state->cur_lambda_cost+0.5);
+  cost += (cur_cu->type == CU_INTER ? cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].bitcost)*(int32_t)(encoder_state->global->cur_lambda_cost+0.5);
 
   return cost;
 }
@@ -859,7 +859,7 @@ static int lcu_get_final_cost(const encoder_state * const encoder_state,
  */
 static int search_cu(encoder_state * const encoder_state, int x, int y, int depth, lcu_t work_tree[MAX_PU_DEPTH])
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   int cu_width = LCU_WIDTH >> depth;
   int cost = MAX_INT;
   cu_info *cur_cu;
@@ -911,7 +911,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
       intra_recon_lcu(encoder_state, x, y, depth,&work_tree[depth], cur_pic->width, cur_pic->height);
     } else if (cur_cu->type == CU_INTER) {
       int cbf;
-      inter_recon_lcu(encoder_state, encoder_state->ref->pics[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);
+      inter_recon_lcu(encoder_state, encoder_state->global->ref->pics[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);
       encode_transform_tree(encoder_state, x, y, depth, &work_tree[depth]);
 
       cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
@@ -933,7 +933,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
   // Recursively split all the way to max search depth.
   if (depth < MAX_INTRA_SEARCH_DEPTH || depth < MAX_INTER_SEARCH_DEPTH) {
     int half_cu = cu_width / 2;
-    int split_cost = (int)(4.5 * encoder_state->cur_lambda_cost);
+    int split_cost = (int)(4.5 * encoder_state->global->cur_lambda_cost);
     int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth);
 
     // If skip mode was selected for the block, skip further search.
@@ -970,7 +970,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
  */
 static void init_lcu_t(const encoder_state * const encoder_state, const int x, const int y, lcu_t *lcu, const yuv_t *hor_buf, const yuv_t *ver_buf)
 {
-  const picture * const cur_pic = encoder_state->cur_pic;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
   
   // Copy reference cu_info structs from neighbouring LCUs.
   {
@@ -1050,7 +1050,7 @@ static void init_lcu_t(const encoder_state * const encoder_state, const int x, c
 
   // Copy LCU pixels.
   {
-    const picture * const pic = encoder_state->cur_pic;
+    const picture * const pic = encoder_state->tile->cur_pic;
     int pic_width = cur_pic->width;
     int x_max = MIN(x + LCU_WIDTH, pic_width) - x;
     int y_max = MIN(y + LCU_WIDTH, cur_pic->height) - y;
@@ -1080,7 +1080,7 @@ static void copy_lcu_to_cu_data(const encoder_state * const encoder_state, int x
   {
     const int x_cu = x_px >> MAX_DEPTH;
     const int y_cu = y_px >> MAX_DEPTH;
-    const picture * const cur_pic = encoder_state->cur_pic;
+    const picture * const cur_pic = encoder_state->tile->cur_pic;
     const int cu_array_width = cur_pic->width_in_lcu << MAX_DEPTH;
     cu_info *const cu_array = cur_pic->cu_array;
 
@@ -1100,7 +1100,7 @@ static void copy_lcu_to_cu_data(const encoder_state * const encoder_state, int x
 
   // Copy pixels to picture.
   {
-    picture * const pic = encoder_state->cur_pic;
+    picture * const pic = encoder_state->tile->cur_pic;
     const int pic_width = pic->width;
     const int x_max = MIN(x_px + LCU_WIDTH, pic_width) - x_px;
     const int y_max = MIN(y_px + LCU_WIDTH, pic->height) - y_px;
diff --git a/src/transform.c b/src/transform.c
index 06fc2a76..ea3d9f40 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -634,7 +634,7 @@ void quant(const encoder_state * const encoder_state, int16_t *coef, int16_t *q_
   int32_t delta_u[LCU_WIDTH*LCU_WIDTH>>2];
   #endif
 
-  int32_t qp_scaled = get_scaled_qp(type, encoder_state->QP, 0);
+  int32_t qp_scaled = get_scaled_qp(type, encoder_state->global->QP, 0);
 
   //New block for variable definitions
   {
@@ -646,7 +646,7 @@ void quant(const encoder_state * const encoder_state, int16_t *coef, int16_t *q_
 
   int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform
   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
-  int32_t add = ((encoder_state->cur_pic->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9);
+  int32_t add = ((encoder_state->tile->cur_pic->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9);
 
   int32_t q_bits8 = q_bits - 8;
   for (n = 0; n < width * height; n++) {
@@ -762,7 +762,7 @@ void dequant(const encoder_state * const encoder_state, int16_t *q_coef, int16_t
   int32_t n;
   int32_t transform_shift = 15 - encoder->bitdepth - (g_convert_to_bit[ width ] + 2);
 
-  int32_t qp_scaled = get_scaled_qp(type, encoder_state->QP, 0);
+  int32_t qp_scaled = get_scaled_qp(type, encoder_state->global->QP, 0);
 
   shift = 20 - QUANT_SHIFT - transform_shift;
 

From 699669ee353667adfbcf0a4ec8ce00be34ac14b6 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 6 May 2014 15:45:31 +0200
Subject: [PATCH 08/21] fixed typo

---
 src/encoder.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 14b82907..e76c52b5 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -442,7 +442,7 @@ static int encoder_state_config_slice_init(encoder_state * const encoder_state,
   encoder_state->slice->end_in_ts = end_address_in_ts - encoder_state->tile->lcu_offset_in_ts;
   
   encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
-  encoder_state->slice->end_in_ts = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
+  encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
   return 1;
 }
 
@@ -568,6 +568,8 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
     //Full span to analyze
     start_in_ts = child_state->tile->lcu_offset_in_ts + child_state->slice->start_in_ts;
     end_in_ts = MIN(child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu, child_state->tile->lcu_offset_in_ts + child_state->slice->end_in_ts);
+    
+    //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
     while (start_in_ts < end_in_ts) {
       encoder_state *new_child = NULL;
       int range_start = start_in_ts;
@@ -590,7 +592,7 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
         }
       }
       
-      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed);
+      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d end_in_ts=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed,end_in_ts);
       
       if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
         //Create a slice

From cee6bb0e71b4c218d0d906559f3e24492296504a Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 06:24:22 +0200
Subject: [PATCH 09/21] Fix iteration on children

---
 src/encoder.c | 26 +++++---------------------
 src/encoder.h |  2 +-
 2 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index e76c52b5..20d093f7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -660,32 +660,15 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
       printf("Wavefront\n");
     }
   }
-  
-/*  if (encoder->tiles_enable) {
-    int x,y;
-    //Allocate subencoders (valid subencoder have a non null encoder_control field, so we use a null one to mark the end of the list)
-    encoder_state->children = MALLOC(struct encoder_state, encoder->tiles_num_tile_columns * encoder->tiles_num_tile_rows + 1);
-    encoder_state->children[encoder->tiles_num_tile_columns * encoder->tiles_num_tile_rows].encoder_control = NULL;
-    for (y=0; y < encoder->tiles_num_tile_rows; ++y) {
-      for (x=0; x < encoder->tiles_num_tile_columns; ++x) {
-        const int i = y * encoder->tiles_num_tile_columns + x;
-        encoder_state->children[i].encoder_control = encoder;
-        
-        if (!encoder_state_init_one(&encoder_state->children[i], encoder_state, x, y)) {
-          fprintf(stderr, "Could not initialize encoder state %d!\n", i);
-          return 0;
-        }
-      }
-  */
   return 1;
 }
 
 void encoder_state_finalize(encoder_state * const encoder_state) {
   if (encoder_state->children) {
     int i=0;
-    do {
+    for (i = 0; encoder_state->children[i].encoder_control; ++i) {
       encoder_state_finalize(&encoder_state->children[i]);
-    } while (encoder_state->children[++i].encoder_control);
+    }
     
     FREE_POINTER(encoder_state->children);
   }
@@ -1037,6 +1020,7 @@ void encode_one_frame(encoder_state * const main_state)
   
   if (main_state->children) {
     int i;
+    //FIXME!
     //This can be parallelized, we don't use a do...while loop because we use OpenMP
     #pragma omp parallel for
     for (i = 0; i < encoder->tiles_num_tile_rows * encoder->tiles_num_tile_columns; ++i) {
@@ -1062,11 +1046,11 @@ void encode_one_frame(encoder_state * const main_state)
     
     //This has to be serial
     i = 0;
-    do {
+    for (i = 0; main_state->children[i].encoder_control; ++i) {
       //Append bitstream to main stream
       bitstream_append(&main_state->stream, &main_state->children[i].stream);
       bitstream_clear(&main_state->children[i].stream);
-    } while (main_state->children[++i].encoder_control);
+    }
     
   } else {
     //Encode the whole thing as one stream
diff --git a/src/encoder.h b/src/encoder.h
index bb3f2279..a5b1a7ac 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -186,7 +186,7 @@ typedef struct encoder_state {
   encoder_state_type type;
 
   //List of children, the last item of this list is a pseudo-encoder with encoder_control = NULL
-  //Use do { } while (encoder_state->children[++i].encoder_control)
+  //Use for (i = 0; encoder_state->children[i].encoder_control; ++i) {
   struct encoder_state *children;
   struct encoder_state *parent;
   

From 8b5cb62237f265434c2feeeb35134281aeaa3ed0 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 07:06:19 +0200
Subject: [PATCH 10/21] Debug code to generate a graph

---
 src/encoder.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++---
 src/encoder.h |   4 +-
 2 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 20d093f7..ba4d1f30 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -427,6 +427,8 @@ static int encoder_state_config_tile_init(encoder_state * const encoder_state,
   encoder_state->tile->lcu_offset_y = lcu_offset_y;
   
   encoder_state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
+  
+  encoder_state->tile->id = encoder->tiles_tile_id[encoder_state->tile->lcu_offset_in_ts];
   return 1;
 }
 
@@ -437,9 +439,8 @@ static void encoder_state_config_tile_finalize(encoder_state * const encoder_sta
 
 static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
                                           const int start_address_in_ts, const int end_address_in_ts) {
-  //Has to be called AFTER initializing encoder_state->tile
-  encoder_state->slice->start_in_ts = start_address_in_ts - encoder_state->tile->lcu_offset_in_ts;
-  encoder_state->slice->end_in_ts = end_address_in_ts - encoder_state->tile->lcu_offset_in_ts;
+  encoder_state->slice->start_in_ts = start_address_in_ts;
+  encoder_state->slice->end_in_ts = end_address_in_ts;
   
   encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
   encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
@@ -461,6 +462,108 @@ static void encoder_state_config_wfrow_finalize(encoder_state * const encoder_st
   //Nothing to do (yet?)
 }
 
+#ifdef _DEBUG
+//Debug only: print this encoder_state and, recursively, its children as Graphviz DOT on stdout (the root also prints the digraph wrapper and the LCU maps)
+static void encoder_state_dump_graphviz(const encoder_state * const encoder_state) {
+  int i;
+  if (!encoder_state->parent) { //Root state: emit the graph header and the map tables once
+    const encoder_control * const encoder = encoder_state->encoder_control;
+    int y,x;
+    //Empty lines (easier to copy-paste)
+    printf("\n\n\n\n\n");
+    //Some styling...
+    printf("digraph EncoderStates {\n");
+    printf(" fontname = \"Bitstream Vera Sans\"\n");
+    printf(" fontsize = 8\n\n");
+    printf(" node [\n");
+    printf("  fontname = \"Bitstream Vera Sans\"\n");
+    printf("  fontsize = 8\n");
+    printf("  shape = \"record\"\n");
+    printf(" ]\n\n");
+    printf(" edge [\n");
+    printf("  arrowtail = \"empty\"\n");
+    printf(" ]\n\n");
+    
+    printf(" \"Map\" [\n");
+    printf("  shape=plaintext\n"); //Was 'shape=plaintext" [', which made the DOT output malformed
+    printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        
+        printf("<td>%d</td>", lcu_id_rs);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>TS Map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        
+        printf("<td>%d</td>", lcu_id_ts);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Tile map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        
+        printf("<td>%d</td>", encoder->tiles_tile_id[lcu_id_ts]);
+      }
+      printf("</tr>");
+    }
+    printf("</table>>\n ]\n");
+  }
+  
+  printf(" \"%p\" [\n", encoder_state);
+  printf("  label = \"{encoder_state|");
+  printf("+ type=%c\\l", encoder_state->type);
+  if (!encoder_state->parent || encoder_state->global != encoder_state->parent->global) { //A section is printed only when its pointer differs from the parent's
+    printf("|+ global\\l");
+  }
+  if (!encoder_state->parent || encoder_state->tile != encoder_state->parent->tile) {
+    printf("|+ tile\\l");
+    printf(" - id = %d\\l", encoder_state->tile->id);
+    printf(" - lcu_offset_x = %d\\l", encoder_state->tile->lcu_offset_x);
+    printf(" - lcu_offset_y = %d\\l", encoder_state->tile->lcu_offset_y);
+    printf(" - lcu_offset_in_ts = %d\\l", encoder_state->tile->lcu_offset_in_ts);
+  }
+  if (!encoder_state->parent || encoder_state->slice != encoder_state->parent->slice) {
+    printf("|+ slice\\l");
+    printf(" - start_in_ts = %d\\l", encoder_state->slice->start_in_ts);
+    printf(" - end_in_ts = %d\\l", encoder_state->slice->end_in_ts);
+    printf(" - start_in_rs = %d\\l", encoder_state->slice->start_in_rs);
+    printf(" - end_in_rs = %d\\l", encoder_state->slice->end_in_rs);
+  }
+  if (!encoder_state->parent || encoder_state->wfrow != encoder_state->parent->wfrow) {
+    printf("|+ wfrow\\l");
+    printf(" - lcu_offset_y = %d\\l", encoder_state->wfrow->lcu_offset_y);
+  }
+  printf("}\"\n");
+  printf(" ]\n");
+  
+  if (encoder_state->parent) {
+    printf(" \"%p\" -> \"%p\"\n", encoder_state->parent, encoder_state);
+  }
+  
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) { //The list ends at the first child whose encoder_control is NULL
+    encoder_state_dump_graphviz(&encoder_state->children[i]);
+  }
+  
+  if (!encoder_state->parent) {
+    printf("}\n");
+    //Empty lines (easier to copy-paste)
+    printf("\n\n\n\n\n");
+  }
+}
+#endif //_DEBUG
 
 int encoder_state_init(encoder_state * const child_state, encoder_state * const parent_state) {
   //We require that, if parent_state is NULL:
@@ -473,6 +576,8 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
   //child_state->slice
   //child_state->wfrow
   
+  printf("Init: %p %p\n", child_state, parent_state);
+  
   child_state->parent = parent_state;
   child_state->children = MALLOC(encoder_state, 1);
   child_state->children[0].encoder_control = NULL;
@@ -547,16 +652,22 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
       case ENCODER_STATE_TYPE_MAIN:
         children_allow_slice = 1;
         children_allow_tile = 1;
+        start_in_ts = 0;
+        end_in_ts = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
         break;
       case ENCODER_STATE_TYPE_SLICE:
         assert(child_state->parent);
         if (child_state->parent->type != ENCODER_STATE_TYPE_TILE) children_allow_tile = 1;
         children_allow_wavefront_row = encoder->wpp;
+        start_in_ts = child_state->slice->start_in_ts;
+        end_in_ts = child_state->slice->end_in_ts;
         break;
       case ENCODER_STATE_TYPE_TILE:
         assert(child_state->parent);
         if (child_state->parent->type != ENCODER_STATE_TYPE_SLICE) children_allow_slice = 1;
         children_allow_wavefront_row = encoder->wpp;
+        start_in_ts = child_state->tile->lcu_offset_in_ts;
+        end_in_ts = child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
         break;
       case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
         break;
@@ -570,7 +681,7 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
     end_in_ts = MIN(child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu, child_state->tile->lcu_offset_in_ts + child_state->slice->end_in_ts);
     
     //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
-    while (start_in_ts < end_in_ts) {
+    while (start_in_ts < end_in_ts && (children_allow_slice || children_allow_tile)) {
       encoder_state *new_child = NULL;
       int range_start = start_in_ts;
       int range_end_slice = start_in_ts; //Will be incremented to get the range of the "thing"
@@ -596,8 +707,6 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
       
       if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
         //Create a slice
-        
-        printf("%p slice: %d - %d\n", child_state, range_start, range_end_slice);
         new_child = &child_state->children[child_count];
         new_child->encoder_control = encoder;
         new_child->type = ENCODER_STATE_TYPE_SLICE;
@@ -624,7 +733,6 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
         int width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - lcu_offset_x * LCU_WIDTH);
         int height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - lcu_offset_y * LCU_WIDTH);
         
-        printf("%p tile: %d - %d (%d)\n", child_state, range_start, range_end_tile, tile_id);
         new_child = &child_state->children[child_count];
         new_child->encoder_control = encoder;
         new_child->type = ENCODER_STATE_TYPE_TILE;
@@ -646,6 +754,17 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
           fprintf(stderr, "Failed to allocate memory for children...\n");
           return 0;
         }
+        
+        //Fix children parent (since we changed the address)
+        {
+          int i, j;
+          for (i = 0; child_state->children[i].encoder_control; ++i) {
+            for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
+              child_state->children[i].children[j].parent = &child_state->children[i];
+            }
+          }
+        }
+          
         if (!encoder_state_init(&child_state->children[child_count], child_state)) {
           fprintf(stderr, "Unable to init child...\n");
           return 0;
@@ -660,6 +779,9 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
       printf("Wavefront\n");
     }
   }
+#ifdef _DEBUG
+  if (!parent_state) encoder_state_dump_graphviz(child_state);
+#endif //_DEBUG
   return 1;
 }
 
diff --git a/src/encoder.h b/src/encoder.h
index a5b1a7ac..13e37cea 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -158,6 +158,8 @@ typedef struct {
   //Current picture to encode
   picture *cur_pic;
   
+  int32_t id;
+  
   //Tile: offset in LCU for current encoder_state in global coordinates
   int32_t lcu_offset_x;
   int32_t lcu_offset_y;
@@ -167,7 +169,7 @@ typedef struct {
 } encoder_state_config_tile;
 
 typedef struct {
-  //Local coordinates, relative to *tile
+  //Global coordinates
   int32_t start_in_ts;
   int32_t end_in_ts;
   

From 831b221cf84e153078d86ff9f2e7339a3e2da928 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 09:44:02 +0200
Subject: [PATCH 11/21] Parsing seems to work now

---
 src/encoder.c | 134 ++++++++++++++++++++++++++++++++++++++++++++------
 src/encoder.h |   4 +-
 2 files changed, 122 insertions(+), 16 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index ba4d1f30..01a56ba7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -439,6 +439,15 @@ static void encoder_state_config_tile_finalize(encoder_state * const encoder_sta
 
 static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
                                           const int start_address_in_ts, const int end_address_in_ts) {
+  int i = 0, slice_found=0;
+  for (i = 0; i < encoder_state->encoder_control->slice_count; ++i) {
+    if (encoder_state->encoder_control->slice_addresses_in_ts[i] == start_address_in_ts) {
+      encoder_state->slice->id = i;
+      slice_found = 1;
+      break;
+    }
+  }
+  assert(slice_found);
   encoder_state->slice->start_in_ts = start_address_in_ts;
   encoder_state->slice->end_in_ts = end_address_in_ts;
   
@@ -519,6 +528,25 @@ static void encoder_state_dump_graphviz(const encoder_state * const encoder_stat
       }
       printf("</tr>");
     }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Slice map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        int slice_id = 0;
+        
+        //Not efficient, but who cares
+        for (i=0; i < encoder->slice_count; ++i) {
+          if (encoder->slice_addresses_in_ts[i] <= lcu_id_ts) {
+            slice_id = i;
+          }
+        }
+        
+        printf("<td>%d</td>", slice_id);
+      }
+      printf("</tr>");
+    }
     printf("</table>>\n ]\n");
   }
   
@@ -537,6 +565,7 @@ static void encoder_state_dump_graphviz(const encoder_state * const encoder_stat
   }
   if (!encoder_state->parent || encoder_state->slice != encoder_state->parent->slice) {
     printf("|+ slice\\l");
+    printf(" - id = %d\\l", encoder_state->slice->id);
     printf(" - start_in_ts = %d\\l", encoder_state->slice->start_in_ts);
     printf(" - end_in_ts = %d\\l", encoder_state->slice->end_in_ts);
     printf(" - start_in_rs = %d\\l", encoder_state->slice->start_in_rs);
@@ -645,6 +674,7 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
     int children_allow_wavefront_row = 0;
     int children_allow_slice = 0;
     int children_allow_tile = 0;
+    int range_start;
     
     int start_in_ts, end_in_ts;
     
@@ -676,16 +706,12 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
         assert(0);
     }
     
-    //Full span to analyze
-    start_in_ts = child_state->tile->lcu_offset_in_ts + child_state->slice->start_in_ts;
-    end_in_ts = MIN(child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu, child_state->tile->lcu_offset_in_ts + child_state->slice->end_in_ts);
-    
+    range_start = start_in_ts;
     //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
-    while (start_in_ts < end_in_ts && (children_allow_slice || children_allow_tile)) {
+    while (range_start < end_in_ts && (children_allow_slice || children_allow_tile)) {
       encoder_state *new_child = NULL;
-      int range_start = start_in_ts;
-      int range_end_slice = start_in_ts; //Will be incremented to get the range of the "thing"
-      int range_end_tile = start_in_ts; //Will be incremented to get the range of the "thing"
+      int range_end_slice = range_start; //Will be incremented to get the range of the "thing"
+      int range_end_tile = range_start; //Will be incremented to get the range of the "thing"
       
       int tile_allowed = lcu_at_tile_start(encoder, range_start) && children_allow_tile;
       int slice_allowed = lcu_at_slice_start(encoder, range_start) && children_allow_slice;
@@ -754,17 +780,17 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
           fprintf(stderr, "Failed to allocate memory for children...\n");
           return 0;
         }
-        
-        //Fix children parent (since we changed the address)
+
+        //Fix children parent (since we changed the address), except for the last one which is not ready yet
         {
           int i, j;
-          for (i = 0; child_state->children[i].encoder_control; ++i) {
+          for (i = 0; child_state->children[i].encoder_control && i < child_count; ++i) {
             for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
               child_state->children[i].children[j].parent = &child_state->children[i];
             }
           }
         }
-          
+        
         if (!encoder_state_init(&child_state->children[child_count], child_state)) {
           fprintf(stderr, "Unable to init child...\n");
           return 0;
@@ -772,13 +798,91 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
         child_count += 1;
       }
       
-      start_in_ts = MAX(range_end_slice, range_end_tile) + 1;
+      range_start = MAX(range_end_slice, range_end_tile) + 1;
     }
     
-    if (children_allow_wavefront_row) {
-      printf("Wavefront\n");
+    //We create wavefronts only if we have no children
+    if (children_allow_wavefront_row && child_count == 0) {
+      int first_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
+      int last_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
+      int num_rows;
+      int i;
+      
+      assert(!(children_allow_slice || children_allow_tile));
+      assert(child_count == 0);
+      
+      for (i=start_in_ts; i<end_in_ts; ++i) {
+        const int row = encoder->tiles_ctb_addr_ts_to_rs[i] / encoder->in.width_in_lcu;
+        if (row < first_row) first_row = row;
+        if (row > last_row) last_row = row;
+      }
+      
+      num_rows = last_row - first_row + 1;
+      
+      //When entropy_coding_sync_enabled_flag is equal to 1 and the first coding tree block in a slice is not the first coding
+      //tree block of a row of coding tree blocks in a tile, it is a requirement of bitstream conformance that the last coding tree
+      //block in the slice shall belong to the same row of coding tree blocks as the first coding tree block in the slice.
+      
+      if (encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] % encoder->in.width_in_lcu != child_state->tile->lcu_offset_x) {
+        if (num_rows > 1) {
+          fprintf(stderr, "Invalid: first CTB in slice %d is not at the tile %d edge, and the slice spans on more than one row.\n", child_state->slice->id, child_state->tile->id);
+          return 0;
+        }
+      }
+      
+      //FIXME Do the same kind of check if we implement slice segments
+    
+      
+      child_state->children = realloc(child_state->children, sizeof(encoder_state) * (num_rows + 1));
+      child_state->children[num_rows].encoder_control = NULL;
+      
+      for (i=0; i < num_rows; ++i) {
+        encoder_state *new_child = &child_state->children[i];
+        
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
+        new_child->global = child_state->global;
+        new_child->tile = child_state->tile;
+        new_child->slice = child_state->slice;
+        new_child->wfrow = MALLOC(encoder_state_config_wfrow, 1);
+        
+        if (!new_child->wfrow || !encoder_state_config_wfrow_init(new_child, i + first_row)) {
+          fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
+          return 0;
+        }
+        
+        if (!encoder_state_init(new_child, child_state)) {
+          fprintf(stderr, "Unable to init child...\n");
+          return 0;
+        }
+      }
     }
   }
+  
+  //Validate the structure
+  if (child_state->type == ENCODER_STATE_TYPE_TILE) {
+    if (child_state->tile->lcu_offset_in_ts < child_state->slice->start_in_ts) {
+      fprintf(stderr, "Tile %d starts before slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
+      return 0;
+    }
+    if (child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1 > child_state->slice->end_in_ts) {
+      fprintf(stderr, "Tile %d ends after slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
+      return 0;
+    }
+  }
+  
+  if (child_state->type == ENCODER_STATE_TYPE_SLICE) {
+    if (child_state->slice->start_in_ts < child_state->tile->lcu_offset_in_ts) {
+      fprintf(stderr, "Slice %d starts before tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
+      return 0;
+    }
+    if (child_state->slice->end_in_ts > child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1) {
+      fprintf(stderr, "Slice %d ends after tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
+      return 0;
+    }
+  }
+  
+  
 #ifdef _DEBUG
   if (!parent_state) encoder_state_dump_graphviz(child_state);
 #endif //_DEBUG
diff --git a/src/encoder.h b/src/encoder.h
index 13e37cea..7c80c5f1 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -169,6 +169,8 @@ typedef struct {
 } encoder_state_config_tile;
 
 typedef struct {
+  int32_t id;
+  
   //Global coordinates
   int32_t start_in_ts;
   int32_t end_in_ts;
@@ -179,7 +181,7 @@ typedef struct {
 } encoder_state_config_slice;
 
 typedef struct {
-  //Row of the wavefront, relative to *tile
+  //Row in image coordinates of the wavefront
   int32_t lcu_offset_y;
 } encoder_state_config_wfrow;
 

From 1e2671ac30a2d790eda89a1de23a726eb7bd3a4a Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 09:53:55 +0200
Subject: [PATCH 12/21] Renamed encoder_clear_refs to encoder_state_clear_refs

---
 src/encoder.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 01a56ba7..2ac04a46 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -923,7 +923,7 @@ void encoder_state_finalize(encoder_state * const encoder_state) {
 }
 
 
-static void encoder_clear_refs(encoder_state *encoder_state) {
+static void encoder_state_clear_refs(encoder_state *encoder_state) {
   while (encoder_state->global->ref->used_size) {
     picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
   }
@@ -1191,7 +1191,7 @@ void encode_one_frame(encoder_state * const main_state)
    **/
   if (is_radl_frame) {
     // Clear the reference list
-    encoder_clear_refs(main_state);
+    encoder_state_clear_refs(main_state);
 
     main_state->tile->cur_pic->slicetype = SLICE_I;
     main_state->tile->cur_pic->type = NAL_IDR_W_RADL;

From a03f0cba19c5a02fa76e52de2ed138d39b0d2642 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 09:56:16 +0200
Subject: [PATCH 13/21] Move encoder_control_input_init near the other
 encoder_control_* functions

---
 src/encoder.c | 84 ++++++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 2ac04a46..099bc719 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -387,6 +387,49 @@ int encoder_control_finalize(encoder_control * const encoder) {
   return 1;
 }
 
+void encoder_control_input_init(encoder_control * const encoder,
+                        const int32_t width, const int32_t height)
+{
+  encoder->in.width = width;
+  encoder->in.height = height;
+  encoder->in.real_width = width;
+  encoder->in.real_height = height;
+
+  // If input dimensions are not divisible by the smallest block size, add
+  // pixels to the dimensions, so that they are. These extra pixels will be
+  // compressed along with the real ones but they will be cropped out before
+  // rendering.
+  if (encoder->in.width % CU_MIN_SIZE_PIXELS) {
+    encoder->in.width += CU_MIN_SIZE_PIXELS - (width % CU_MIN_SIZE_PIXELS);
+  }
+
+  if (encoder->in.height % CU_MIN_SIZE_PIXELS) {
+    encoder->in.height += CU_MIN_SIZE_PIXELS - (height % CU_MIN_SIZE_PIXELS);
+  }
+
+  encoder->in.height_in_lcu = encoder->in.height / LCU_WIDTH;
+  encoder->in.width_in_lcu  = encoder->in.width / LCU_WIDTH;
+
+  // Add one extra LCU when image not divisible by LCU_WIDTH
+  if (encoder->in.height_in_lcu * LCU_WIDTH < height) {
+    encoder->in.height_in_lcu++;
+  }
+
+  if (encoder->in.width_in_lcu * LCU_WIDTH < width) {
+    encoder->in.width_in_lcu++;
+  }
+
+
+
+  #ifdef _DEBUG
+  if (width != encoder->in.width || height != encoder->in.height) {
+    printf("Picture buffer has been extended to be a multiple of the smallest block size:\r\n");
+    printf("  Width = %d (%d), Height = %d (%d)\r\n", width, encoder->in.width, height,
+           encoder->in.height);
+  }
+  #endif
+}
+
 static int encoder_state_config_global_init(encoder_state * const encoder_state) {
   encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
   if(!encoder_state->global->ref) {
@@ -931,48 +974,7 @@ static void encoder_state_clear_refs(encoder_state *encoder_state) {
   encoder_state->global->poc = 0;
 }
 
-void encoder_control_input_init(encoder_control * const encoder,
-                        const int32_t width, const int32_t height)
-{
-  encoder->in.width = width;
-  encoder->in.height = height;
-  encoder->in.real_width = width;
-  encoder->in.real_height = height;
 
-  // If input dimensions are not divisible by the smallest block size, add
-  // pixels to the dimensions, so that they are. These extra pixels will be
-  // compressed along with the real ones but they will be cropped out before
-  // rendering.
-  if (encoder->in.width % CU_MIN_SIZE_PIXELS) {
-    encoder->in.width += CU_MIN_SIZE_PIXELS - (width % CU_MIN_SIZE_PIXELS);
-  }
-
-  if (encoder->in.height % CU_MIN_SIZE_PIXELS) {
-    encoder->in.height += CU_MIN_SIZE_PIXELS - (height % CU_MIN_SIZE_PIXELS);
-  }
-
-  encoder->in.height_in_lcu = encoder->in.height / LCU_WIDTH;
-  encoder->in.width_in_lcu  = encoder->in.width / LCU_WIDTH;
-
-  // Add one extra LCU when image not divisible by LCU_WIDTH
-  if (encoder->in.height_in_lcu * LCU_WIDTH < height) {
-    encoder->in.height_in_lcu++;
-  }
-
-  if (encoder->in.width_in_lcu * LCU_WIDTH < width) {
-    encoder->in.width_in_lcu++;
-  }
-
-
-
-  #ifdef _DEBUG
-  if (width != encoder->in.width || height != encoder->in.height) {
-    printf("Picture buffer has been extended to be a multiple of the smallest block size:\r\n");
-    printf("  Width = %d (%d), Height = %d (%d)\r\n", width, encoder->in.width, height,
-           encoder->in.height);
-  }
-  #endif
-}
 
 static void write_aud(encoder_state * const encoder_state)
 {

From 24c2bd70ca40497e2b92b67ee7705ffdeec19ea2 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 11:37:53 +0200
Subject: [PATCH 14/21] Fix small compilation issues and remove debug output

---
 src/encoder.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 099bc719..5c262a60 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -648,8 +648,6 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
   //child_state->slice
   //child_state->wfrow
   
-  printf("Init: %p %p\n", child_state, parent_state);
-  
   child_state->parent = parent_state;
   child_state->children = MALLOC(encoder_state, 1);
   child_state->children[0].encoder_control = NULL;
@@ -743,10 +741,14 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
         end_in_ts = child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
         break;
       case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
+        //GCC tries to be too clever...
+        start_in_ts = -1;
+        end_in_ts = -1;
         break;
       default:
         fprintf(stderr, "Invalid encoder_state->type %d!\n", child_state->type);
         assert(0);
+        return 0;
     }
     
     range_start = start_in_ts;

From e144f817ef65b33eff4205fcb27983a3b328b449 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 10:48:30 +0200
Subject: [PATCH 15/21] Works when not using tiles

---
 src/encoder.c | 320 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 202 insertions(+), 118 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 5c262a60..57da3a3c 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -969,6 +969,7 @@ void encoder_state_finalize(encoder_state * const encoder_state) {
 
 
 static void encoder_state_clear_refs(encoder_state *encoder_state) {
+  //FIXME: Do we need to handle children? At present they all share the same global
   while (encoder_state->global->ref->used_size) {
     picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
   }
@@ -976,6 +977,55 @@ static void encoder_state_clear_refs(encoder_state *encoder_state) {
   encoder_state->global->poc = 0;
 }
 
+static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
+  const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
+  const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
+  
+  const int target_offset_x = target_enc->tile->lcu_offset_x * LCU_WIDTH;
+  const int target_offset_y = target_enc->tile->lcu_offset_y * LCU_WIDTH;
+  
+  int source_stride = source_enc->tile->cur_pic->width;
+  int target_stride = target_enc->tile->cur_pic->width;
+  
+  int width;
+  int height;
+  
+  int source_offset;
+  int target_offset;
+  
+  //Do nothing if the source and the destination is the same!
+  if (source_enc->tile == target_enc->tile) return;
+
+  if (is_y_channel) {
+    target_offset = source_offset_x + source_offset_y * target_enc->tile->cur_pic->width;
+    source_offset = target_offset_x + target_offset_y * source_enc->tile->cur_pic->width;
+  } else {
+    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->tile->cur_pic->width/2;
+    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->tile->cur_pic->width/2;
+  }
+  
+  if (target_enc->children) {
+    //Use information from the source
+    width = MIN(source_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->width - source_offset_x);
+    height = MIN(source_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->height - source_offset_y);
+  } else {
+    //Use information from the target
+    width = MIN(target_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->width - target_offset_x);
+    height = MIN(target_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->height - target_offset_y);
+  }
+  
+  if (!is_y_channel) {
+    width /= 2;
+    height /= 2;
+    
+    source_stride /= 2;
+    target_stride /= 2;
+  }
+  
+  //picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_enc->cur_pic->width, target_enc->cur_pic->width);
+  picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_stride, target_stride);
+}
+
 
 
 static void write_aud(encoder_state * const encoder_state)
@@ -986,36 +1036,8 @@ static void write_aud(encoder_state * const encoder_state)
   bitstream_align(stream);
 }
 
-static void substream_write_bitstream(encoder_state * const encoder_state, const int end_of_sub_stream) {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  const picture* const cur_pic = encoder_state->tile->cur_pic;
-  const int lcu_count = cur_pic->width_in_lcu * cur_pic->height_in_lcu;
-  int lcu_id;
-  vector2d lcu;
-  
-  for (lcu_id = 0; lcu_id < lcu_count; ++lcu_id) {
-    lcu.x = lcu_id % cur_pic->width_in_lcu;
-    lcu.y = lcu_id / cur_pic->width_in_lcu;
-    
-    //Write bitstream
-    if (encoder->sao_enable) {
-      encode_sao(encoder_state, lcu.x, lcu.y, &cur_pic->sao_luma[lcu.y * cur_pic->width_in_lcu + lcu.x], &cur_pic->sao_chroma[lcu.y * cur_pic->width_in_lcu + lcu.x]);
-    }
-    
-    encode_coding_tree(encoder_state, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0);
 
-    cabac_encode_bin_trm(&encoder_state->cabac, ((lcu_id == lcu_count - 1) && !end_of_sub_stream) ? 1 : 0);  // end_of_slice_segment_flag
-  }
-  if (end_of_sub_stream) {
-    cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
-    cabac_flush(&encoder_state->cabac);
-  } else {
-    cabac_flush(&encoder_state->cabac);
-    bitstream_align(&encoder_state->stream);
-  }
-}
-
-static void substream_encode(encoder_state * const encoder_state) {
+static void encoder_state_encode_tile(encoder_state * const encoder_state) {
   const encoder_control * const encoder = encoder_state->encoder_control;
 #ifndef NDEBUG
   const unsigned long long int debug_bitstream_position = bitstream_tell(&(encoder_state->stream));
@@ -1129,57 +1151,79 @@ static void substream_encode(encoder_state * const encoder_state) {
   yuv_t_free(ver_buf);
 }
 
-static void subencoder_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
-  const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
-  const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
-  
-  const int target_offset_x = target_enc->tile->lcu_offset_x * LCU_WIDTH;
-  const int target_offset_y = target_enc->tile->lcu_offset_y * LCU_WIDTH;
-  
-  int source_stride = source_enc->tile->cur_pic->width;
-  int target_stride = target_enc->tile->cur_pic->width;
-  
-  int width;
-  int height;
-  
-  int source_offset;
-  int target_offset;
-  
-  //One of them has to be the main encoder
-  assert(target_enc->children || source_enc->children);
-
-  if (is_y_channel) {
-    target_offset = source_offset_x + source_offset_y * target_enc->tile->cur_pic->width;
-    source_offset = target_offset_x + target_offset_y * source_enc->tile->cur_pic->width;
+static void encoder_state_encode(encoder_state * const main_state) {
+  //If we have children, encode at child level
+  if (main_state->children[0].encoder_control) {
+    int i=0;
+    for (i=0; main_state->children[i].encoder_control; ++i) {
+      encoder_state *sub_state = &(main_state->children[i]);
+      
+      if (sub_state->tile != main_state->tile) {
+        //FIXME: remove this once these are in slice
+        sub_state->tile->cur_pic->slicetype = main_state->tile->cur_pic->slicetype;
+        sub_state->tile->cur_pic->type = main_state->tile->cur_pic->type;
+        
+        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
+        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
+        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
+      }
+      encoder_state_encode(&main_state->children[i]);
+      //FIXME: substream_write_bitstream(subencoder, (main_state->children[i+1].encoder_control) != NULL);
+      
+      if (sub_state->tile != main_state->tile) {
+        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->y_recdata, sub_state, sub_state->tile->cur_pic->y_recdata, 1);
+        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->u_recdata, sub_state, sub_state->tile->cur_pic->u_recdata, 0);
+        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->v_recdata, sub_state, sub_state->tile->cur_pic->v_recdata, 0);
+      }
+    }
   } else {
-    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->tile->cur_pic->width/2;
-    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->tile->cur_pic->width/2;
+    switch (main_state->type) {
+      case ENCODER_STATE_TYPE_TILE:
+        encoder_state_encode_tile(main_state);
+        break;
+      default:
+        fprintf(stderr, "Unsupported leaf type %c!\n", main_state->type);
+        assert(0);
+    }
   }
-  
-  if (target_enc->children) {
-    //Use information from the source
-    width = MIN(source_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->width - source_offset_x);
-    height = MIN(source_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->height - source_offset_y);
-  } else {
-    //Use information from the target
-    width = MIN(target_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->width - target_offset_x);
-    height = MIN(target_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->height - target_offset_y);
-  }
-  
-  if (!is_y_channel) {
-    width /= 2;
-    height /= 2;
-    
-    source_stride /= 2;
-    target_stride /= 2;
-  }
-  
-  //picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_enc->cur_pic->width, target_enc->cur_pic->width);
-  picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_stride, target_stride);
 }
 
-void encode_one_frame(encoder_state * const main_state)
-{
+static void encoder_state_new_frame(encoder_state * const main_state) {
+  int i;
+  //FIXME Move this somewhere else!
+  if (main_state->type == ENCODER_STATE_TYPE_MAIN) {
+    const encoder_control * const encoder = main_state->encoder_control;
+    
+    const int is_first_frame = (main_state->global->frame == 0);
+    const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
+    const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
+    const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
+    
+    if (is_radl_frame) {
+      // Clear the reference list
+      encoder_state_clear_refs(main_state);
+
+      main_state->tile->cur_pic->slicetype = SLICE_I;
+      main_state->tile->cur_pic->type = NAL_IDR_W_RADL;
+    } else {
+      main_state->tile->cur_pic->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
+      main_state->tile->cur_pic->type = NAL_TRAIL_R;
+    }
+  } else {
+    //Clear the bitstream if it's not the main encoder
+    bitstream_clear(&main_state->stream);
+  }
+  
+  init_contexts(main_state, main_state->global->QP, main_state->tile->cur_pic->slicetype);
+  
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    encoder_state_new_frame(&main_state->children[i]);
+  }
+  
+
+}
+
+static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
   const encoder_control * const encoder = main_state->encoder_control;
   bitstream * const stream = &main_state->stream;
 
@@ -1187,6 +1231,8 @@ void encode_one_frame(encoder_state * const main_state)
   const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
   const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
   const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
+  
+  int i;
 
 
   /** IDR picture when: period == 0 and frame == 0
@@ -1243,49 +1289,12 @@ void encode_one_frame(encoder_state * const main_state)
     nal_write(stream,
               is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
   }
-
-  encode_slice_header(main_state);
-  bitstream_align(&main_state->stream);
-
   
-  if (main_state->children) {
-    int i;
-    //FIXME!
-    //This can be parallelized, we don't use a do...while loop because we use OpenMP
-    #pragma omp parallel for
-    for (i = 0; i < encoder->tiles_num_tile_rows * encoder->tiles_num_tile_columns; ++i) {
-      encoder_state *subencoder = &(main_state->children[i]);
-      
-      subencoder_blit_pixels(subencoder, subencoder->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
-      subencoder_blit_pixels(subencoder, subencoder->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
-      subencoder_blit_pixels(subencoder, subencoder->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
-      
-      //FIXME: remove this once these are in slice
-      subencoder->tile->cur_pic->slicetype = main_state->tile->cur_pic->slicetype;
-      subencoder->tile->cur_pic->type = main_state->tile->cur_pic->type;
-      
-      substream_encode(subencoder);
-      substream_write_bitstream(subencoder, (main_state->children[i+1].encoder_control) != NULL);
-      
-      subencoder_blit_pixels(main_state, main_state->tile->cur_pic->y_recdata, subencoder, subencoder->tile->cur_pic->y_recdata, 1);
-      subencoder_blit_pixels(main_state, main_state->tile->cur_pic->u_recdata, subencoder, subencoder->tile->cur_pic->u_recdata, 0);
-      subencoder_blit_pixels(main_state, main_state->tile->cur_pic->v_recdata, subencoder, subencoder->tile->cur_pic->v_recdata, 0);
-    }
-    
-    //We should do the slice header here, because we can have the entry points
-    
-    //This has to be serial
-    i = 0;
-    for (i = 0; main_state->children[i].encoder_control; ++i) {
-      //Append bitstream to main stream
-      bitstream_append(&main_state->stream, &main_state->children[i].stream);
-      bitstream_clear(&main_state->children[i].stream);
-    }
-    
-  } else {
-    //Encode the whole thing as one stream
-    substream_encode(main_state);
-    substream_write_bitstream(main_state, 0);
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+    //FIXME: Move this...
+    bitstream_clear(&main_state->children[i].stream);
   }
   
   // Calculate checksum
@@ -1295,6 +1304,81 @@ void encode_one_frame(encoder_state * const main_state)
   main_state->tile->cur_pic->poc = main_state->global->poc;
 }
 
+static void encoder_state_write_bitstream_tile(encoder_state * const encoder_state) {
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  const picture* const cur_pic = encoder_state->tile->cur_pic;
+  const int lcu_count = cur_pic->width_in_lcu * cur_pic->height_in_lcu;
+  int lcu_id;
+  vector2d lcu;
+  
+  for (lcu_id = 0; lcu_id < lcu_count; ++lcu_id) {
+    lcu.x = lcu_id % cur_pic->width_in_lcu;
+    lcu.y = lcu_id / cur_pic->width_in_lcu;
+    
+    //Write bitstream
+    if (encoder->sao_enable) {
+      encode_sao(encoder_state, lcu.x, lcu.y, &cur_pic->sao_luma[lcu.y * cur_pic->width_in_lcu + lcu.x], &cur_pic->sao_chroma[lcu.y * cur_pic->width_in_lcu + lcu.x]);
+    }
+    
+    encode_coding_tree(encoder_state, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0);
+
+    cabac_encode_bin_trm(&encoder_state->cabac, ((lcu_id == lcu_count - 1) && lcu_at_slice_end(encoder, lcu_id + encoder_state->tile->lcu_offset_in_ts)) ? 1 : 0);  // end_of_slice_segment_flag
+  }
+  if (!lcu_at_slice_end(encoder, encoder_state->tile->lcu_offset_in_ts + cur_pic->width_in_lcu * cur_pic->height_in_lcu - 1)) {
+    cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
+    cabac_flush(&encoder_state->cabac);
+  } else {
+    cabac_flush(&encoder_state->cabac);
+    bitstream_align(&encoder_state->stream);
+  }
+  //We do not handle tiles that have child encoder states for now
+  assert(!encoder_state->children[0].encoder_control);
+}
+
+static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
+  int i;
+  encode_slice_header(main_state);
+  bitstream_align(&main_state->stream); 
+  
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+    //FIXME: Move this...
+    bitstream_clear(&main_state->children[i].stream);
+  }
+}
+
+
+static void encoder_state_write_bitstream(encoder_state * const main_state) {
+  int i;
+  for (i=0; main_state->children[i].encoder_control; ++i) {
+    encoder_state *sub_state = &(main_state->children[i]);
+    encoder_state_write_bitstream(sub_state);
+  }
+  
+  switch (main_state->type) {
+    case ENCODER_STATE_TYPE_MAIN:
+      encoder_state_write_bitstream_main(main_state);
+      break;
+    case ENCODER_STATE_TYPE_TILE:
+      encoder_state_write_bitstream_tile(main_state);
+      break;
+    case ENCODER_STATE_TYPE_SLICE:
+      encoder_state_write_bitstream_slice(main_state);
+      break;
+    default:
+      fprintf(stderr, "Unsupported leaf type %c!\n", main_state->type);
+      assert(0);
+  }
+}
+
+void encode_one_frame(encoder_state * const main_state)
+{
+  encoder_state_new_frame(main_state);
+  encoder_state_encode(main_state);
+  encoder_state_write_bitstream(main_state);
+}
+
 static void fill_after_frame(unsigned height, unsigned array_width,
                              unsigned array_height, pixel *data)
 {

From 39d96e05466443716fc7a625a198a62083bfe7bf Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 10:58:35 +0200
Subject: [PATCH 16/21] Fix bug with cabac stream pointing to bad data

---
 src/encoder.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/encoder.c b/src/encoder.c
index 57da3a3c..65bc9e45 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -833,6 +833,7 @@ int encoder_state_init(encoder_state * const child_state, encoder_state * const
             for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
               child_state->children[i].children[j].parent = &child_state->children[i];
             }
+            child_state->children[i].cabac.stream = &child_state->children[i].stream;
           }
         }
         

From 0e6f1c99fc8e60fa04199faa26f34eb07f044709 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 11:18:10 +0200
Subject: [PATCH 17/21] Refactor picture to remove hidden dependency between
 slice and tiles

picture.type -> encoder_state->global->pictype
picture.slicetype -> encoder_state->global->slicetype
picture.slice_sao_luma_flag -> 1 (was constant)
picture.slice_sao_chroma_flag -> 1 (was constant)

This may be changed later. For now it's better to avoid having slice related stuff in picture.
---
 src/encmain.c   |  2 +-
 src/encoder.c   | 86 +++++++++++++++++--------------------------------
 src/encoder.h   |  4 +++
 src/picture.c   |  2 --
 src/picture.h   |  4 ---
 src/search.c    |  2 +-
 src/transform.c |  2 +-
 7 files changed, 36 insertions(+), 66 deletions(-)

diff --git a/src/encmain.c b/src/encmain.c
index 9170fc40..43101584 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -355,7 +355,7 @@ int main(int argc, char *argv[])
     temp_psnr[2] = image_psnr(cur_pic->v_data, cur_pic->v_recdata, cfg->width>>1, cfg->height>>1);
 
     fprintf(stderr, "POC %4d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f\n", encoder_state.global->frame,
-           "BPI"[cur_pic->slicetype%3], diff<<3,
+           "BPI"[encoder_state.global->slicetype%3], diff<<3,
            temp_psnr[0], temp_psnr[1], temp_psnr[2]);
 
     // Increment total PSNR
diff --git a/src/encoder.c b/src/encoder.c
index 65bc9e45..bae5d9f7 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -58,7 +58,6 @@ static void encode_sao(encoder_state *encoder,
  */
 void encoder_state_init_lambda(encoder_state * const encoder_state)
 {
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
   double qp = encoder_state->global->QP;
   double lambda_scale = 1.0;
   double qp_temp      = qp - 12;
@@ -67,13 +66,13 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
   // Default QP-factor from HM config
   double qp_factor = 0.4624;
 
-  if (cur_pic->slicetype == SLICE_I) {
+  if (encoder_state->global->slicetype == SLICE_I) {
     qp_factor=0.57*lambda_scale;
   }
 
   lambda = qp_factor*pow( 2.0, qp_temp/3.0 );
 
-  if (cur_pic->slicetype != SLICE_I ) {
+  if (encoder_state->global->slicetype != SLICE_I ) {
     lambda *= 0.95;
   }
 
@@ -1050,7 +1049,7 @@ static void encoder_state_encode_tile(encoder_state * const encoder_state) {
   yuv_t *ver_buf = yuv_t_alloc(LCU_WIDTH + 2);
   
   cabac_start(&encoder_state->cabac);
-  init_contexts(encoder_state, encoder_state->global->QP, encoder_state->tile->cur_pic->slicetype);
+  init_contexts(encoder_state, encoder_state->global->QP, encoder_state->global->slicetype);
 
   // Initialize lambda value(s) to use in search
   encoder_state_init_lambda(encoder_state);
@@ -1160,10 +1159,6 @@ static void encoder_state_encode(encoder_state * const main_state) {
       encoder_state *sub_state = &(main_state->children[i]);
       
       if (sub_state->tile != main_state->tile) {
-        //FIXME: remove this once these are in slice
-        sub_state->tile->cur_pic->slicetype = main_state->tile->cur_pic->slicetype;
-        sub_state->tile->cur_pic->type = main_state->tile->cur_pic->type;
-        
         encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
         encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
         encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
@@ -1198,24 +1193,24 @@ static void encoder_state_new_frame(encoder_state * const main_state) {
     const int is_first_frame = (main_state->global->frame == 0);
     const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
     const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
-    const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
+    main_state->global->is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
     
-    if (is_radl_frame) {
+    if (main_state->global->is_radl_frame) {
       // Clear the reference list
       encoder_state_clear_refs(main_state);
 
-      main_state->tile->cur_pic->slicetype = SLICE_I;
-      main_state->tile->cur_pic->type = NAL_IDR_W_RADL;
+      main_state->global->slicetype = SLICE_I;
+      main_state->global->pictype = NAL_IDR_W_RADL;
     } else {
-      main_state->tile->cur_pic->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
-      main_state->tile->cur_pic->type = NAL_TRAIL_R;
+      main_state->global->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
+      main_state->global->pictype = NAL_TRAIL_R;
     }
   } else {
     //Clear the bitstream if it's not the main encoder
     bitstream_clear(&main_state->stream);
   }
   
-  init_contexts(main_state, main_state->global->QP, main_state->tile->cur_pic->slicetype);
+  init_contexts(main_state, main_state->global->QP, main_state->global->slicetype);
   
   for (i = 0; main_state->children[i].encoder_control; ++i) {
     encoder_state_new_frame(&main_state->children[i]);
@@ -1228,25 +1223,10 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
   const encoder_control * const encoder = main_state->encoder_control;
   bitstream * const stream = &main_state->stream;
 
-  const int is_first_frame = (main_state->global->frame == 0);
-  const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
-  const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
-  const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
-  
   int i;
 
 
-  /** IDR picture when: period == 0 and frame == 0
-   *                    period == 1 && frame%2 == 0
-   *                    period != 0 && frame%period == 0
-   **/
-  if (is_radl_frame) {
-    // Clear the reference list
-    encoder_state_clear_refs(main_state);
-
-    main_state->tile->cur_pic->slicetype = SLICE_I;
-    main_state->tile->cur_pic->type = NAL_IDR_W_RADL;
-
+  if (main_state->global->is_radl_frame) {
     // Access Unit Delimiter (AUD)
     if (encoder->aud_enable)
       write_aud(main_state);
@@ -1273,10 +1253,6 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
       bitstream_align(stream);
     }
   } else {
-    // When intra period == 1, all pictures are intra
-    main_state->tile->cur_pic->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
-    main_state->tile->cur_pic->type = NAL_TRAIL_R;
-
     // Access Unit Delimiter (AUD)
     if (encoder->aud_enable)
       write_aud(main_state);
@@ -1285,10 +1261,10 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
   {
     // Not quite sure if this is correct, but it seems to have worked so far
     // so I tried to not change it's behavior.
-    int long_start_code = is_radl_frame || encoder->aud_enable ? 0 : 1;
+    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
 
     nal_write(stream,
-              is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
+              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
   }
   
   for (i = 0; main_state->children[i].encoder_control; ++i) {
@@ -1344,8 +1320,6 @@ static void encoder_state_write_bitstream_slice(encoder_state * const main_state
   for (i = 0; main_state->children[i].encoder_control; ++i) {
     //Append bitstream to main stream
     bitstream_append(&main_state->stream, &main_state->children[i].stream);
-    //FIXME: Move this...
-    bitstream_clear(&main_state->children[i].stream);
   }
 }
 
@@ -1494,9 +1468,8 @@ static void add_checksum(encoder_state * const encoder_state)
 void encode_access_unit_delimiter(encoder_state * const encoder_state)
 {
   bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-  uint8_t pic_type = cur_pic->slicetype == SLICE_I ? 0
-                   : cur_pic->slicetype == SLICE_P ? 1
+  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
+                   : encoder_state->global->slicetype == SLICE_P ? 1
                    :                                             2;
   WRITE_U(stream, pic_type, 3, "pic_type");
 }
@@ -1998,7 +1971,6 @@ void encode_slice_header(encoder_state * const encoder_state)
 {
   const encoder_control * const encoder = encoder_state->encoder_control;
   bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
 
 #ifdef _DEBUG
   printf("=========== Slice ===========\n");
@@ -2006,8 +1978,8 @@ void encode_slice_header(encoder_state * const encoder_state)
 
   WRITE_U(stream, 1, 1, "first_slice_segment_in_pic_flag");
 
-  if (cur_pic->type >= NAL_BLA_W_LP
-      && cur_pic->type <= NAL_RSV_IRAP_VCL23) {
+  if (encoder_state->global->pictype >= NAL_BLA_W_LP
+      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
     WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
   }
 
@@ -2015,7 +1987,7 @@ void encode_slice_header(encoder_state * const encoder_state)
 
   //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
 
-  WRITE_UE(stream, cur_pic->slicetype, "slice_type");
+  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
 
   // if !entropy_slice_flag
 
@@ -2023,8 +1995,8 @@ void encode_slice_header(encoder_state * const encoder_state)
       //WRITE_U(stream, 1, 1, "pic_output_flag");
     //end if
     //if( IdrPicFlag ) <- nal_unit_type == 5
-  if (cur_pic->type != NAL_IDR_W_RADL
-      && cur_pic->type != NAL_IDR_N_LP) {
+  if (encoder_state->global->pictype != NAL_IDR_W_RADL
+      && encoder_state->global->pictype != NAL_IDR_N_LP) {
       int j;
       int ref_negative = encoder_state->global->ref->used_size;
       int ref_positive = 0;
@@ -2045,17 +2017,17 @@ void encode_slice_header(encoder_state * const encoder_state)
     //end if
   //end if
   if (encoder->sao_enable) {
-    WRITE_U(stream, cur_pic->slice_sao_luma_flag, 1, "slice_sao_luma_flag");
-    WRITE_U(stream, cur_pic->slice_sao_chroma_flag, 1, "slice_sao_chroma_flag");
+    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
+    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
   }
 
-  if (cur_pic->slicetype != SLICE_I) {
+  if (encoder_state->global->slicetype != SLICE_I) {
       WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
         WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
       WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
   }
 
-  if (cur_pic->slicetype == SLICE_B) {
+  if (encoder_state->global->slicetype == SLICE_B) {
       WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
   }
 
@@ -2075,12 +2047,12 @@ static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
                              color_index color_i)
 {
   cabac_data * const cabac = &encoder_state->cabac;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
   sao_eo_cat i;
 
   // Skip colors with no SAO.
-  if (color_i == COLOR_Y && !cur_pic->slice_sao_luma_flag) return;
-  if (color_i != COLOR_Y && !cur_pic->slice_sao_chroma_flag) return;
+  //FIXME: for now, we always have SAO for all channels
+  if (color_i == COLOR_Y && 0) return;
+  if (color_i != COLOR_Y && 0) return;
 
   /// sao_type_idx_luma:   TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
   /// sao_type_idx_chroma: TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
@@ -2214,7 +2186,7 @@ void encode_coding_tree(encoder_state * const encoder_state,
 
 
     // Encode skip flag
-  if (cur_pic->slicetype != SLICE_I) {
+  if (encoder_state->global->slicetype != SLICE_I) {
     int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
     int ui;
     int16_t num_cand = MRG_MAX_NUM_CANDS;
@@ -2253,7 +2225,7 @@ void encode_coding_tree(encoder_state * const encoder_state,
   // ENDIF SKIP
 
   // Prediction mode
-  if (cur_pic->slicetype != SLICE_I) {
+  if (encoder_state->global->slicetype != SLICE_I) {
     cabac->ctx = &(cabac->ctx_cu_pred_mode_model);
     CABAC_BIN(cabac, (cur_cu->type == CU_INTRA), "PredMode");
   }
diff --git a/src/encoder.h b/src/encoder.h
index 7c80c5f1..688a7249 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -152,6 +152,10 @@ typedef struct {
   int8_t ref_list;
   //int8_t ref_idx_num[2];
   
+  int is_radl_frame;
+  uint8_t pictype;
+  uint8_t slicetype;
+  
 } encoder_state_config_global;
 
 typedef struct {
diff --git a/src/picture.c b/src/picture.c
index 034bb97d..ee0added 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -302,8 +302,6 @@ picture *picture_alloc(const int32_t width, const int32_t height,
 
   pic->coeff_y = NULL; pic->coeff_u = NULL; pic->coeff_v = NULL;
 
-  pic->slice_sao_luma_flag = 1;
-  pic->slice_sao_chroma_flag = 1;
   pic->sao_luma = MALLOC(sao_info, width_in_lcu * height_in_lcu);
   pic->sao_chroma = MALLOC(sao_info, width_in_lcu * height_in_lcu);
 
diff --git a/src/picture.h b/src/picture.h
index 117a69b6..f73859fd 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -126,10 +126,6 @@ typedef struct picture_struct
   uint8_t referenced;     //!< \brief Whether this picture is referenced.
   int32_t refcount;     //!< \brief Number of references in reflist to the picture
   cu_info* cu_array;     //!< \brief Info for each CU at each depth.
-  uint8_t type;
-  uint8_t slicetype;
-  uint8_t slice_sao_luma_flag;
-  uint8_t slice_sao_chroma_flag;
   struct sao_info_struct *sao_luma;   //!< \brief Array of sao parameters for every LCU.
   struct sao_info_struct *sao_chroma;   //!< \brief Array of sao parameters for every LCU.
   int32_t poc;           //!< \brief Picture order count
diff --git a/src/search.c b/src/search.c
index 99434380..ab7f5202 100644
--- a/src/search.c
+++ b/src/search.c
@@ -883,7 +883,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept
       y + cu_width <= cur_pic->height)
   {
 
-    if (cur_pic->slicetype != SLICE_I &&
+    if (encoder_state->global->slicetype != SLICE_I &&
         depth >= MIN_INTER_SEARCH_DEPTH &&
         depth <= MAX_INTER_SEARCH_DEPTH)
     {
diff --git a/src/transform.c b/src/transform.c
index ea3d9f40..dabf971f 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -646,7 +646,7 @@ void quant(const encoder_state * const encoder_state, int16_t *coef, int16_t *q_
 
   int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform
   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
-  int32_t add = ((encoder_state->tile->cur_pic->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9);
+  int32_t add = ((encoder_state->global->slicetype == SLICE_I) ? 171 : 85) << (q_bits - 9);
 
   int32_t q_bits8 = q_bits - 8;
   for (n = 0; n < width * height; n++) {

From b48a687d3c672c744f206c85d91f0176cdfc85e2 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 11:33:19 +0200
Subject: [PATCH 18/21] Restored parallelism, but it will be done in another
 way... OpenMP is not very efficient in this kind of dynamic situation

---
 src/encoder.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index bae5d9f7..e5b0047b 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1154,8 +1154,32 @@ static void encoder_state_encode_tile(encoder_state * const encoder_state) {
 static void encoder_state_encode(encoder_state * const main_state) {
   //If we have children, encode at child level
   if (main_state->children[0].encoder_control) {
-    int i=0;
-    for (i=0; main_state->children[i].encoder_control; ++i) {
+    int i=0, max_i=0;
+    //OpenMP doesn't like having a stop condition like main_state->children[i].encoder_control.
+    //We compute max_i to avoid this.
+    for (i=0; main_state->children[i].encoder_control; ++i);
+    max_i = i;
+    if (max_i > 1) {
+#pragma omp parallel for
+      for (i=0; i < max_i; ++i) {
+        encoder_state *sub_state = &(main_state->children[i]);
+        
+        if (sub_state->tile != main_state->tile) {
+          encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
+          encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
+          encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
+        }
+        encoder_state_encode(&main_state->children[i]);
+        //FIXME: substream_write_bitstream(subencoder, (main_state->children[i+1].encoder_control) != NULL);
+        
+        if (sub_state->tile != main_state->tile) {
+          encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->y_recdata, sub_state, sub_state->tile->cur_pic->y_recdata, 1);
+          encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->u_recdata, sub_state, sub_state->tile->cur_pic->u_recdata, 0);
+          encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->v_recdata, sub_state, sub_state->tile->cur_pic->v_recdata, 0);
+        }
+      }
+    } else {
+      i=0;
       encoder_state *sub_state = &(main_state->children[i]);
       
       if (sub_state->tile != main_state->tile) {

From 84e5dbee39098258565c29f04de8e1d26a3dd325 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 13:33:02 +0200
Subject: [PATCH 19/21] Remove quote from graphviz dump

---
 src/encoder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/encoder.c b/src/encoder.c
index e5b0047b..839ade9d 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -536,7 +536,7 @@ static void encoder_state_dump_graphviz(const encoder_state * const encoder_stat
     printf(" ]\n\n");
     
     printf(" \"Map\" [\n");
-    printf("  shape=plaintext\" [\n");
+    printf("  shape=plaintext [\n");
     printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
     printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
     for (y = 0; y < encoder->in.height_in_lcu; ++y) {

From 05eef82896c7cf13c6654964033d287b05c1052f Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Wed, 7 May 2014 13:40:29 +0200
Subject: [PATCH 20/21] Remove extra [ from graphviz dump

---
 src/encoder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/encoder.c b/src/encoder.c
index 839ade9d..3e4c363b 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -536,7 +536,7 @@ static void encoder_state_dump_graphviz(const encoder_state * const encoder_stat
     printf(" ]\n\n");
     
     printf(" \"Map\" [\n");
-    printf("  shape=plaintext [\n");
+    printf("  shape=plaintext\n");
     printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
     printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
     for (y = 0; y < encoder->in.height_in_lcu; ++y) {

From 535b42bc9b0440b069a2c7ff32f440286ae5608f Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 7 May 2014 14:10:22 +0300
Subject: [PATCH 21/21] Fix compilation for VS2010.

---
 build/C_Properties.props | 3 ++-
 src/encoder.c            | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/build/C_Properties.props b/build/C_Properties.props
index 934baef2..9d8881e9 100644
--- a/build/C_Properties.props
+++ b/build/C_Properties.props
@@ -14,7 +14,8 @@
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
       <PreprocessorDefinitions>WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <DisableSpecificWarnings>4244;4204;4206</DisableSpecificWarnings>
+      <DisableSpecificWarnings>4244;4204;4206;4028</DisableSpecificWarnings>
+      <OpenMPSupport>true</OpenMPSupport>
     </ClCompile>
     <Link>
       <AdditionalDependencies>Ws2_32.lib;%(AdditionalDependencies)</AdditionalDependencies>
diff --git a/src/encoder.c b/src/encoder.c
index 3e4c363b..02630e57 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1179,8 +1179,9 @@ static void encoder_state_encode(encoder_state * const main_state) {
         }
       }
     } else {
+      encoder_state *sub_state;
       i=0;
-      encoder_state *sub_state = &(main_state->children[i]);
+      sub_state = &(main_state->children[i]);
       
       if (sub_state->tile != main_state->tile) {
         encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);