From 5ed69b063bb2d8ffc25e69cb7ec2b6a9970a1348 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Mon, 2 Jun 2014 11:20:20 +0200
Subject: [PATCH 01/15] Strategy selector for array_checksum, basic
 implementation using precomputed 256*256 block with larger accesses than byte

---
 src/nal.c                    |  35 +--------
 src/strategies/nal-generic.c | 145 +++++++++++++++++++++++++++++++++++
 src/strategies/nal.c         |  14 ++++
 src/strategies/nal.h         |  39 ++++++++++
 src/strategyselector.c       |   6 ++
 src/strategyselector.h       |   2 +
 6 files changed, 207 insertions(+), 34 deletions(-)
 create mode 100644 src/strategies/nal-generic.c
 create mode 100644 src/strategies/nal.c
 create mode 100644 src/strategies/nal.h

diff --git a/src/nal.c b/src/nal.c
index 11907f30..2ca940f0 100644
--- a/src/nal.c
+++ b/src/nal.c
@@ -22,6 +22,7 @@
  */
 
 #include "nal.h"
+#include "strategyselector.h"
 
 #include <stdlib.h>
 #include <string.h>
@@ -64,40 +65,6 @@ void nal_write(bitstream * const bitstream, const uint8_t nal_type,
   bitstream_writebyte(bitstream, byte);
 }
 
-
-/**
- * \brief Calculate checksum for one color of the picture.
- * \param data Beginning of the pixel data for the picture.
- * \param height Height of the picture.
- * \param width Width of the picture.
- * \param stride Width of one row in the pixel array.
- */
-static void array_checksum(const pixel* data,
-                           const int height, const int width,
-                           const int stride,
-                           unsigned char checksum_out[SEI_HASH_MAX_LENGTH])
-{
-  uint8_t mask;
-  uint32_t checksum = 0;
-  int y, x;
-
-  assert(SEI_HASH_MAX_LENGTH >= 4);
-
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      mask = (uint8_t)((x & 0xff) ^ (y & 0xff) ^ (x >> 8) ^ (y >> 8));
-      checksum += (data[(y * stride) + x] & 0xff) ^ mask;
-    }
-  }
-
-  // Unpack uint into byte-array.
-  checksum_out[0] = (checksum >> 24) & 0xff;
-  checksum_out[1] = (checksum >> 16) & 0xff;
-  checksum_out[2] = (checksum >> 8) & 0xff;
-  checksum_out[3] = (checksum) & 0xff;
-}
-
-
 /*!
  \brief Calculate checksums for all colors of the picture.
  \param pic The picture that checksum is calculated for.
diff --git a/src/strategies/nal-generic.c b/src/strategies/nal-generic.c
new file mode 100644
index 00000000..e40282a9
--- /dev/null
+++ b/src/strategies/nal-generic.c
@@ -0,0 +1,145 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ */
+
+#include <stdlib.h>
+
+#include "nal.h"
+#include <assert.h>
+#include "../strategyselector.h"
+
+
+static void array_checksum_generic(const pixel* data,
+                                   const int height, const int width,
+                                   const int stride,
+                                   unsigned char checksum_out[SEI_HASH_MAX_LENGTH]) {
+  int x, y;
+  int checksum = 0;
+  
+  assert(SEI_HASH_MAX_LENGTH >= 4);
+  
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const uint8_t mask = (uint8_t)((x & 0xff) ^ (y & 0xff) ^ (x >> 8) ^ (y >> 8));
+      checksum += (data[(y * stride) + x] & 0xff) ^ mask;
+    }
+  }
+
+  // Unpack uint into byte-array.
+  checksum_out[0] = (checksum >> 24) & 0xff;
+  checksum_out[1] = (checksum >> 16) & 0xff;
+  checksum_out[2] = (checksum >> 8) & 0xff;
+  checksum_out[3] = (checksum) & 0xff;
+}
+
+static void array_checksum_generic4(const pixel* data,
+                                   const int height, const int width,
+                                   const int stride,
+                                   unsigned char checksum_out[SEI_HASH_MAX_LENGTH]) {
+  uint32_t checksum = 0;
+  int y, x, xp;
+  
+  static uint8_t ckmap_initialized = 0;
+  static uint32_t ckmap[64*256];
+  
+  if (!ckmap_initialized) {
+    uint8_t * const ckmap_uint8 = (uint8_t*)&ckmap;
+    int x, y;
+    for (y = 0; y < 256; ++y) {
+      for (x = 0; x < 256; ++x) {
+        ckmap_uint8[y*256+x] = x^y;
+      }
+    }
+    ckmap_initialized = 1;
+  }
+
+  assert(SEI_HASH_MAX_LENGTH >= 4);
+
+  for (y = 0; y < height; ++y) {
+    for (xp = 0; xp < width/4; ++xp) {
+      const int x = xp * 4;
+      const uint32_t mask = ckmap[(xp&63)+64*(y&255)] ^ (((x >> 8) ^ (y >> 8)) * 0x1010101);
+      const uint32_t cksumbytes = (*((uint32_t*)(&data[(y * stride) + x]))) ^ mask;
+      checksum += ((cksumbytes >> 24) & 0xff) + ((cksumbytes >> 16) & 0xff) + ((cksumbytes >> 8) & 0xff) + (cksumbytes & 0xff);
+    }
+    for (x = xp*4; x < width; ++x) {
+      uint8_t mask = (uint8_t)((x & 0xff) ^ (y & 0xff) ^ (x >> 8) ^ (y >> 8));
+      checksum += (data[(y * stride) + x] & 0xff) ^ mask;
+    }
+  }
+
+  // Unpack uint into byte-array.
+  checksum_out[0] = (checksum >> 24) & 0xff;
+  checksum_out[1] = (checksum >> 16) & 0xff;
+  checksum_out[2] = (checksum >> 8) & 0xff;
+  checksum_out[3] = (checksum) & 0xff;
+}
+
+static void array_checksum_generic8(const pixel* data,
+                                   const int height, const int width,
+                                   const int stride,
+                                   unsigned char checksum_out[SEI_HASH_MAX_LENGTH]) {
+  uint32_t checksum = 0;
+  int y, x, xp;
+  
+  static uint8_t ckmap_initialized = 0;
+  static uint64_t ckmap[32*256];
+  
+  if (!ckmap_initialized) {
+    uint8_t * const ckmap_uint8 = (uint8_t*)&ckmap;
+    int x, y;
+    for (y = 0; y < 256; ++y) {
+      for (x = 0; x < 256; ++x) {
+        ckmap_uint8[y*256+x] = x^y;
+      }
+    }
+    ckmap_initialized = 1;
+  }
+
+  assert(SEI_HASH_MAX_LENGTH >= 4);
+
+  for (y = 0; y < height; ++y) {
+    for (xp = 0; xp < width/8; ++xp) {
+      const int x = xp * 8;
+      const uint64_t mask = ckmap[(xp&31)+32*(y&255)] ^ ((uint64_t)((x >> 8) ^ (y >> 8)) * 0x101010101010101);
+      const uint64_t cksumbytes = (*((uint64_t*)(&data[(y * stride) + x]))) ^ mask;
+      checksum += ((cksumbytes >> 56) & 0xff) + ((cksumbytes >> 48) & 0xff) + ((cksumbytes >> 40) & 0xff) + ((cksumbytes >> 32) & 0xff) + ((cksumbytes >> 24) & 0xff) + ((cksumbytes >> 16) & 0xff) + ((cksumbytes >> 8) & 0xff) + (cksumbytes & 0xff);
+    }
+    for (x = xp*8; x < width; ++x) {
+      uint8_t mask = (uint8_t)((x & 0xff) ^ (y & 0xff) ^ (x >> 8) ^ (y >> 8));
+      checksum += (data[(y * stride) + x] & 0xff) ^ mask;
+    }
+  }
+
+  // Unpack uint into byte-array.
+  checksum_out[0] = (checksum >> 24) & 0xff;
+  checksum_out[1] = (checksum >> 16) & 0xff;
+  checksum_out[2] = (checksum >> 8) & 0xff;
+  checksum_out[3] = (checksum) & 0xff;
+}
+
+static int strategy_register_nal_generic(void* opaque) {
+  if (!strategyselector_register(opaque, "array_checksum", "generic", 0, &array_checksum_generic)) return 0;
+  if (!strategyselector_register(opaque, "array_checksum", "generic4", 1, &array_checksum_generic4)) return 0;
+  if (!strategyselector_register(opaque, "array_checksum", "generic8", 2, &array_checksum_generic8)) return 0;
+  return 1;
+}
diff --git a/src/strategies/nal.c b/src/strategies/nal.c
new file mode 100644
index 00000000..d0a15e48
--- /dev/null
+++ b/src/strategies/nal.c
@@ -0,0 +1,14 @@
+#include "nal.h"
+#include "nal-generic.c"
+
+void (*array_checksum)(const pixel* data,
+                       const int height, const int width,
+                       const int stride,
+                       unsigned char checksum_out[SEI_HASH_MAX_LENGTH]);
+
+
+static int strategy_register_nal(void* opaque) {
+  if (!strategy_register_nal_generic(opaque)) return 0;
+  
+  return 1;
+}
diff --git a/src/strategies/nal.h b/src/strategies/nal.h
new file mode 100644
index 00000000..c8baea3a
--- /dev/null
+++ b/src/strategies/nal.h
@@ -0,0 +1,39 @@
+#ifndef STRATEGIES_NAL_H_
+#define STRATEGIES_NAL_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "../nal.h"
+
+//Function pointer to array_checksum
+/**
+ * \brief Calculate checksum for one color of the picture.
+ * \param data Beginning of the pixel data for the picture.
+ * \param height Height of the picture.
+ * \param width Width of the picture.
+ * \param stride Width of one row in the pixel array.
+ */
+extern void (*array_checksum)(const pixel* data,
+                              const int height, const int width,
+                              const int stride,
+                              unsigned char checksum_out[SEI_HASH_MAX_LENGTH]);
+
+#define STRATEGIES_NAL_EXPORTS {"array_checksum", (void**) &array_checksum}
+
+#endif //STRATEGIES_NAL_H_
diff --git a/src/strategyselector.c b/src/strategyselector.c
index e4da6ea6..46b92610 100644
--- a/src/strategyselector.c
+++ b/src/strategyselector.c
@@ -33,6 +33,7 @@ static void* strategyselector_choose_for(const strategy_list * const strategies,
 
 //Strategies to include (add new file here)
 #include "strategies/picture.c"
+#include "strategies/nal.c"
 
 //Returns 1 if successful
 int strategyselector_init() {
@@ -51,6 +52,11 @@ int strategyselector_init() {
     return 0;
   }
   
+  if (!strategy_register_nal(&strategies)) {
+    fprintf(stderr, "strategy_register_nal failed!\n");
+    return 0;
+  }
+  
   while(cur_strategy_to_select->fptr) {
     *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type);
     
diff --git a/src/strategyselector.h b/src/strategyselector.h
index 514e900d..519526d1 100644
--- a/src/strategyselector.h
+++ b/src/strategyselector.h
@@ -127,9 +127,11 @@ int strategyselector_register(void *opaque, const char *type, const char *strate
 
 
 //Strategy to include
+#include "strategies/nal.h"
 #include "strategies/picture.h"
 
 static const strategy_to_select strategies_to_select[] = {
+  STRATEGIES_NAL_EXPORTS,
   STRATEGIES_PICTURE_EXPORTS,
   {NULL, NULL},
 };

From 58ad3b4d26bbe632fce679bff8e0da0e4aa51460 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 07:39:00 +0200
Subject: [PATCH 02/15] Log more performance data, plot also now many threads
 are running

---
 src/encoder.c                 |  32 ++++++++---
 tools/plot-threadqueue-log.py | 101 ++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 987fe398..886e747e 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1685,7 +1685,6 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
 
   int i;
 
-
   if (main_state->global->is_radl_frame) {
     // Access Unit Delimiter (AUD)
     if (encoder->aud_enable)
@@ -1726,17 +1725,24 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
     nal_write(stream,
               main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
   }
-  
+  {
+    PERFORMANCE_MEASURE_START();
   for (i = 0; main_state->children[i].encoder_control; ++i) {
     //Append bitstream to main stream
     bitstream_append(&main_state->stream, &main_state->children[i].stream);
     //FIXME: Move this...
     bitstream_clear(&main_state->children[i].stream);
   }
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  {
+    PERFORMANCE_MEASURE_START();
+    // Calculate checksum
+    add_checksum(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
   
-  // Calculate checksum
-  add_checksum(main_state);
-
   //FIXME: Why is this needed?
   main_state->tile->cur_pic->poc = main_state->global->poc;
 }
@@ -1817,9 +1823,19 @@ static void encoder_state_write_bitstream(encoder_state * const main_state) {
 
 void encode_one_frame(encoder_state * const main_state)
 {
-  encoder_state_new_frame(main_state);
-  encoder_state_encode(main_state);
-  encoder_state_write_bitstream(main_state);
+  {
+    PERFORMANCE_MEASURE_START();
+    encoder_state_new_frame(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=new_frame,frame=%d", main_state->global->frame);
+  }
+  {
+    PERFORMANCE_MEASURE_START();
+    encoder_state_encode(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
+  }
+  {
+    encoder_state_write_bitstream(main_state);
+  }
 }
 
 static void fill_after_frame(unsigned height, unsigned array_width,
diff --git a/tools/plot-threadqueue-log.py b/tools/plot-threadqueue-log.py
index 4a11e358..f1c181bd 100644
--- a/tools/plot-threadqueue-log.py
+++ b/tools/plot-threadqueue-log.py
@@ -98,6 +98,94 @@ class LogThread:
 
   def plot(self, ax, i):
     ax.barh(i, self._stop - self._start, left=self._start, height=0.9, align='center',label="test", color='yellow')
+    
+class IntervalThreadCounter:
+  def __init__(self):
+    self.interval_starts = []
+    self.interval_stops = []
+    
+  def add_interval(self, start, stop):
+    self.interval_starts.append(start)
+    self.interval_stops.append(stop)
+    self.interval_starts.sort()
+    self.interval_stops.sort()
+    
+  def get_values_xd(self):
+    #Double the first and the last items
+    xds = sorted([(x,'+') for x in self.interval_starts] + [(x,'-') for x in self.interval_stops])
+    return xds
+    
+  def get_values_x(self):
+    xs = []
+    for x in self.get_values_xd():
+      xs.append(x[0])
+      xs.append(x[0])
+    return xs
+  
+  def get_values_y(self):
+    xds = self.get_values_xd()
+    ys = []
+    counter = 0
+    for xd in xds:
+      ys.append(counter)
+      
+      if xd[1] == '+':
+        counter += 1
+      elif xd[1] == '-':
+        counter -= 1
+      else:
+        assert False
+      
+      ys.append(counter)
+      
+    return ys
+    
+  def clamp(self, v, minval, maxval):
+    if v < minval:
+      return minval
+    if v > maxval:
+      return maxval
+    return v
+    
+  def get_values_uniform_xy(self, kernel_size, steps):
+    kernel_size=float(kernel_size)
+    xchs = self.get_values_x()
+    ychs = self.get_values_y()
+    
+    minval = xchs[0] - kernel_size
+    maxval = xchs[-1] + kernel_size
+    
+    pos = minval
+    
+    xvalues = []
+    yvalues = []
+
+    while pos < maxval:
+      value = 0
+      for i in range(1,len(xchs)-1):
+        if xchs[i] < pos - kernel_size:
+          continue
+          
+        v1 = self.clamp(xchs[i-1], pos - kernel_size/2., pos+kernel_size/2.)
+        v2 = self.clamp(xchs[i], pos - kernel_size/2., pos+kernel_size/2.)
+        
+        diff=v2-v1
+        
+        value += diff*ychs[i]/kernel_size
+        
+        if xchs[i] > pos + kernel_size:
+          break
+          
+      xvalues.append(pos)
+      yvalues.append(value)
+      
+      pos += kernel_size/steps
+      
+    return xvalues, yvalues
+    
+      
+    
+    
 
 class LogParser:
   def _parse_time(self, base, sign, value):
@@ -231,6 +319,19 @@ class LogParser:
     ax=fig.gca()
 
     yticks = {}
+    
+    #first draw usage
+    itc = IntervalThreadCounter()
+    for o in self._objects:
+      if isinstance(o, LogJob) and o._is_thread_job:
+        itc.add_interval(o._start, o._stop)
+    
+    #exact plot
+    ax.plot(itc.get_values_x(), [y+1.5 for y in itc.get_values_y()])
+    vx,vy = itc.get_values_uniform_xy(0.01,10)
+    ax.plot(vx, [y+1.5 for y in vy], 'r')
+    
+    
 
     #first draw threads
     for o in self._objects:

From 5ee1319c089f1a3b2371bba195574db234cb655d Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 07:39:30 +0200
Subject: [PATCH 03/15] Altivec detection

---
 src/strategyselector.c | 50 ++++++++++++++++++++++++++++++++++++++++++
 src/strategyselector.h |  5 +++++
 2 files changed, 55 insertions(+)

diff --git a/src/strategyselector.c b/src/strategyselector.c
index 46b92610..cdb6433a 100644
--- a/src/strategyselector.c
+++ b/src/strategyselector.c
@@ -188,6 +188,43 @@ INLINE int get_cpuid(unsigned int level, unsigned int *eax, unsigned int *ebx, u
 
 #endif
 
+#if COMPILE_POWERPC
+#include <unistd.h>
+#include <fcntl.h>
+#include <linux/auxvec.h>
+#include <asm/cputable.h>
+
+//Source: http://freevec.org/function/altivec_runtime_detection_linux
+static int altivec_available(void)
+{
+    int result = 0;
+    unsigned long buf[64];
+    ssize_t count;
+    int fd, i;
+ 
+    fd = open("/proc/self/auxv", O_RDONLY);
+    if (fd < 0) {
+        return 0;
+    }
+    // loop on reading
+    do {
+        count = read(fd, buf, sizeof(buf));
+        if (count < 0)
+            break;
+        for (i=0; i < (count / sizeof(unsigned long)); i += 2) {
+            if (buf[i] == AT_HWCAP) {
+                result = !!(buf[i+1] & PPC_FEATURE_HAS_ALTIVEC);
+                goto out_close;
+            } else if (buf[i] == AT_NULL)
+                goto out_close;
+        }
+    } while (count == sizeof(buf));
+out_close:
+    close(fd);
+    return result;
+}
+#endif //COMPILE_POWERPC
+
 static void set_hardware_flags() {
   memset(&g_hardware_flags, 0, sizeof(g_hardware_flags));
   
@@ -252,4 +289,17 @@ static void set_hardware_flags() {
     fprintf(stderr, "\n");
   }
 #endif //COMPILE_INTEL
+
+#if COMPILE_POWERPC
+  g_hardware_flags.powerpc_flags.altivec = altivec_available();
+  
+  fprintf(stderr, "Compiled: PowerPC, flags:");
+#if COMPILE_POWERPC_ALTIVEC
+  fprintf(stderr, " AltiVec");
+#endif
+  fprintf(stderr, "\nRun on  : PowerPC, flags:");
+  if (g_hardware_flags.powerpc_flags.altivec) fprintf(stderr, " AltiVec");
+  fprintf(stderr, "\n");
+#endif
+  
 }
diff --git a/src/strategyselector.h b/src/strategyselector.h
index 519526d1..953a19fb 100644
--- a/src/strategyselector.h
+++ b/src/strategyselector.h
@@ -62,6 +62,11 @@
 
 #if defined (_M_PPC) || defined(__powerpc64__) || defined(__powerpc__)
 #define COMPILE_POWERPC 1
+#ifdef __ALTIVEC__
+#define COMPILE_POWERPC_ALTIVEC 1
+#else
+#define COMPILE_POWERPC_ALTIVEC 0
+#endif
 #else
 #define COMPILE_POWERPC 0
 #endif

From 0811dbcfbe2dac0530878bcecd3f1bc4f39c1d9d Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 09:45:17 +0200
Subject: [PATCH 04/15] Remove unneeded cond_broadcast. Limit contention

---
 src/threadqueue.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/src/threadqueue.c b/src/threadqueue.c
index 48536552..825cbd98 100644
--- a/src/threadqueue.c
+++ b/src/threadqueue.c
@@ -34,6 +34,7 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
 
   for(;;) {
     int task_id = -1, i = 0;
+    int signal_count = 0;
     
     PTHREAD_LOCK(&threadqueue->lock);
 
@@ -99,6 +100,8 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
       
       job->state = THREADQUEUE_JOB_STATE_DONE;
       
+      signal_count = 0;
+      
       //Decrease counter of dependencies
       for (i = 0; i < job->rdepends_count; ++i) {
         threadqueue_job * const depjob = job->rdepends[i];
@@ -109,6 +112,10 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
         assert(depjob->ndepends > 0);
         --depjob->ndepends;
         
+        if (depjob->ndepends == 0) {
+          ++signal_count;
+        }
+        
         PTHREAD_UNLOCK(&depjob->lock);
       }
       //Unlock the job
@@ -116,7 +123,7 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
       
       //Signal the queue that we've done a job
       PTHREAD_LOCK(&threadqueue->lock);
-      pthread_cond_broadcast(&threadqueue->cb_cond);
+      pthread_cond_signal(&threadqueue->cb_cond);
       PTHREAD_UNLOCK(&threadqueue->lock);
     } else {
       PTHREAD_UNLOCK(&threadqueue->lock);
@@ -436,11 +443,14 @@ threadqueue_job * threadqueue_submit(threadqueue_queue * const threadqueue, void
   
   ++threadqueue->queue_waiting;
   
-  //Hope a thread can do it...
-  if(pthread_cond_signal(&(threadqueue->cond)) != 0) {
-      fprintf(stderr, "pthread_cond_signal failed!\n");
-      assert(0);
-      return NULL;
+  if (job->ndepends == 0) {
+    //Hope a thread can do it...
+    fprintf(stderr, "Sent signal from submit\n");
+    if(pthread_cond_signal(&(threadqueue->cond)) != 0) {
+        fprintf(stderr, "pthread_cond_signal failed!\n");
+        assert(0);
+        return NULL;
+    }
   }
   
   PTHREAD_UNLOCK(&threadqueue->lock);
@@ -478,20 +488,25 @@ int threadqueue_job_dep_add(threadqueue_job *job, threadqueue_job *depends_on) {
 }
 
 int threadqueue_job_unwait_job(threadqueue_queue * const threadqueue, threadqueue_job *job) {
+  int ndepends = 0;
+  
   //NULL job =>  no threads, nothing to do
   if (!job) return 1;
   PTHREAD_LOCK(&job->lock);
   job->ndepends--;
+  ndepends = job->ndepends;
   PTHREAD_UNLOCK(&job->lock);
   
-  PTHREAD_LOCK(&threadqueue->lock);
-  //Hope a thread can do it...
-  if(pthread_cond_signal(&(threadqueue->cond)) != 0) {
-      fprintf(stderr, "pthread_cond_signal failed!\n");
-      assert(0);
-      return 0;
+  if (ndepends == 0) {
+    PTHREAD_LOCK(&threadqueue->lock);
+    //Hope a thread can do it...
+    if(pthread_cond_signal(&(threadqueue->cond)) != 0) {
+        fprintf(stderr, "pthread_cond_signal failed!\n");
+        assert(0);
+        return 0;
+    }
+    PTHREAD_UNLOCK(&threadqueue->lock);
   }
-  PTHREAD_UNLOCK(&threadqueue->lock);
   
   return 1;
 }

From 9bdecbe07171499e88d395cddd0700ac9d85c37e Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 10:26:15 +0200
Subject: [PATCH 05/15] Better thread scheduling

---
 src/threadqueue.c | 110 ++++++++++++++++++++++++++++++----------------
 src/threadqueue.h |   3 +-
 2 files changed, 73 insertions(+), 40 deletions(-)

diff --git a/src/threadqueue.c b/src/threadqueue.c
index 825cbd98..205b2212 100644
--- a/src/threadqueue.c
+++ b/src/threadqueue.c
@@ -18,64 +18,84 @@ typedef struct {
 
 #define THREADQUEUE_LIST_REALLOC_SIZE 32
 
+//#define PTHREAD_COND_SIGNAL(c) fprintf(stderr, "%s:%d pthread_cond_signal(%s=%p)\n", __FUNCTION__, __LINE__, #c, c); if (pthread_cond_signal((c)) != 0) { fprintf(stderr, "pthread_cond_signal(%s=%p) failed!\n", #c, c); assert(0); return 0; }
+//#define PTHREAD_COND_BROADCAST(c) fprintf(stderr, "%s:%d pthread_cond_broadcast(%s=%p)\n", __FUNCTION__, __LINE__, #c, c); if (pthread_cond_broadcast((c)) != 0) { fprintf(stderr, "pthread_cond_broadcast(%s=%p) failed!\n", #c, c); assert(0); return 0; }
+//#define PTHREAD_COND_WAIT(c,l) fprintf(stderr, "%s:%d pthread_cond_wait(%s=%p, %s=%p)\n", __FUNCTION__, __LINE__, #c, c, #l, l); if (pthread_cond_wait((c),(l)) != 0) { fprintf(stderr, "pthread_cond_wait(%s=%p, %s=%p) failed!\n", #c, c, #l, l); assert(0); return 0; } else {fprintf(stderr, "%s:%d pthread_cond_wait(%s=%p, %s=%p) (done)\n", __FUNCTION__, __LINE__, #c, c, #l, l);}
 //#define PTHREAD_LOCK(l) fprintf(stderr, "%s:%d pthread_mutex_lock(%s=%p) (try)\n", __FUNCTION__, __LINE__, #l, l); if (pthread_mutex_lock((l)) != 0) { fprintf(stderr, "pthread_mutex_lock(%s=%p) failed!\n", #l, l); assert(0); return 0; } else {fprintf(stderr, "%s:%d pthread_mutex_lock(%s=%p)\n", __FUNCTION__, __LINE__, #l, l);}
 //#define PTHREAD_UNLOCK(l) if (pthread_mutex_unlock((l)) != 0) { fprintf(stderr, "pthread_mutex_unlock(%s=%p) failed!\n", #l, l); assert(0); return 0; }  else {fprintf(stderr, "%s:%d pthread_mutex_unlock(%s=%p)\n", __FUNCTION__, __LINE__, #l, l);}
 
+#define PTHREAD_COND_SIGNAL(c) if (pthread_cond_signal((c)) != 0) { fprintf(stderr, "pthread_cond_signal(%s=%p) failed!\n", #c, c); assert(0); return 0; }
+#define PTHREAD_COND_BROADCAST(c) if (pthread_cond_broadcast((c)) != 0) { fprintf(stderr, "pthread_cond_broadcast(%s=%p) failed!\n", #c, c); assert(0); return 0; }
+#define PTHREAD_COND_WAIT(c,l) if (pthread_cond_wait((c),(l)) != 0) { fprintf(stderr, "pthread_cond_wait(%s=%p, %s=%p) failed!\n", #c, c, #l, l); assert(0); return 0; }
 #define PTHREAD_LOCK(l) if (pthread_mutex_lock((l)) != 0) { fprintf(stderr, "pthread_mutex_lock(%s) failed!\n", #l); assert(0); return 0; }
 #define PTHREAD_UNLOCK(l) if (pthread_mutex_unlock((l)) != 0) { fprintf(stderr, "pthread_mutex_unlock(%s) failed!\n", #l); assert(0); return 0; }
 
 static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
   threadqueue_worker_spec * const threadqueue_worker_spec = threadqueue_worker_spec_opaque;
   threadqueue_queue * const threadqueue = threadqueue_worker_spec->threadqueue;
+  threadqueue_job * next_job = NULL;
   
 #ifdef _DEBUG
   GET_TIME(&threadqueue->debug_clock_thread_start[threadqueue_worker_spec->worker_id]);
 #endif //_DEBUG
 
   for(;;) {
-    int task_id = -1, i = 0;
+    int i = 0;
     int signal_count = 0;
+    threadqueue_job * job_to_run = NULL;
     
     PTHREAD_LOCK(&threadqueue->lock);
 
-    while(!threadqueue->stop && threadqueue->queue_waiting == 0) {
-      if (pthread_cond_wait(&threadqueue->cond, &threadqueue->lock) != 0) {
+    while(!threadqueue->stop && threadqueue->queue_waiting_execution == 0 && !next_job) {
+      PTHREAD_COND_WAIT(&threadqueue->cond, &threadqueue->lock);
+      /*if (pthread_cond_wait(&threadqueue->cond, &threadqueue->lock) != 0) {
         fprintf(stderr, "pthread_cond_wait failed!\n");
         assert(0);
         return 0;
-      }
+      }*/
     }
-
+    
     if(threadqueue->stop) {
-        break;
+      if (next_job) {
+        PTHREAD_LOCK(&next_job->lock);
+        next_job->state = THREADQUEUE_JOB_STATE_QUEUED;
+        PTHREAD_UNLOCK(&next_job->lock);
+      }
+      break;
     }
     
     //Find a task (should be fast enough)
-    task_id = -1;
-    for (i = threadqueue->queue_start; i < threadqueue->queue_count; ++i) {
-      threadqueue_job * const job = threadqueue->queue[i];
-      PTHREAD_LOCK(&job->lock);
-      
-      if (job->state == THREADQUEUE_JOB_STATE_QUEUED && job->ndepends == 0) {
-        task_id = i;
-        break; //Task remains locked
+    job_to_run = NULL;
+    if (next_job) {
+      PTHREAD_LOCK(&next_job->lock);
+      assert(next_job->ndepends == 0);
+      job_to_run = next_job;
+     } else {
+      for (i = threadqueue->queue_start; i < threadqueue->queue_count; ++i) {
+        threadqueue_job * const i_job = threadqueue->queue[i];
+        PTHREAD_LOCK(&i_job->lock);
+        
+        if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
+          job_to_run = i_job;
+          break; //Task remains locked
+        }
+        
+        //Not this task, so unlock it
+        PTHREAD_UNLOCK(&i_job->lock);
       }
-      
-      //Not this task, so unlock it
-      PTHREAD_UNLOCK(&job->lock);
     }
     
     //Ok we got a job (and we have a lock on it)
-    if (task_id != -1) {
-      threadqueue_job * const job = threadqueue->queue[i];
+    if (job_to_run) {
+      threadqueue_job * const job = job_to_run;
 
-      assert(job->state == THREADQUEUE_JOB_STATE_QUEUED);
+      assert(job->state == THREADQUEUE_JOB_STATE_QUEUED || (job == next_job && job->state == THREADQUEUE_JOB_STATE_RUNNING));
       job->state = THREADQUEUE_JOB_STATE_RUNNING;
       
       //Move the queue_start "pointer" if needed
       while (threadqueue->queue_start < threadqueue->queue_count && threadqueue->queue[threadqueue->queue_start]->state != THREADQUEUE_JOB_STATE_QUEUED) threadqueue->queue_start++;
       
-      --threadqueue->queue_waiting;
+      if (!next_job) --threadqueue->queue_waiting_execution;
       
       //We can unlock the job here, since fptr and arg are constant
       PTHREAD_UNLOCK(&job->lock);
@@ -101,6 +121,7 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
       job->state = THREADQUEUE_JOB_STATE_DONE;
       
       signal_count = 0;
+      next_job = NULL;
       
       //Decrease counter of dependencies
       for (i = 0; i < job->rdepends_count; ++i) {
@@ -113,7 +134,15 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
         --depjob->ndepends;
         
         if (depjob->ndepends == 0) {
-          ++signal_count;
+          if (!next_job) {
+            next_job = depjob;
+            depjob->state = THREADQUEUE_JOB_STATE_RUNNING;
+          } else {
+            ++signal_count;
+            ++threadqueue->queue_waiting_execution;
+          }
+          assert(threadqueue->queue_waiting_dependency > 0);
+          --threadqueue->queue_waiting_dependency;
         }
         
         PTHREAD_UNLOCK(&depjob->lock);
@@ -123,6 +152,12 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
       
       //Signal the queue that we've done a job
       PTHREAD_LOCK(&threadqueue->lock);
+      for (i = 0; i < signal_count; ++i) {
+        PTHREAD_COND_SIGNAL(&threadqueue->cond);
+      }
+      //PTHREAD_COND_BROADCAST(&threadqueue->cond);
+      //Don't log this one
+      //PTHREAD_COND_SIGNAL(&threadqueue->cb_cond);
       pthread_cond_signal(&threadqueue->cb_cond);
       PTHREAD_UNLOCK(&threadqueue->lock);
     } else {
@@ -188,7 +223,8 @@ int threadqueue_init(threadqueue_queue * const threadqueue, int thread_count) {
   threadqueue->queue_size = 0;
   threadqueue->queue_count = 0;
   threadqueue->queue_start = 0;
-  threadqueue->queue_waiting = 0;
+  threadqueue->queue_waiting_execution = 0;
+  threadqueue->queue_waiting_dependency = 0;
   
   //Lock the queue before creating threads, to ensure they all have correct information
   PTHREAD_LOCK(&threadqueue->lock);
@@ -333,8 +369,8 @@ int threadqueue_flush(threadqueue_queue * const threadqueue) {
   PTHREAD_LOCK(&threadqueue->lock);
   
   do {
-    if (threadqueue->queue_waiting > 0) {
-      notdone = threadqueue->queue_waiting;
+    if (threadqueue->queue_waiting_execution + threadqueue->queue_waiting_dependency > 0) {
+      notdone = threadqueue->queue_waiting_execution + threadqueue->queue_waiting_dependency;
     } else {
       notdone = 0;
       for (i = 0; i < threadqueue->queue_count; ++i) {
@@ -364,7 +400,7 @@ int threadqueue_flush(threadqueue_queue * const threadqueue) {
   
   threadqueue_free_jobs(threadqueue);
 
-  assert(threadqueue->queue_waiting == 0);
+  assert(threadqueue->queue_waiting_dependency == 0 && threadqueue->queue_waiting_execution == 0);
 
   PTHREAD_UNLOCK(&threadqueue->lock);
 
@@ -441,16 +477,12 @@ threadqueue_job * threadqueue_submit(threadqueue_queue * const threadqueue, void
   }
   threadqueue->queue[threadqueue->queue_count++] = job;
   
-  ++threadqueue->queue_waiting;
-  
   if (job->ndepends == 0) {
+    ++threadqueue->queue_waiting_execution;
     //Hope a thread can do it...
-    fprintf(stderr, "Sent signal from submit\n");
-    if(pthread_cond_signal(&(threadqueue->cond)) != 0) {
-        fprintf(stderr, "pthread_cond_signal failed!\n");
-        assert(0);
-        return NULL;
-    }
+    PTHREAD_COND_SIGNAL(&(threadqueue->cond));
+  } else {
+    ++threadqueue->queue_waiting_dependency;
   }
   
   PTHREAD_UNLOCK(&threadqueue->lock);
@@ -499,12 +531,12 @@ int threadqueue_job_unwait_job(threadqueue_queue * const threadqueue, threadqueu
   
   if (ndepends == 0) {
     PTHREAD_LOCK(&threadqueue->lock);
+    assert(threadqueue->queue_waiting_dependency > 0);
+    --threadqueue->queue_waiting_dependency;
+    ++threadqueue->queue_waiting_execution;
     //Hope a thread can do it...
-    if(pthread_cond_signal(&(threadqueue->cond)) != 0) {
-        fprintf(stderr, "pthread_cond_signal failed!\n");
-        assert(0);
-        return 0;
-    }
+    PTHREAD_COND_SIGNAL(&(threadqueue->cond));
+    
     PTHREAD_UNLOCK(&threadqueue->lock);
   }
   
diff --git a/src/threadqueue.h b/src/threadqueue.h
index d07bd754..139a681a 100644
--- a/src/threadqueue.h
+++ b/src/threadqueue.h
@@ -75,7 +75,8 @@ typedef struct {
   unsigned int queue_start;
   unsigned int queue_count;
   unsigned int queue_size;
-  unsigned int queue_waiting;
+  unsigned int queue_waiting_execution; //Number of jobs without any dependency which could be run
+  unsigned int queue_waiting_dependency; //Number of jobs waiting for a dependency to complete
   
 #ifdef _DEBUG
   //Format: pointer <tab> worker id <tab> time enqueued <tab> time started <tab> time stopped <tab> time dequeued <tab> job description

From 12f5d17a24f5292aa9a46b6e956e63ad05d73556 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 13:24:18 +0200
Subject: [PATCH 06/15] yticks in plot

---
 tools/plot-threadqueue-log.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tools/plot-threadqueue-log.py b/tools/plot-threadqueue-log.py
index f1c181bd..0f9389af 100644
--- a/tools/plot-threadqueue-log.py
+++ b/tools/plot-threadqueue-log.py
@@ -331,6 +331,9 @@ class LogParser:
     vx,vy = itc.get_values_uniform_xy(0.01,10)
     ax.plot(vx, [y+1.5 for y in vy], 'r')
     
+    for y in set(itc.get_values_y()):
+      yticks[y+1.5] = '{0}'.format(y)
+    
     
 
     #first draw threads
@@ -357,9 +360,12 @@ class LogParser:
           ax.plot([o2._stop, o._start], [-int(o2.position_y()), -int(o.position_y())] , linewidth=1, color='k')
 
     for y in yticks.keys():
-      ax.axhline(y+0.5, color='k')
-      if y - 1 not in yticks.keys():
-        ax.axhline(y-0.5, color='k')
+      if y<1.5:
+        ax.axhline(y+0.5, color='k')
+        if y - 1 not in yticks.keys():
+          ax.axhline(y-0.5, color='k')
+      else:
+        ax.axhline(y, color='k', linestyle='dotted')
       if y == 1:
         yticks[y] = "None"
 

From 2e821b79a9809a90ff541950098907f330a66d4d Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 13:51:30 +0200
Subject: [PATCH 07/15] encoder_state in now in encoder_state.[ch]

---
 src/Makefile       |    2 +-
 src/context.h      |    1 +
 src/encoder.c      | 3206 -------------------------------------------
 src/encoder.h      |  167 ---
 src/encoderstate.c | 3236 ++++++++++++++++++++++++++++++++++++++++++++
 src/encoderstate.h |  204 +++
 src/filter.h       |    1 +
 src/inter.h        |    1 +
 src/intra.h        |    1 +
 src/rdo.h          |    1 +
 src/sao.h          |    1 +
 src/search.h       |    1 +
 src/transform.h    |    1 +
 13 files changed, 3449 insertions(+), 3374 deletions(-)
 create mode 100644 src/encoderstate.c
 create mode 100644 src/encoderstate.h

diff --git a/src/Makefile b/src/Makefile
index 3d5bf7ce..b70d4ab5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -53,7 +53,7 @@ CC = gcc
 CCFLAGS = $(DFLAGS) -I. -Wall -Wtype-limits
 LDFLAGS += -lm
 LD = gcc -pthread -lrt
-OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o encoder.o filter.o inter.o intra.o nal.o picture.o rdo.o sao.o scalinglist.o search.o strategyselector.o tables.o threadqueue.o transform.o
+OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o encoder.o encoderstate.o filter.o inter.o intra.o nal.o picture.o rdo.o sao.o scalinglist.o search.o strategyselector.o tables.o threadqueue.o transform.o
 PROG  = ./kvazaar
 PROGS = $(PROG)
 
diff --git a/src/context.h b/src/context.h
index 0b13bc02..d57ecfdf 100644
--- a/src/context.h
+++ b/src/context.h
@@ -27,6 +27,7 @@
 #include "global.h"
 
 #include "encoder.h"
+#include "encoderstate.h"
 #include "cabac.h"
 
 
diff --git a/src/encoder.c b/src/encoder.c
index 886e747e..96451f0e 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -44,156 +44,6 @@
 #include "rdo.h"
 
 /* Local functions. */
-static void add_checksum(encoder_state *encoder);
-static void encode_VUI(encoder_state *encoder);
-static void encode_sao(encoder_state *encoder,
-                       unsigned x_lcu, uint16_t y_lcu,
-                       sao_info *sao_luma, sao_info *sao_chroma);
-
-static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state);
-static void worker_encoder_state_write_bitstream_leaf(void * opaque);
-
-/*!
-  \brief Initializes lambda-value for current QP
-
-  Implementation closer to HM (Used HM12 as reference)
-   - Still missing functionality when GOP and B-pictures are used
- */
-void encoder_state_init_lambda(encoder_state * const encoder_state)
-{
-  double qp = encoder_state->global->QP;
-  double lambda_scale = 1.0;
-  double qp_temp      = qp - 12;
-  double lambda;
-
-  // Default QP-factor from HM config
-  double qp_factor = 0.4624;
-
-  if (encoder_state->global->slicetype == SLICE_I) {
-    qp_factor=0.57*lambda_scale;
-  }
-
-  lambda = qp_factor*pow( 2.0, qp_temp/3.0 );
-
-  if (encoder_state->global->slicetype != SLICE_I ) {
-    lambda *= 0.95;
-  }
-
-  encoder_state->global->cur_lambda_cost = lambda;
-}
-
-static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  int i;
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == 0) return 1;
-  for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
-  }
-  return 0;
-}
-
-static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  int i;
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
-  for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
-  }
-  return 0;
-}
-
-static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == 0) return 1;
-  if (encoder->tiles_tile_id[lcu_addr_in_ts - 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
-    return 1;
-  }
-  return 0;
-}
-
-static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
-  if (encoder->tiles_tile_id[lcu_addr_in_ts + 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
-    return 1;
-  }
-  return 0;
-}
-
-//Return 1 if the LCU is at the first row of a structure (tile or slice)
-static int lcu_in_first_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y) {
-    return 1;
-  }
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->start_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
-    return 1;
-  }
-  
-  //One row above is before the start of the slice => it's also a boundary
-  if (lcu_addr_in_rs - encoder_state->encoder_control->in.width_in_lcu < encoder_state->slice->start_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-//Return 1 if the LCU is at the first row of a structure (tile or slice)
-static int lcu_in_last_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y + encoder_state->tile->cur_pic->height_in_lcu - 1) {
-    return 1;
-  }
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->end_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
-    return 1;
-  }
-  
-  //One row below is before the end of the slice => it's also a boundary
-  if (lcu_addr_in_rs + encoder_state->encoder_control->in.width_in_lcu > encoder_state->slice->end_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-
-//Return 1 if the LCU is at the first column of a structure (tile or slice)
-static int lcu_in_first_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  //First column of tile?
-  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x) {
-    return 1;
-  }
-  
-  //Slice start may not be aligned with the tile, so we need to allow this
-  if (lcu_addr_in_rs == encoder_state->slice->start_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-//Return 1 if the LCU is at the last column of a structure (tile or slice)
-static int lcu_in_last_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  //First column of tile?
-  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x + encoder_state->tile->cur_pic->width_in_lcu - 1) {
-    return 1;
-  }
-  
-  //Slice start may not be aligned with the tile, so we need to allow this
-  if (lcu_addr_in_rs == encoder_state->slice->end_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
 
 int encoder_control_init(encoder_control * const encoder, const config * const cfg) {
   if (!cfg) {
@@ -515,3059 +365,3 @@ void encoder_control_input_init(encoder_control * const encoder,
   #endif
 }
 
-static int encoder_state_config_global_init(encoder_state * const encoder_state) {
-  encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
-  if(!encoder_state->global->ref) {
-    fprintf(stderr, "Failed to allocate the picture list!\n");
-    return 0;
-  }
-  encoder_state->global->ref_list = REF_PIC_LIST_0;
-  encoder_state->global->frame = 0;
-  encoder_state->global->poc = 0;
-  return 1;
-}
-
-static void encoder_state_config_global_finalize(encoder_state * const encoder_state) {
-  picture_list_destroy(encoder_state->global->ref);
-}
-
-
-
-static int encoder_state_config_tile_init(encoder_state * const encoder_state, 
-                                          const int lcu_offset_x, const int lcu_offset_y,
-                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
-  
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  encoder_state->tile->cur_pic = picture_alloc(width, height, width_in_lcu, height_in_lcu);
-
-  if (!encoder_state->tile->cur_pic) {
-    printf("Error allocating picture!\r\n");
-    return 0;
-  }
-  
-  // Init coeff data table
-  //FIXME: move them
-  encoder_state->tile->cur_pic->coeff_y = MALLOC(coefficient, width * height);
-  encoder_state->tile->cur_pic->coeff_u = MALLOC(coefficient, (width * height) >> 2);
-  encoder_state->tile->cur_pic->coeff_v = MALLOC(coefficient, (width * height) >> 2);
-  
-  encoder_state->tile->lcu_offset_x = lcu_offset_x;
-  encoder_state->tile->lcu_offset_y = lcu_offset_y;
-  
-  encoder_state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
-  
-  //Allocate buffers
-  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
-  encoder_state->tile->hor_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
-  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
-  encoder_state->tile->ver_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->height_in_lcu * encoder_state->tile->cur_pic->width_in_lcu);
-  
-  if (encoder->sao_enable) {
-    encoder_state->tile->hor_buf_before_sao = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
-  } else {
-    encoder_state->tile->hor_buf_before_sao = NULL;
-  }
-  
-  if (encoder->wpp) {
-    encoder_state->tile->wf_jobs = MALLOC(threadqueue_job*, encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
-    if (!encoder_state->tile->wf_jobs) {
-      printf("Error allocating wf_jobs array!\n");
-      return 0;
-    }
-  } else {
-    encoder_state->tile->wf_jobs = NULL;
-  }
-  
-  encoder_state->tile->id = encoder->tiles_tile_id[encoder_state->tile->lcu_offset_in_ts];
-  return 1;
-}
-
-static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) {
-  if (encoder_state->tile->hor_buf_before_sao) yuv_t_free(encoder_state->tile->hor_buf_before_sao);
-  
-  yuv_t_free(encoder_state->tile->hor_buf_search);
-  yuv_t_free(encoder_state->tile->ver_buf_search);
-  
-  picture_free(encoder_state->tile->cur_pic);
-  encoder_state->tile->cur_pic = NULL;
-  
-  FREE_POINTER(encoder_state->tile->wf_jobs);
-}
-
-static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
-                                          const int start_address_in_ts, const int end_address_in_ts) {
-  int i = 0, slice_found=0;
-  for (i = 0; i < encoder_state->encoder_control->slice_count; ++i) {
-    if (encoder_state->encoder_control->slice_addresses_in_ts[i] == start_address_in_ts) {
-      encoder_state->slice->id = i;
-      slice_found = 1;
-      break;
-    }
-  }
-  assert(slice_found);
-  encoder_state->slice->start_in_ts = start_address_in_ts;
-  encoder_state->slice->end_in_ts = end_address_in_ts;
-  
-  encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
-  encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
-  return 1;
-}
-
-static void encoder_state_config_slice_finalize(encoder_state * const encoder_state) {
-  //Nothing to do (yet?)
-}
-
-static int encoder_state_config_wfrow_init(encoder_state * const encoder_state, 
-                                          const int lcu_offset_y) {
-  
-  encoder_state->wfrow->lcu_offset_y = lcu_offset_y;
-  return 1;
-}
-
-static void encoder_state_config_wfrow_finalize(encoder_state * const encoder_state) {
-  //Nothing to do (yet?)
-}
-
-#ifdef _DEBUG
-static void encoder_state_dump_graphviz(const encoder_state * const encoder_state) {
-  int i;
-  
-  if (!encoder_state->parent) {
-    const encoder_control * const encoder = encoder_state->encoder_control;
-    int y,x;
-    //Empty lines (easier to copy-paste)
-    printf("\n\n\n\n\n");
-    //Some styling...
-    printf("digraph EncoderStates {\n");
-    printf(" fontname = \"Bitstream Vera Sans\"\n");
-    printf(" fontsize = 8\n\n");
-    printf(" node [\n");
-    printf("  fontname = \"Bitstream Vera Sans\"\n");
-    printf("  fontsize = 8\n");
-    printf("  shape = \"record\"\n");
-    printf(" ]\n\n");
-    printf(" edge [\n");
-    printf("  arrowtail = \"empty\"\n");
-    printf(" ]\n\n");
-    
-    printf(" \"Map\" [\n");
-    printf("  shape=plaintext\n");
-    printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        
-        printf("<td>%d</td>", lcu_id_rs);
-      }
-      printf("</tr>");
-    }
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>TS Map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
-        
-        printf("<td>%d</td>", lcu_id_ts);
-      }
-      printf("</tr>");
-    }
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Tile map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
-        
-        printf("<td>%d</td>", encoder->tiles_tile_id[lcu_id_ts]);
-      }
-      printf("</tr>");
-    }
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Slice map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
-        int slice_id = 0;
-        
-        //Not efficient, but who cares
-        for (i=0; i < encoder->slice_count; ++i) {
-          if (encoder->slice_addresses_in_ts[i] <= lcu_id_ts) {
-            slice_id = i;
-          }
-        }
-        
-        printf("<td>%d</td>", slice_id);
-      }
-      printf("</tr>");
-    }
-    printf("</table>>\n ]\n");
-  }
-  
-  printf(" \"%p\" [\n", encoder_state);
-  printf("  label = \"{encoder_state|");
-  printf("+ type=%c\\l", encoder_state->type);
-  if (!encoder_state->parent || encoder_state->global != encoder_state->parent->global) {
-    printf("|+ global\\l");
-  }
-  if (!encoder_state->parent || encoder_state->tile != encoder_state->parent->tile) {
-    printf("|+ tile\\l");
-    printf(" - id = %d\\l", encoder_state->tile->id);
-    printf(" - lcu_offset_x = %d\\l", encoder_state->tile->lcu_offset_x);
-    printf(" - lcu_offset_y = %d\\l", encoder_state->tile->lcu_offset_y);
-    printf(" - lcu_offset_in_ts = %d\\l", encoder_state->tile->lcu_offset_in_ts);
-  }
-  if (!encoder_state->parent || encoder_state->slice != encoder_state->parent->slice) {
-    printf("|+ slice\\l");
-    printf(" - id = %d\\l", encoder_state->slice->id);
-    printf(" - start_in_ts = %d\\l", encoder_state->slice->start_in_ts);
-    printf(" - end_in_ts = %d\\l", encoder_state->slice->end_in_ts);
-    printf(" - start_in_rs = %d\\l", encoder_state->slice->start_in_rs);
-    printf(" - end_in_rs = %d\\l", encoder_state->slice->end_in_rs);
-  }
-  if (!encoder_state->parent || encoder_state->wfrow != encoder_state->parent->wfrow) {
-    printf("|+ wfrow\\l");
-    printf(" - lcu_offset_y = %d\\l", encoder_state->wfrow->lcu_offset_y);
-  }
-  printf("}\"\n");
-  printf(" ]\n");
-  
-  if (encoder_state->parent) {
-    printf(" \"%p\" -> \"%p\"\n", encoder_state->parent, encoder_state);
-  }
-  
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    encoder_state_dump_graphviz(&encoder_state->children[i]);
-  }
-  
-  if (!encoder_state->parent) {
-    printf("}\n");
-    //Empty lines (easier to copy-paste)
-    printf("\n\n\n\n\n");
-  }
-}
-#endif //_DEBUG
-
-int encoder_state_init(encoder_state * const child_state, encoder_state * const parent_state) {
-  //We require that, if parent_state is NULL:
-  //child_state->encoder_control is set
-  //
-  //If parent_state is not NULL, the following variable should either be set to NULL,
-  //in order to inherit from parent, or should point to a valid structure:
-  //child_state->global
-  //child_state->tile
-  //child_state->slice
-  //child_state->wfrow
-  
-  child_state->parent = parent_state;
-  child_state->children = MALLOC(encoder_state, 1);
-  child_state->children[0].encoder_control = NULL;
-  
-  if (!parent_state) {
-    const encoder_control * const encoder = child_state->encoder_control;
-    child_state->type = ENCODER_STATE_TYPE_MAIN;
-    assert(child_state->encoder_control);
-    child_state->global = MALLOC(encoder_state_config_global, 1);
-    if (!child_state->global || !encoder_state_config_global_init(child_state)) {
-      fprintf(stderr, "Could not initialize encoder_state->global!\n");
-      return 0;
-    }
-    child_state->tile = MALLOC(encoder_state_config_tile, 1);
-    if (!child_state->tile || !encoder_state_config_tile_init(child_state, 0, 0, encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu)) {
-      fprintf(stderr, "Could not initialize encoder_state->tile!\n");
-      return 0;
-    }
-    child_state->slice = MALLOC(encoder_state_config_slice, 1);
-    if (!child_state->slice || !encoder_state_config_slice_init(child_state, 0, encoder->in.width_in_lcu * encoder->in.height_in_lcu - 1)) {
-      fprintf(stderr, "Could not initialize encoder_state->slice!\n");
-      return 0;
-    }
-    child_state->wfrow = MALLOC(encoder_state_config_wfrow, 1);
-    if (!child_state->wfrow || !encoder_state_config_wfrow_init(child_state, 0)) {
-      fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
-      return 0;
-    }
-  } else {
-    child_state->encoder_control = parent_state->encoder_control;
-    if (!child_state->global) child_state->global = parent_state->global;
-    if (!child_state->tile) child_state->tile = parent_state->tile;
-    if (!child_state->slice) child_state->slice = parent_state->slice;
-    if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow;
-  }
-  
-  //Allocate bitstream
-  if (child_state->type == ENCODER_STATE_TYPE_MAIN) {
-    //Main encoder outputs to file
-    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_FILE)) {
-      fprintf(stderr, "Could not initialize stream!\n");
-      return 0;
-    }
-    child_state->stream.file.output = child_state->encoder_control->out.file;
-  } else {
-    //Other encoders use a memory bitstream
-    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_MEMORY)) {
-      fprintf(stderr, "Could not initialize stream!\n");
-      return 0;
-    }
-  }
-  
-  // Set CABAC output bitstream
-  child_state->cabac.stream = &child_state->stream;
-  
-  //Create sub-encoders
-  {
-    const encoder_control * const encoder = child_state->encoder_control;
-    int child_count = 0;
-    //We first check the type of this element.
-    //If it's a MAIN, it can allow both slices or tiles as child
-    //If it's a TILE, it can allow slices as child, if its parent is not a slice, or wavefront rows if there is no other children
-    //If it's a SLICE, it can allow tiles as child, if its parent is not a tile, or wavefront rows if there is no other children
-    //If it's a WAVEFRONT_ROW, it doesn't allow any children
-    int children_allow_wavefront_row = 0;
-    int children_allow_slice = 0;
-    int children_allow_tile = 0;
-    int range_start;
-    
-    int start_in_ts, end_in_ts;
-    
-    switch(child_state->type) {
-      case ENCODER_STATE_TYPE_MAIN:
-        children_allow_slice = 1;
-        children_allow_tile = 1;
-        start_in_ts = 0;
-        end_in_ts = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
-        break;
-      case ENCODER_STATE_TYPE_SLICE:
-        assert(child_state->parent);
-        if (child_state->parent->type != ENCODER_STATE_TYPE_TILE) children_allow_tile = 1;
-        children_allow_wavefront_row = encoder->wpp;
-        start_in_ts = child_state->slice->start_in_ts;
-        end_in_ts = child_state->slice->end_in_ts;
-        break;
-      case ENCODER_STATE_TYPE_TILE:
-        assert(child_state->parent);
-        if (child_state->parent->type != ENCODER_STATE_TYPE_SLICE) children_allow_slice = 1;
-        children_allow_wavefront_row = encoder->wpp;
-        start_in_ts = child_state->tile->lcu_offset_in_ts;
-        end_in_ts = child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
-        break;
-      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
-        //GCC tries to be too clever...
-        start_in_ts = -1;
-        end_in_ts = -1;
-        break;
-      default:
-        fprintf(stderr, "Invalid encoder_state->type %d!\n", child_state->type);
-        assert(0);
-        return 0;
-    }
-    
-    range_start = start_in_ts;
-    //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
-    while (range_start < end_in_ts && (children_allow_slice || children_allow_tile)) {
-      encoder_state *new_child = NULL;
-      int range_end_slice = range_start; //Will be incremented to get the range of the "thing"
-      int range_end_tile = range_start; //Will be incremented to get the range of the "thing"
-      
-      int tile_allowed = lcu_at_tile_start(encoder, range_start) && children_allow_tile;
-      int slice_allowed = lcu_at_slice_start(encoder, range_start) && children_allow_slice;
-      
-      //Find the smallest structure following the cursor
-      if (slice_allowed) {
-        while(!lcu_at_slice_end(encoder, range_end_slice)) {
-          ++range_end_slice;
-        }
-      }
-      
-      if (tile_allowed) {
-        while(!lcu_at_tile_end(encoder, range_end_tile)) {
-          ++range_end_tile;
-        }
-      }
-      
-      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d end_in_ts=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed,end_in_ts);
-      
-      if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
-        //Create a slice
-        new_child = &child_state->children[child_count];
-        new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_SLICE;
-        new_child->global = child_state->global;
-        new_child->tile = child_state->tile;
-        new_child->wfrow = child_state->wfrow;
-        new_child->slice = MALLOC(encoder_state_config_slice, 1);
-        if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) {
-          fprintf(stderr, "Could not initialize encoder_state->slice!\n");
-          return 0;
-        }
-      }
-      
-      if ((!slice_allowed || (range_end_slice < range_end_tile)) && !new_child && tile_allowed) {
-        //Create a tile
-        int tile_id = encoder->tiles_tile_id[range_start];
-        int tile_x = tile_id % encoder->tiles_num_tile_columns;
-        int tile_y = tile_id / encoder->tiles_num_tile_columns;
-        
-        int lcu_offset_x = encoder->tiles_col_bd[tile_x];
-        int lcu_offset_y = encoder->tiles_row_bd[tile_y];
-        int width_in_lcu = encoder->tiles_col_bd[tile_x+1]-encoder->tiles_col_bd[tile_x];
-        int height_in_lcu = encoder->tiles_row_bd[tile_y+1]-encoder->tiles_row_bd[tile_y];
-        int width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - lcu_offset_x * LCU_WIDTH);
-        int height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - lcu_offset_y * LCU_WIDTH);
-        
-        new_child = &child_state->children[child_count];
-        new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_TILE;
-        new_child->global = child_state->global;
-        new_child->tile = MALLOC(encoder_state_config_tile, 1);
-        new_child->slice = child_state->slice;
-        new_child->wfrow = child_state->wfrow;
-        
-        if (!new_child->tile || !encoder_state_config_tile_init(new_child, lcu_offset_x, lcu_offset_y, width, height, width_in_lcu, height_in_lcu)) {
-          fprintf(stderr, "Could not initialize encoder_state->tile!\n");
-          return 0;
-        }
-      }
-      
-      if (new_child) {
-        child_state->children = realloc(child_state->children, sizeof(encoder_state) * (2+child_count));
-        child_state->children[1+child_count].encoder_control = NULL;
-        if (!child_state->children) {
-          fprintf(stderr, "Failed to allocate memory for children...\n");
-          return 0;
-        }
-
-        //Fix children parent (since we changed the address), except for the last one which is not ready yet
-        {
-          int i, j;
-          for (i = 0; child_state->children[i].encoder_control && i < child_count; ++i) {
-            for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
-              child_state->children[i].children[j].parent = &child_state->children[i];
-            }
-            for (j = 0; j < child_state->children[i].lcu_order_count; ++j) {
-              child_state->children[i].lcu_order[j].encoder_state = &child_state->children[i];
-            }
-            child_state->children[i].cabac.stream = &child_state->children[i].stream;
-          }
-        }
-        
-        if (!encoder_state_init(&child_state->children[child_count], child_state)) {
-          fprintf(stderr, "Unable to init child...\n");
-          return 0;
-        }
-        child_count += 1;
-      }
-      
-      range_start = MAX(range_end_slice, range_end_tile) + 1;
-    }
-    
-    //We create wavefronts only if we have no children
-    if (children_allow_wavefront_row && child_count == 0) {
-      int first_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
-      int last_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
-      int num_rows;
-      int i;
-      
-      assert(!(children_allow_slice || children_allow_tile));
-      assert(child_count == 0);
-      
-      for (i=start_in_ts; i<end_in_ts; ++i) {
-        const int row = encoder->tiles_ctb_addr_ts_to_rs[i] / encoder->in.width_in_lcu;
-        if (row < first_row) first_row = row;
-        if (row > last_row) last_row = row;
-      }
-      
-      num_rows = last_row - first_row + 1;
-      
-      //When entropy_coding_sync_enabled_flag is equal to 1 and the first coding tree block in a slice is not the first coding
-      //tree block of a row of coding tree blocks in a tile, it is a requirement of bitstream conformance that the last coding tree
-      //block in the slice shall belong to the same row of coding tree blocks as the first coding tree block in the slice.
-      
-      if (encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] % encoder->in.width_in_lcu != child_state->tile->lcu_offset_x) {
-        if (num_rows > 1) {
-          fprintf(stderr, "Invalid: first CTB in slice %d is not at the tile %d edge, and the slice spans on more than one row.\n", child_state->slice->id, child_state->tile->id);
-          return 0;
-        }
-      }
-      
-      //FIXME Do the same kind of check if we implement slice segments
-    
-      child_count = num_rows;
-      child_state->children = realloc(child_state->children, sizeof(encoder_state) * (num_rows + 1));
-      child_state->children[num_rows].encoder_control = NULL;
-      
-      for (i=0; i < num_rows; ++i) {
-        encoder_state *new_child = &child_state->children[i];
-        
-        new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
-        new_child->global = child_state->global;
-        new_child->tile = child_state->tile;
-        new_child->slice = child_state->slice;
-        new_child->wfrow = MALLOC(encoder_state_config_wfrow, 1);
-        
-        if (!new_child->wfrow || !encoder_state_config_wfrow_init(new_child, i)) {
-          fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
-          return 0;
-        }
-        
-        if (!encoder_state_init(new_child, child_state)) {
-          fprintf(stderr, "Unable to init child...\n");
-          return 0;
-        }
-      }
-    }
-    
-    child_state->is_leaf = (child_count == 0);
-    //This node is a leaf, compute LCU-order
-    if (child_state->is_leaf) {
-      //All LCU computations are relative to the tile
-      //Remark: this could be optimized, but since it's run only once, it's better to do it in a understandable way.
-      
-      //By default, the full tile
-      int i;
-      int lcu_id;
-      int lcu_start = 0;
-      //End is the element AFTER the end (iterate < lcu_end)
-      int lcu_end = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
-      
-      //Restrict to the current slice if needed
-      lcu_start = MAX(lcu_start, child_state->slice->start_in_ts - child_state->tile->lcu_offset_in_ts);
-      lcu_end = MIN(lcu_end, child_state->slice->end_in_ts - child_state->tile->lcu_offset_in_ts + 1);
-      
-      //Restrict to the current wavefront row if needed
-      if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-        lcu_start = MAX(lcu_start, (child_state->wfrow->lcu_offset_y) * child_state->tile->cur_pic->width_in_lcu);
-        lcu_end = MIN(lcu_end, (child_state->wfrow->lcu_offset_y + 1) * child_state->tile->cur_pic->width_in_lcu);
-      }
-      
-      child_state->lcu_order_count = lcu_end - lcu_start;
-      child_state->lcu_order = MALLOC(lcu_order_element, child_state->lcu_order_count);
-      assert(child_state->lcu_order);
-      
-      for (i = 0; i < child_state->lcu_order_count; ++i) {
-        lcu_id = lcu_start + i;
-        child_state->lcu_order[i].encoder_state = child_state;
-        child_state->lcu_order[i].id = lcu_id;
-        child_state->lcu_order[i].index = i;
-        child_state->lcu_order[i].position.x = lcu_id % child_state->tile->cur_pic->width_in_lcu;
-        child_state->lcu_order[i].position.y = lcu_id / child_state->tile->cur_pic->width_in_lcu;
-        child_state->lcu_order[i].position_px.x = child_state->lcu_order[i].position.x * LCU_WIDTH;
-        child_state->lcu_order[i].position_px.y = child_state->lcu_order[i].position.y * LCU_WIDTH;
-        child_state->lcu_order[i].size.x = MIN(LCU_WIDTH, encoder->in.width - (child_state->tile->lcu_offset_x * LCU_WIDTH + child_state->lcu_order[i].position_px.x));
-        child_state->lcu_order[i].size.y = MIN(LCU_WIDTH, encoder->in.height - (child_state->tile->lcu_offset_y * LCU_WIDTH + child_state->lcu_order[i].position_px.y));
-        child_state->lcu_order[i].first_row = lcu_in_first_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        child_state->lcu_order[i].last_row = lcu_in_last_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        child_state->lcu_order[i].first_column = lcu_in_first_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        child_state->lcu_order[i].last_column = lcu_in_last_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        
-        child_state->lcu_order[i].above = NULL;
-        child_state->lcu_order[i].below = NULL;
-        child_state->lcu_order[i].left = NULL;
-        child_state->lcu_order[i].right = NULL;
-        
-        if (!child_state->lcu_order[i].first_row) {
-          //Find LCU above
-          if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-            int j;
-            //For all previous wavefront rows
-            for (j=0; &child_state->parent->children[j] != child_state && child_state->parent->children[j].encoder_control; ++j) {
-              if (child_state->parent->children[j].wfrow->lcu_offset_y == child_state->wfrow->lcu_offset_y - 1) {
-                int k;
-                for (k=0; k < child_state->parent->children[j].lcu_order_count; ++k) {
-                  if (child_state->parent->children[j].lcu_order[k].position.x == child_state->lcu_order[i].position.x) {
-                    assert(child_state->parent->children[j].lcu_order[k].position.y == child_state->lcu_order[i].position.y - 1);
-                    child_state->lcu_order[i].above = &child_state->parent->children[j].lcu_order[k];
-                  }
-                }
-              }
-            }
-          } else {
-            child_state->lcu_order[i].above = &child_state->lcu_order[i-child_state->tile->cur_pic->width_in_lcu];
-          }
-          assert(child_state->lcu_order[i].above);
-          child_state->lcu_order[i].above->below = &child_state->lcu_order[i];
-        }
-        if (!child_state->lcu_order[i].first_column) {
-          child_state->lcu_order[i].left = &child_state->lcu_order[i-1];
-          assert(child_state->lcu_order[i].left->position.x == child_state->lcu_order[i].position.x - 1);
-          child_state->lcu_order[i].left->right = &child_state->lcu_order[i];
-        }
-      }
-    } else {
-      child_state->lcu_order_count = 0;
-      child_state->lcu_order = NULL;
-    }
-  }
-  
-  //Validate the structure
-  if (child_state->type == ENCODER_STATE_TYPE_TILE) {
-    if (child_state->tile->lcu_offset_in_ts < child_state->slice->start_in_ts) {
-      fprintf(stderr, "Tile %d starts before slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
-      return 0;
-    }
-    if (child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1 > child_state->slice->end_in_ts) {
-      fprintf(stderr, "Tile %d ends after slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
-      return 0;
-    }
-  }
-  
-  if (child_state->type == ENCODER_STATE_TYPE_SLICE) {
-    if (child_state->slice->start_in_ts < child_state->tile->lcu_offset_in_ts) {
-      fprintf(stderr, "Slice %d starts before tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
-      return 0;
-    }
-    if (child_state->slice->end_in_ts > child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1) {
-      fprintf(stderr, "Slice %d ends after tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
-      return 0;
-    }
-  }
-  
-  
-#ifdef _DEBUG
-  if (!parent_state) encoder_state_dump_graphviz(child_state);
-#endif //_DEBUG
-  return 1;
-}
-
-void encoder_state_finalize(encoder_state * const encoder_state) {
-  if (encoder_state->children) {
-    int i=0;
-    for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-      encoder_state_finalize(&encoder_state->children[i]);
-    }
-    
-    FREE_POINTER(encoder_state->children);
-  }
-  
-  FREE_POINTER(encoder_state->lcu_order);
-  encoder_state->lcu_order_count = 0;
-  
-  if (!encoder_state->parent || (encoder_state->parent->wfrow != encoder_state->wfrow)) {
-    encoder_state_config_wfrow_finalize(encoder_state);
-    FREE_POINTER(encoder_state->wfrow);
-  }
-  
-  if (!encoder_state->parent || (encoder_state->parent->slice != encoder_state->slice)) {
-    encoder_state_config_slice_finalize(encoder_state);
-    FREE_POINTER(encoder_state->slice);
-  }
-  
-  if (!encoder_state->parent || (encoder_state->parent->tile != encoder_state->tile)) {
-    encoder_state_config_tile_finalize(encoder_state);
-    FREE_POINTER(encoder_state->tile);
-  }
-  
-  if (!encoder_state->parent || (encoder_state->parent->global != encoder_state->global)) {
-    encoder_state_config_global_finalize(encoder_state);
-    FREE_POINTER(encoder_state->global);
-  }
-  
-  bitstream_finalize(&encoder_state->stream);
-}
-
-
-static void encoder_state_clear_refs(encoder_state *encoder_state) {
-  //FIXME: Do we need to handle children? At present they all share the same global
-  while (encoder_state->global->ref->used_size) {
-    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
-  }
-
-  encoder_state->global->poc = 0;
-}
-
-static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
-  const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
-  const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
-  
-  const int target_offset_x = target_enc->tile->lcu_offset_x * LCU_WIDTH;
-  const int target_offset_y = target_enc->tile->lcu_offset_y * LCU_WIDTH;
-  
-  int source_stride = source_enc->tile->cur_pic->width;
-  int target_stride = target_enc->tile->cur_pic->width;
-  
-  int width;
-  int height;
-  
-  int source_offset;
-  int target_offset;
-  
-  //Do nothing if the source and the destination is the same!
-  if (source_enc->tile == target_enc->tile) return;
-
-  if (is_y_channel) {
-    target_offset = source_offset_x + source_offset_y * target_enc->tile->cur_pic->width;
-    source_offset = target_offset_x + target_offset_y * source_enc->tile->cur_pic->width;
-  } else {
-    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->tile->cur_pic->width/2;
-    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->tile->cur_pic->width/2;
-  }
-  
-  if (target_enc->children) {
-    //Use information from the source
-    width = MIN(source_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->width - source_offset_x);
-    height = MIN(source_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->height - source_offset_y);
-  } else {
-    //Use information from the target
-    width = MIN(target_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->width - target_offset_x);
-    height = MIN(target_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->height - target_offset_y);
-  }
-  
-  if (!is_y_channel) {
-    width /= 2;
-    height /= 2;
-    
-    source_stride /= 2;
-    target_stride /= 2;
-  }
-  
-  //picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_enc->cur_pic->width, target_enc->cur_pic->width);
-  picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_stride, target_stride);
-}
-
-
-
-static void write_aud(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  encode_access_unit_delimiter(encoder_state);
-  nal_write(stream, AUD_NUT, 0, 1);
-  bitstream_align(stream);
-}
-
-static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, const lcu_order_element * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) {
-  picture* const cur_pic = encoder_state->tile->cur_pic;
-  
-  if (hor_buf) {
-    const int rdpx = lcu->position_px.x;
-    const int rdpy = lcu->position_px.y + lcu->size.y - 1;
-    const int by = lcu->position.y;
-    
-    //Copy the bottom row of this LCU to the horizontal buffer
-    picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx],
-                        &hor_buf->y[lcu->position_px.x + by * cur_pic->width],
-                        lcu->size.x, 1, cur_pic->width, cur_pic->width);
-    picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
-                        &hor_buf->u[lcu->position_px.x / 2 + by * cur_pic->width / 2],
-                        lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2);
-    picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
-                        &hor_buf->v[lcu->position_px.x / 2 + by * cur_pic->width / 2],
-                        lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2);
-  }
-  
-  if (ver_buf) {
-    const int rdpx = lcu->position_px.x + lcu->size.x - 1;
-    const int rdpy = lcu->position_px.y;
-    const int bx = lcu->position.x;
-    
-    
-    //Copy the right row of this LCU to the vertical buffer.
-    picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx],
-                        &ver_buf->y[lcu->position_px.y + bx * cur_pic->height],
-                        1, lcu->size.y, cur_pic->width, 1);
-    picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
-                        &ver_buf->u[lcu->position_px.y / 2 + bx * cur_pic->height / 2],
-                        1, lcu->size.y / 2, cur_pic->width / 2, 1);
-    picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
-                        &ver_buf->v[lcu->position_px.y / 2 + bx * cur_pic->height / 2],
-                        1, lcu->size.y / 2, cur_pic->width / 2, 1);
-  }
-  
-}
-
-
-static void worker_encoder_state_encode_lcu(void * opaque) {
-  const lcu_order_element * const lcu = opaque;
-  encoder_state *encoder_state = lcu->encoder_state;
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  picture* const cur_pic = encoder_state->tile->cur_pic;
-  
-  //This part doesn't write to bitstream, it's only search, deblock and sao
-  
-  search_lcu(encoder_state, lcu->position_px.x, lcu->position_px.y, encoder_state->tile->hor_buf_search, encoder_state->tile->ver_buf_search);
-    
-  encoder_state_recdata_to_bufs(encoder_state, lcu, encoder_state->tile->hor_buf_search, encoder_state->tile->ver_buf_search);
-
-  if (encoder->deblock_enable) {
-    filter_deblock_lcu(encoder_state, lcu->position_px.x, lcu->position_px.y);
-  }
-
-  if (encoder->sao_enable) {
-    const int stride = cur_pic->width_in_lcu;
-    sao_info *sao_luma = &cur_pic->sao_luma[lcu->position.y * stride + lcu->position.x];
-    sao_info *sao_chroma = &cur_pic->sao_chroma[lcu->position.y * stride + lcu->position.x];
-    init_sao_info(sao_luma);
-    init_sao_info(sao_chroma);
-
-    {
-      sao_info *sao_top =  lcu->position.y != 0 ? &cur_pic->sao_luma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL;
-      sao_info *sao_left = lcu->position.x != 0 ? &cur_pic->sao_luma[lcu->position.y * stride + lcu->position.x -1] : NULL;
-      sao_search_luma(encoder_state, cur_pic, lcu->position.x, lcu->position.y, sao_luma, sao_top, sao_left);
-    }
-
-    {
-      sao_info *sao_top =  lcu->position.y != 0 ? &cur_pic->sao_chroma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL;
-      sao_info *sao_left = lcu->position.x != 0 ? &cur_pic->sao_chroma[lcu->position.y * stride + lcu->position.x - 1] : NULL;
-      sao_search_chroma(encoder_state, cur_pic, lcu->position.x, lcu->position.y, sao_chroma, sao_top, sao_left);
-    }
-
-    // Merge only if both luma and chroma can be merged
-    sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag;
-    sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag;
-  }
-  
-  
-  //Now write data to bitstream (required to have a correct CABAC state)
-  
-  //First LCU, and we are in a slice. We need a slice header
-  if (encoder_state->type == ENCODER_STATE_TYPE_SLICE && lcu->index == 0) {
-    encode_slice_header(encoder_state);
-    bitstream_align(&encoder_state->stream); 
-  }
-  
-  //Encode SAO
-  if (encoder->sao_enable) {
-    encode_sao(encoder_state, lcu->position.x, lcu->position.y, &cur_pic->sao_luma[lcu->position.y * cur_pic->width_in_lcu + lcu->position.x], &cur_pic->sao_chroma[lcu->position.y * cur_pic->width_in_lcu + lcu->position.x]);
-  }
-  
-  //Encode coding tree
-  encode_coding_tree(encoder_state, lcu->position.x << MAX_DEPTH, lcu->position.y << MAX_DEPTH, 0);
-
-  //Terminator
-  if (lcu->index < encoder_state->lcu_order_count - 1) {
-    //Since we don't handle slice segments, end of slice segment == end of slice
-    //Always 0 since otherwise it would be split
-    cabac_encode_bin_trm(&encoder_state->cabac, 0);  // end_of_slice_segment_flag
-  }
-  
-  //Wavefronts need the context to be copied to the next row
-  if (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && lcu->index == 1) {
-    int j;
-    //Find next encoder (next row)
-    for (j=0; encoder_state->parent->children[j].encoder_control; ++j) {
-      if (encoder_state->parent->children[j].wfrow->lcu_offset_y == encoder_state->wfrow->lcu_offset_y + 1) {
-        //And copy context
-        context_copy(&encoder_state->parent->children[j], encoder_state);
-      }
-    }
-  }
-  
-  if (encoder->sao_enable && lcu->above) {
-    //If we're not the first in the row
-    if (lcu->above->left) {
-      encoder_state_recdata_to_bufs(encoder_state, lcu->above->left, encoder_state->tile->hor_buf_before_sao, NULL);
-    }
-    //Latest LCU in the row, copy the data from the one above also
-    if (!lcu->right) {
-      encoder_state_recdata_to_bufs(encoder_state, lcu->above, encoder_state->tile->hor_buf_before_sao, NULL);
-    }
-  }
-}
-
-static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-#ifndef NDEBUG
-//  const unsigned long long int debug_bitstream_position = bitstream_tell(&(encoder_state->stream));
-#endif
-  
-  int i = 0;
-  
-  assert(encoder_state->is_leaf);
-  assert(encoder_state->lcu_order_count > 0);
-  
-  //If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, than we should not use parallelism
-  if (encoder_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !encoder_state->parent->children[1].encoder_control)) {
-    for (i = 0; i < encoder_state->lcu_order_count; ++i) {
-      PERFORMANCE_MEASURE_START();
-
-      worker_encoder_state_encode_lcu(&encoder_state->lcu_order[i]);
-
-#ifdef _DEBUG
-      {
-        const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
-        PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_lcu,frame=%d,tile=%d,slice=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
-      }
-#endif //_DEBUG
-    }
-    
-    if (encoder->sao_enable) {
-      PERFORMANCE_MEASURE_START();
-      sao_reconstruct_frame(encoder_state);
-      PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y);
-    }
-  } else {
-    for (i = 0; i < encoder_state->lcu_order_count; ++i) {
-      const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
-#ifdef _DEBUG
-      char job_description[256];
-      sprintf(job_description, "type=search_lcu,frame=%d,tile=%d,slice=%d,row=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->wfrow->lcu_offset_y, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
-#else
-      char* job_description = NULL;
-#endif
-      encoder_state->tile->wf_jobs[lcu->id] = threadqueue_submit(encoder_state->encoder_control->threadqueue, worker_encoder_state_encode_lcu, (void*)lcu, 1, job_description);
-      if (encoder_state->tile->wf_jobs[lcu->id]) {
-        if (lcu->position.x > 0) {
-          // Wait for the LCU on the left.
-          threadqueue_job_dep_add(encoder_state->tile->wf_jobs[lcu->id], encoder_state->tile->wf_jobs[lcu->id - 1]);
-        }
-        if (lcu->position.y > 0) {
-          if (lcu->position.x < encoder_state->tile->cur_pic->width_in_lcu - 1) {
-            // Wait for the LCU to the top-right of this one.
-            threadqueue_job_dep_add(encoder_state->tile->wf_jobs[lcu->id], encoder_state->tile->wf_jobs[lcu->id - encoder_state->tile->cur_pic->width_in_lcu + 1]);
-          } else {
-            // If there is no top-right LCU, wait for the one above.
-            threadqueue_job_dep_add(encoder_state->tile->wf_jobs[lcu->id], encoder_state->tile->wf_jobs[lcu->id - encoder_state->tile->cur_pic->width_in_lcu]);
-          }
-        }
-        threadqueue_job_unwait_job(encoder_state->encoder_control->threadqueue, encoder_state->tile->wf_jobs[lcu->id]);
-      }
-    }
-  }
-
-
-  
-  //We should not have written to bitstream!
-//  assert(debug_bitstream_position == bitstream_tell(&(encoder_state->stream)));
-}
-
-static void encoder_state_encode(encoder_state * const main_state);
-
-static void worker_encoder_state_encode_children(void * opaque) {
-  encoder_state *sub_state = opaque;
-  encoder_state_encode(sub_state);
-  if (sub_state->is_leaf) {
-    if (sub_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-      PERFORMANCE_MEASURE_START();
-      encoder_state_write_bitstream_leaf(sub_state);
-      PERFORMANCE_MEASURE_END(sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position.y + sub_state->tile->lcu_offset_y, sub_state->lcu_order[sub_state->lcu_order_count-1].position.y + sub_state->tile->lcu_offset_y);
-    } else {
-      threadqueue_job *job;
-#ifdef _DEBUG
-      char job_description[256];
-      sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->wfrow->lcu_offset_y);
-#else
-      char* job_description = NULL;
-#endif
-      job = threadqueue_submit(sub_state->encoder_control->threadqueue, worker_encoder_state_write_bitstream_leaf, sub_state, 1, job_description);
-      threadqueue_job_dep_add(job, sub_state->tile->wf_jobs[sub_state->wfrow->lcu_offset_y * sub_state->tile->cur_pic->width_in_lcu + sub_state->lcu_order_count - 1]);
-      threadqueue_job_unwait_job(sub_state->encoder_control->threadqueue, job);
-      return;
-    }
-  }
-}
-
-typedef struct {
-  int y;
-  const encoder_state * encoder_state;
-} worker_sao_reconstruct_lcu_data;
-
-// ./kvazaar -i /scratch/h265-encode/pedestrian_area_1080p25.yuv  --input-res 1920x1080 -o /tmp/out.h265 --qp 23 -p 60  --frames 10
-// Processed 10 frames,    5063552 bits AVG PSNR: 42.9771 46.0609 48.0985
-// Total time: 19.440 s.
-void worker_sao_reconstruct_lcu(void *opaque) {
-  worker_sao_reconstruct_lcu_data *data = opaque;
-  picture * const cur_pic = data->encoder_state->tile->cur_pic;
-  unsigned stride = cur_pic->width_in_lcu;
-  int x;
-  
-  //TODO: copy only needed data
-  pixel *new_y_data = MALLOC(pixel, cur_pic->width * cur_pic->height);
-  pixel *new_u_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2);
-  pixel *new_v_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2);
-  
-  const int offset = cur_pic->width * (data->y*LCU_WIDTH);
-  const int offset_c = cur_pic->width/2 * (data->y*LCU_WIDTH_C);
-  int num_pixels = cur_pic->width * (LCU_WIDTH + 2);
-  
-  if (num_pixels + offset > cur_pic->width * cur_pic->height) {
-    num_pixels = cur_pic->width * cur_pic->height - offset;
-  }
-  
-  memcpy(&new_y_data[offset], &cur_pic->y_recdata[offset], sizeof(pixel) * num_pixels);
-  memcpy(&new_u_data[offset_c], &cur_pic->u_recdata[offset_c], sizeof(pixel) * num_pixels >> 2);
-  memcpy(&new_v_data[offset_c], &cur_pic->v_recdata[offset_c], sizeof(pixel) * num_pixels >> 2);
-  
-  if (data->y>0) {
-    //copy first row from buffer
-    memcpy(&new_y_data[cur_pic->width * (data->y*LCU_WIDTH-1)], &data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y-1)], cur_pic->width * sizeof(pixel));
-    memcpy(&new_u_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel));
-    memcpy(&new_v_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel));
-  }
-  //assertions to be sure everything's ok for the next line (don't bother with last one)
-  /*  These assertions may not be true if the row are not processed in order. To avoid having an artificial dependency between rows, it's better to remove them.
-  assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y)], &cur_pic->y_recdata[cur_pic->width * ((data->y + 1)*LCU_WIDTH-1)], cur_pic->width * sizeof(pixel))==0);
-  assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y)], &cur_pic->u_recdata[cur_pic->width/2 * ((data->y + 1)*LCU_WIDTH_C-1)], cur_pic->width/2 * sizeof(pixel))==0);
-  assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y)], &cur_pic->v_recdata[cur_pic->width/2 * ((data->y + 1)*LCU_WIDTH_C-1)], cur_pic->width/2 * sizeof(pixel))==0);*/
-
-  for (x = 0; x < cur_pic->width_in_lcu; x++) {
-  // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
-    sao_info *sao_luma = &cur_pic->sao_luma[data->y * stride + x];
-    sao_info *sao_chroma = &cur_pic->sao_chroma[data->y * stride + x];
-    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_y_data, x, data->y, sao_luma, COLOR_Y);
-    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_u_data, x, data->y, sao_chroma, COLOR_U);
-    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_v_data, x, data->y, sao_chroma, COLOR_V);
-  }
-  
-  free(new_y_data);
-  free(new_u_data);
-  free(new_v_data);
-
-  free(opaque);
-}
-
-
-static int tree_is_a_chain(const encoder_state * const encoder_state) {
-  if (!encoder_state->children[0].encoder_control) return 1;
-  if (encoder_state->children[1].encoder_control) return 0;
-  return tree_is_a_chain(&encoder_state->children[0]);
-}
-
-static void encoder_state_encode(encoder_state * const main_state) {
-  //If we have children, encode at child level
-  if (main_state->children[0].encoder_control) {
-    int i=0;
-    //If we have only one child, than it cannot be the last split in tree
-    int node_is_the_last_split_in_tree = (main_state->children[1].encoder_control != 0);
-    
-    for (i=0; main_state->children[i].encoder_control; ++i) {
-      encoder_state *sub_state = &(main_state->children[i]);
-      
-      if (sub_state->tile != main_state->tile) {
-        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
-        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
-        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
-      }
-      
-      //To be the last split, we require that every child is a chain
-      node_is_the_last_split_in_tree = node_is_the_last_split_in_tree && tree_is_a_chain(&main_state->children[i]);
-    }
-    //If it's the latest split point
-    if (node_is_the_last_split_in_tree) {
-      for (i=0; main_state->children[i].encoder_control; ++i) {
-        //If we don't have wavefronts, parallelize encoding of children.
-        if (main_state->children[i].type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-#ifdef _DEBUG
-          char job_description[256];
-          switch (main_state->children[i].type) {
-            case ENCODER_STATE_TYPE_TILE: 
-              sprintf(job_description, "frame=%d,tile=%d,row=%d-%d,position_x=%d,position_y=%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].tile->lcu_offset_x, main_state->children[i].tile->lcu_offset_y);
-              break;
-            case ENCODER_STATE_TYPE_SLICE:
-              sprintf(job_description, "frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
-              break;
-            default:
-              sprintf(job_description, "frame=%d,invalid", main_state->children[i].global->frame);
-              break;
-          }
-#else
-          char* job_description = NULL;
-#endif
-          threadqueue_submit(main_state->encoder_control->threadqueue, worker_encoder_state_encode_children, &(main_state->children[i]), 0, job_description);
-        } else {
-          //Wavefront rows have parallelism at LCU level, so we should not launch multiple threads here!
-          //FIXME: add an assert: we can only have wavefront children
-          worker_encoder_state_encode_children(&(main_state->children[i]));
-        }
-      }
-      
-      //If children are wavefront, we need to reconstruct SAO
-      if (main_state->encoder_control->sao_enable && main_state->children[0].type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-        int y;
-        picture * const cur_pic = main_state->tile->cur_pic;
-        threadqueue_job *previous_job = NULL;
-        
-        for (y = 0; y < cur_pic->height_in_lcu; ++y) {
-          worker_sao_reconstruct_lcu_data *data = MALLOC(worker_sao_reconstruct_lcu_data, 1);
-          threadqueue_job *job;
-#ifdef _DEBUG
-          char job_description[256];
-          sprintf(job_description, "frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y);
-#else
-          char* job_description = NULL;
-#endif
-          data->y = y;
-          data->encoder_state = main_state;
-          
-          job = threadqueue_submit(main_state->encoder_control->threadqueue, worker_sao_reconstruct_lcu, data, 1, job_description);
-          
-          if (previous_job) {
-            threadqueue_job_dep_add(job, previous_job);
-          }
-          previous_job = job;
-          
-          if (y < cur_pic->height_in_lcu - 1) {
-            //Not last row: depend on the last LCU of the row below
-            threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 1) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]);
-          } else {
-            //Last row: depend on the last LCU of the row
-            threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 0) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]);
-          }
-          threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, job);
-          
-        }
-      }
-      threadqueue_flush(main_state->encoder_control->threadqueue);
-    } else {
-      for (i=0; main_state->children[i].encoder_control; ++i) {
-        worker_encoder_state_encode_children(&(main_state->children[i]));
-      }
-    }
-    
-    for (i=0; main_state->children[i].encoder_control; ++i) {
-      encoder_state *sub_state = &(main_state->children[i]);
-      if (sub_state->tile != main_state->tile) {
-        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->y_recdata, sub_state, sub_state->tile->cur_pic->y_recdata, 1);
-        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->u_recdata, sub_state, sub_state->tile->cur_pic->u_recdata, 0);
-        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->v_recdata, sub_state, sub_state->tile->cur_pic->v_recdata, 0);
-      }
-    }
-  } else {
-    switch (main_state->type) {
-      case ENCODER_STATE_TYPE_TILE:
-      case ENCODER_STATE_TYPE_SLICE:
-      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
-        encoder_state_encode_leaf(main_state);
-        break;
-      default:
-        fprintf(stderr, "Unsupported leaf type %c!\n", main_state->type);
-        assert(0);
-    }
-  }
-}
-
-static void encoder_state_new_frame(encoder_state * const main_state) {
-  int i;
-  //FIXME Move this somewhere else!
-  if (main_state->type == ENCODER_STATE_TYPE_MAIN) {
-    const encoder_control * const encoder = main_state->encoder_control;
-    
-    const int is_first_frame = (main_state->global->frame == 0);
-    const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
-    const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
-    main_state->global->is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
-    
-    if (main_state->global->is_radl_frame) {
-      // Clear the reference list
-      encoder_state_clear_refs(main_state);
-
-      main_state->global->slicetype = SLICE_I;
-      main_state->global->pictype = NAL_IDR_W_RADL;
-    } else {
-      main_state->global->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
-      main_state->global->pictype = NAL_TRAIL_R;
-    }
-  } else {
-    //Clear the bitstream if it's not the main encoder
-    bitstream_clear(&main_state->stream);
-  }
-  
-  if (main_state->is_leaf) {
-    //Leaf states have cabac and context
-    cabac_start(&main_state->cabac);
-    init_contexts(main_state, main_state->global->QP, main_state->global->slicetype);
-
-    // Initialize lambda value(s) to use in search
-    encoder_state_init_lambda(main_state);
-  }
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    encoder_state_new_frame(&main_state->children[i]);
-  }
-  
-
-}
-
-static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
-  const encoder_control * const encoder = main_state->encoder_control;
-  bitstream * const stream = &main_state->stream;
-
-  int i;
-
-  if (main_state->global->is_radl_frame) {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      write_aud(main_state);
-
-    // Video Parameter Set (VPS)
-    nal_write(stream, NAL_VPS_NUT, 0, 1);
-    encode_vid_parameter_set(main_state);
-    bitstream_align(stream);
-
-    // Sequence Parameter Set (SPS)
-    nal_write(stream, NAL_SPS_NUT, 0, 1);
-    encode_seq_parameter_set(main_state);
-    bitstream_align(stream);
-
-    // Picture Parameter Set (PPS)
-    nal_write(stream, NAL_PPS_NUT, 0, 1);
-    encode_pic_parameter_set(main_state);
-    bitstream_align(stream);
-
-    if (main_state->global->frame == 0) {
-      // Prefix SEI
-      nal_write(stream, PREFIX_SEI_NUT, 0, 0);
-      encode_prefix_sei_version(main_state);
-      bitstream_align(stream);
-    }
-  } else {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      write_aud(main_state);
-  }
-
-  {
-    // Not quite sure if this is correct, but it seems to have worked so far
-    // so I tried to not change it's behavior.
-    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
-
-    nal_write(stream,
-              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
-  }
-  {
-    PERFORMANCE_MEASURE_START();
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-    //FIXME: Move this...
-    bitstream_clear(&main_state->children[i].stream);
-  }
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
-  }
-  
-  {
-    PERFORMANCE_MEASURE_START();
-    // Calculate checksum
-    add_checksum(main_state);
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
-  }
-  
-  //FIXME: Why is this needed?
-  main_state->tile->cur_pic->poc = main_state->global->poc;
-}
-
-static void worker_encoder_state_write_bitstream_leaf(void * opaque) {
-  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
-}
-
-static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  //Write terminator of the leaf
-  assert(encoder_state->is_leaf);
-  
-  //Last LCU
-  {
-    const lcu_order_element * const lcu = &encoder_state->lcu_order[encoder_state->lcu_order_count - 1];
-    const int lcu_addr_in_ts = lcu->id + encoder_state->tile->lcu_offset_in_ts;
-    const int end_of_slice_segment_flag = lcu_at_slice_end(encoder, lcu_addr_in_ts);
-  
-    cabac_encode_bin_trm(&encoder_state->cabac, end_of_slice_segment_flag);  // end_of_slice_segment_flag
-  
-    if (!end_of_slice_segment_flag) {
-      assert(lcu_at_tile_end(encoder, lcu_addr_in_ts) || lcu->position.x == (encoder_state->tile->cur_pic->width_in_lcu - 1));
-      cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
-      cabac_flush(&encoder_state->cabac);
-    } else {
-      cabac_flush(&encoder_state->cabac);
-      bitstream_align(&encoder_state->stream);
-    }
-  }
-}
-
-static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
-  //If it's not a leaf, a tile is "nothing". We only have to write sub elements
-  int i;
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-  }
-}
-
-static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
-  int i;
-  encode_slice_header(main_state);
-  bitstream_align(&main_state->stream); 
-  
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-  }
-}
-
-
-static void encoder_state_write_bitstream(encoder_state * const main_state) {
-  int i;
-  if (!main_state->is_leaf) {
-    for (i=0; main_state->children[i].encoder_control; ++i) {
-      encoder_state *sub_state = &(main_state->children[i]);
-      encoder_state_write_bitstream(sub_state);
-    }
-    
-    switch (main_state->type) {
-      case ENCODER_STATE_TYPE_MAIN:
-        encoder_state_write_bitstream_main(main_state);
-        break;
-      case ENCODER_STATE_TYPE_TILE:
-        encoder_state_write_bitstream_tile(main_state);
-        break;
-      case ENCODER_STATE_TYPE_SLICE:
-        encoder_state_write_bitstream_slice(main_state);
-        break;
-      default:
-        fprintf(stderr, "Unsupported node type %c!\n", main_state->type);
-        assert(0);
-    }
-  }
-}
-
-void encode_one_frame(encoder_state * const main_state)
-{
-  {
-    PERFORMANCE_MEASURE_START();
-    encoder_state_new_frame(main_state);
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=new_frame,frame=%d", main_state->global->frame);
-  }
-  {
-    PERFORMANCE_MEASURE_START();
-    encoder_state_encode(main_state);
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
-  }
-  {
-    encoder_state_write_bitstream(main_state);
-  }
-}
-
-static void fill_after_frame(unsigned height, unsigned array_width,
-                             unsigned array_height, pixel *data)
-{
-  pixel* p = data + height * array_width;
-  pixel* end = data + array_width * array_height;
-
-  while (p < end) {
-    // Fill the line by copying the line above.
-    memcpy(p, p - array_width, array_width);
-    p += array_width;
-  }
-}
-
-static int read_and_fill_frame_data(FILE *file,
-                                    unsigned width, unsigned height,
-                                    unsigned array_width, pixel *data)
-{
-  pixel* p = data;
-  pixel* end = data + array_width * height;
-  pixel fill_char;
-  unsigned i;
-
-  while (p < end) {
-    // Read the beginning of the line from input.
-    if (width != fread(p, sizeof(unsigned char), width, file))
-      return 0;
-
-    // Fill the rest with the last pixel value.
-    fill_char = p[width - 1];
-
-    for (i = width; i < array_width; ++i) {
-      p[i] = fill_char;
-    }
-
-    p += array_width;
-  }
-  return 1;
-}
-
-int read_one_frame(FILE* file, const encoder_state * const encoder_state)
-{
-  unsigned width = encoder_state->encoder_control->in.real_width;
-  unsigned height = encoder_state->encoder_control->in.real_height;
-  unsigned array_width = encoder_state->tile->cur_pic->width;
-  unsigned array_height = encoder_state->tile->cur_pic->height;
-
-  if (width != array_width) {
-    // In the case of frames not being aligned on 8 bit borders, bits need to be copied to fill them in.
-    if (!read_and_fill_frame_data(file, width, height, array_width,
-                                  encoder_state->tile->cur_pic->y_data) ||
-        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
-                                  encoder_state->tile->cur_pic->u_data) ||
-        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
-                                  encoder_state->tile->cur_pic->v_data))
-      return 0;
-  } else {
-    // Otherwise the data can be read directly to the array.
-    unsigned y_size = width * height;
-    unsigned uv_size = (width >> 1) * (height >> 1);
-    if (y_size  != fread(encoder_state->tile->cur_pic->y_data, sizeof(unsigned char),
-                         y_size, file) ||
-        uv_size != fread(encoder_state->tile->cur_pic->u_data, sizeof(unsigned char),
-                         uv_size, file) ||
-        uv_size != fread(encoder_state->tile->cur_pic->v_data, sizeof(unsigned char),
-                         uv_size, file))
-      return 0;
-  }
-
-  if (height != array_height) {
-    fill_after_frame(height, array_width, array_height,
-                     encoder_state->tile->cur_pic->y_data);
-    fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
-                     encoder_state->tile->cur_pic->u_data);
-    fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
-                     encoder_state->tile->cur_pic->v_data);
-  }
-  return 1;
-}
-
-/**
- * \brief Add a checksum SEI message to the bitstream.
- * \param encoder The encoder.
- * \returns Void
- */
-static void add_checksum(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-  unsigned char checksum[3][SEI_HASH_MAX_LENGTH];
-  uint32_t checksum_val;
-  unsigned int i;
-
-  nal_write(stream, NAL_SUFFIT_SEI_NUT, 0, 0);
-
-  picture_checksum(cur_pic, checksum);
-
-  WRITE_U(stream, 132, 8, "sei_type");
-  WRITE_U(stream, 13, 8, "size");
-  WRITE_U(stream, 2, 8, "hash_type"); // 2 = checksum
-
-  for (i = 0; i < 3; ++i) {
-    // Pack bits into a single 32 bit uint instead of pushing them one byte
-    // at a time.
-    checksum_val = (checksum[i][0] << 24) + (checksum[i][1] << 16) +
-                   (checksum[i][2] << 8) + (checksum[i][3]);
-    WRITE_U(stream, checksum_val, 32, "picture_checksum");
-  }
-
-  bitstream_align(stream);
-}
-
-void encode_access_unit_delimiter(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
-                   : encoder_state->global->slicetype == SLICE_P ? 1
-                   :                                             2;
-  WRITE_U(stream, pic_type, 3, "pic_type");
-}
-
-void encode_prefix_sei_version(encoder_state * const encoder_state)
-{
-#define STR_BUF_LEN 1000
-  bitstream * const stream = &encoder_state->stream;
-  int i, length;
-  char buf[STR_BUF_LEN] = { 0 };
-  char *s = buf + 16;
-  const config * const cfg = encoder_state->encoder_control->cfg;
-
-  // random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
-  static const uint8_t uuid[16] = {
-    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
-    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1
-  };
-  memcpy(buf, uuid, 16);
-
-  // user_data_payload_byte
-  s += sprintf(s, "Kvazaar HEVC Encoder v. " VERSION_STRING " - "
-                  "Copyleft 2012-2014 - http://ultravideo.cs.tut.fi/ - options:");
-  s += sprintf(s, " %dx%d", cfg->width, cfg->height);
-  s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
-               cfg->deblock_beta, cfg->deblock_tc);
-  s += sprintf(s, " sao=%d", cfg->sao_enable);
-  s += sprintf(s, " intra_period=%d", cfg->intra_period);
-  s += sprintf(s, " qp=%d", cfg->qp);
-  s += sprintf(s, " ref=%d", cfg->ref_frames);
-
-  length = (int)(s - buf + 1);  // length, +1 for \0
-
-  // Assert this so that in the future if the message gets longer, we remember
-  // to increase the buf len. Divide by 2 for margin.
-  assert(length < STR_BUF_LEN / 2);
-
-  // payloadType = 5 -> user_data_unregistered
-  WRITE_U(stream, 5, 8, "last_payload_type_byte");
-
-  // payloadSize
-  for (i = 0; i <= length - 255; i += 255)
-    WRITE_U(stream, 255, 8, "ff_byte");
-  WRITE_U(stream, length - i, 8, "last_payload_size_byte");
-
-  for (i = 0; i < length; i++)
-    WRITE_U(stream, ((uint8_t *)buf)[i], 8, "sei_payload");
-
-#undef STR_BUF_LEN
-}
-
-void encode_pic_parameter_set(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-#ifdef _DEBUG
-  printf("=========== Picture Parameter Set ID: 0 ===========\n");
-#endif
-  WRITE_UE(stream, 0, "pic_parameter_set_id");
-  WRITE_UE(stream, 0, "seq_parameter_set_id");
-  WRITE_U(stream, 0, 1, "dependent_slice_segments_enabled_flag");
-  WRITE_U(stream, 0, 1, "output_flag_present_flag");
-  WRITE_U(stream, 0, 3, "num_extra_slice_header_bits");
-  WRITE_U(stream, ENABLE_SIGN_HIDING, 1, "sign_data_hiding_flag");
-  WRITE_U(stream, 0, 1, "cabac_init_present_flag");
-
-  WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
-  WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
-  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
-  WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
-  WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
-  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
-  //if cu_qp_delta_enabled_flag
-  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
-
-  //TODO: add QP offsets
-  WRITE_SE(stream, 0, "pps_cb_qp_offset");
-  WRITE_SE(stream, 0, "pps_cr_qp_offset");
-  WRITE_U(stream, 0, 1, "pps_slice_chroma_qp_offsets_present_flag");
-  WRITE_U(stream, 0, 1, "weighted_pred_flag");
-  WRITE_U(stream, 0, 1, "weighted_bipred_idc");
-
-  //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
-  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
-  WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
-  //wavefronts
-  WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
-
-  if (encoder->tiles_enable) {
-    WRITE_UE(stream, encoder->tiles_num_tile_columns - 1, "num_tile_columns_minus1");
-    WRITE_UE(stream, encoder->tiles_num_tile_rows - 1, "num_tile_rows_minus1");
-    
-    WRITE_U(stream, encoder->tiles_uniform_spacing_flag, 1, "uniform_spacing_flag");
-    
-    if (!encoder->tiles_uniform_spacing_flag) {
-      int i;
-      for (i = 0; i < encoder->tiles_num_tile_columns - 1; ++i) {
-        WRITE_UE(stream, encoder->tiles_col_width[i] - 1, "column_width_minus1[...]");
-      }
-      for (i = 0; i < encoder->tiles_num_tile_rows - 1; ++i) {
-        WRITE_UE(stream, encoder->tiles_row_height[i] - 1, "row_height_minus1[...]");
-      }
-    }
-    WRITE_U(stream, 0, 1, "loop_filter_across_tiles_enabled_flag");
-    
-  }
-  
-  WRITE_U(stream, 0, 1, "loop_filter_across_slice_flag");
-  WRITE_U(stream, 1, 1, "deblocking_filter_control_present_flag");
-
-  //IF deblocking_filter
-    WRITE_U(stream, 0, 1, "deblocking_filter_override_enabled_flag");
-  WRITE_U(stream, encoder_state->encoder_control->deblock_enable ? 0 : 1, 1,
-          "pps_disable_deblocking_filter_flag");
-
-    //IF !disabled
-  if (encoder_state->encoder_control->deblock_enable) {
-     WRITE_SE(stream, encoder_state->encoder_control->beta_offset_div2, "beta_offset_div2");
-     WRITE_SE(stream, encoder_state->encoder_control->tc_offset_div2, "tc_offset_div2");
-    }
-
-    //ENDIF
-  //ENDIF
-  WRITE_U(stream, 0, 1, "pps_scaling_list_data_present_flag");
-  //IF scaling_list
-  //ENDIF
-  WRITE_U(stream, 0, 1, "lists_modification_present_flag");
-  WRITE_UE(stream, 0, "log2_parallel_merge_level_minus2");
-  WRITE_U(stream, 0, 1, "slice_segment_header_extension_present_flag");
-  WRITE_U(stream, 0, 1, "pps_extension_flag");
-}
-
-static void encode_PTL(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  int i;
-  // PTL
-  // Profile Tier
-  WRITE_U(stream, 0, 2, "general_profile_space");
-  WRITE_U(stream, 0, 1, "general_tier_flag");
-  // Main Profile == 1
-  WRITE_U(stream, 1, 5, "general_profile_idc");
-  /* Compatibility flags should be set at general_profile_idc
-   *  (so with general_profile_idc = 1, compatibility_flag[1] should be 1)
-   * According to specification, when compatibility_flag[1] is set,
-   *  compatibility_flag[2] should be set too.
-   */
-  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag[]");
-
-  WRITE_U(stream, 1, 1, "general_progressive_source_flag");
-  WRITE_U(stream, 0, 1, "general_interlaced_source_flag");
-  WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
-  WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
-
-  WRITE_U(stream, 0, 32, "XXX_reserved_zero_44bits[0..31]");
-  WRITE_U(stream, 0, 12, "XXX_reserved_zero_44bits[32..43]");
-
-  // end Profile Tier
-
-  // Level 6.2 (general_level_idc is 30 * 6.2)
-  WRITE_U(stream, 186, 8, "general_level_idc");
-
-  WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
-  WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
-
-  for (i = 1; i < 8; i++) {
-    WRITE_U(stream, 0, 2, "reserved_zero_2bits");
-  }
-
-  // end PTL
-}
-
-static void encode_scaling_list(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-  uint32_t size_id;
-  for (size_id = 0; size_id < SCALING_LIST_SIZE_NUM; size_id++) {
-    int32_t list_id;
-    for (list_id = 0; list_id < g_scaling_list_num[size_id]; list_id++) {
-      uint8_t scaling_list_pred_mode_flag = 1;
-      int32_t pred_list_idx;
-      int32_t i;
-      uint32_t ref_matrix_id = UINT32_MAX;
-
-      for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) {
-        const int32_t * const pred_list  = (list_id == pred_list_idx) ?
-                                     scalinglist_get_default(size_id, pred_list_idx) :
-                                     encoder->scaling_list.scaling_list_coeff[size_id][pred_list_idx];
-
-        if (!memcmp(encoder->scaling_list.scaling_list_coeff[size_id][list_id], pred_list, sizeof(int32_t) * MIN(8, g_scaling_list_size[size_id])) &&
-            ((size_id < SCALING_LIST_16x16) ||
-             (encoder->scaling_list.scaling_list_dc[size_id][list_id] == encoder->scaling_list.scaling_list_dc[size_id][pred_list_idx]))) {
-          ref_matrix_id = pred_list_idx;
-          scaling_list_pred_mode_flag = 0;
-          break;
-        }
-      }
-      WRITE_U(stream, scaling_list_pred_mode_flag, 1, "scaling_list_pred_mode_flag" );
-
-      if (!scaling_list_pred_mode_flag) {
-        WRITE_UE(stream, list_id - ref_matrix_id, "scaling_list_pred_matrix_id_delta");
-      } else {
-        int32_t delta;
-        const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, g_scaling_list_size[size_id]);
-        const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32;
-        int32_t next_coef = 8;
-        const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeff[size_id][list_id];
-
-        if (size_id >= SCALING_LIST_16x16) {
-          WRITE_SE(stream, encoder->scaling_list.scaling_list_dc[size_id][list_id] - 8, "scaling_list_dc_coef_minus8");
-          next_coef = encoder->scaling_list.scaling_list_dc[size_id][list_id];
-        }
-
-        for (i = 0; i < coef_num; i++) {
-          delta     = coef_list[scan_cg[i]] - next_coef;
-          next_coef = coef_list[scan_cg[i]];
-          if (delta > 127)
-            delta -= 256;
-          if (delta < -128)
-            delta += 256;
-
-          WRITE_SE(stream, delta, "scaling_list_delta_coef");
-        }
-      }
-    }
-  }
-}
-
-void encode_seq_parameter_set(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  //FIXME: use encoder_control instead of cur_pic
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-
-#ifdef _DEBUG
-  printf("=========== Sequence Parameter Set ID: 0 ===========\n");
-#endif
-
-  // TODO: profile IDC and level IDC should be defined later on
-  WRITE_U(stream, 0, 4, "sps_video_parameter_set_id");
-  WRITE_U(stream, 1, 3, "sps_max_sub_layers_minus1");
-  WRITE_U(stream, 0, 1, "sps_temporal_id_nesting_flag");
-
-  encode_PTL(encoder_state);
-
-  WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
-  WRITE_UE(stream, encoder_state->encoder_control->in.video_format,
-           "chroma_format_idc");
-
-  if (encoder_state->encoder_control->in.video_format == 3) {
-    WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
-  }
-
-  WRITE_UE(stream, cur_pic->width, "pic_width_in_luma_samples");
-  WRITE_UE(stream, cur_pic->height, "pic_height_in_luma_samples");
-
-  if (cur_pic->width != encoder_state->encoder_control->in.real_width || cur_pic->height != encoder_state->encoder_control->in.real_height) {
-    // The standard does not seem to allow setting conf_win values such that
-    // the number of luma samples is not a multiple of 2. Options are to either
-    // hide one line or show an extra line of non-video. Neither seems like a
-    // very good option, so let's not even try.
-    assert(!(cur_pic->width % 2));
-    WRITE_U(stream, 1, 1, "conformance_window_flag");
-    WRITE_UE(stream, 0, "conf_win_left_offset");
-    WRITE_UE(stream, (cur_pic->width - encoder_state->encoder_control->in.real_width) >> 1,
-             "conf_win_right_offset");
-    WRITE_UE(stream, 0, "conf_win_top_offset");
-    WRITE_UE(stream, (cur_pic->height - encoder_state->encoder_control->in.real_height) >> 1,
-             "conf_win_bottom_offset");
-  } else {
-    WRITE_U(stream, 0, 1, "conformance_window_flag");
-  }
-
-  //IF window flag
-  //END IF
-
-  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_luma_minus8");
-  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_chroma_minus8");
-  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
-  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
-
-  //for each layer
-  WRITE_UE(stream, 0, "sps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "sps_num_reorder_pics");
-  WRITE_UE(stream, 0, "sps_max_latency_increase");
-  //end for
-
-  WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3");
-  WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
-  WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
-  WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
-  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
-  WRITE_UE(stream, TR_DEPTH_INTRA, "max_transform_hierarchy_depth_intra");
-
-  // scaling list
-  WRITE_U(stream, encoder_state->encoder_control->scaling_list.enable, 1, "scaling_list_enable_flag");
-  if (encoder_state->encoder_control->scaling_list.enable) {
-    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
-    encode_scaling_list(encoder_state);
-  }
-
-  WRITE_U(stream, 0, 1, "amp_enabled_flag");
-  WRITE_U(stream, encoder_state->encoder_control->sao_enable ? 1 : 0, 1,
-          "sample_adaptive_offset_enabled_flag");
-  WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
-  #if ENABLE_PCM == 1
-    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_luma_minus1");
-    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_chroma_minus1");
-    WRITE_UE(stream, 0, "log2_min_pcm_coding_block_size_minus3");
-    WRITE_UE(stream, 2, "log2_diff_max_min_pcm_coding_block_size");
-    WRITE_U(stream, 1, 1, "pcm_loop_filter_disable_flag");
-  #endif
-
-  WRITE_UE(stream, 0, "num_short_term_ref_pic_sets");
-
-  //IF num short term ref pic sets
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "long_term_ref_pics_present_flag");
-
-  //IF long_term_ref_pics_present
-  //ENDIF
-
-  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
-          "sps_temporal_mvp_enable_flag");
-  WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
-  WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
-
-  encode_VUI(encoder_state);
-
-  WRITE_U(stream, 0, 1, "sps_extension_flag");
-}
-
-void encode_vid_parameter_set(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  int i;
-#ifdef _DEBUG
-  printf("=========== Video Parameter Set ID: 0 ===========\n");
-#endif
-
-  WRITE_U(stream, 0, 4, "vps_video_parameter_set_id");
-  WRITE_U(stream, 3, 2, "vps_reserved_three_2bits" );
-  WRITE_U(stream, 0, 6, "vps_reserved_zero_6bits" );
-  WRITE_U(stream, 1, 3, "vps_max_sub_layers_minus1");
-  WRITE_U(stream, 0, 1, "vps_temporal_id_nesting_flag");
-  WRITE_U(stream, 0xffff, 16, "vps_reserved_ffff_16bits");
-
-  encode_PTL(encoder_state);
-
-  WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
-
-  //for each layer
-  for (i = 0; i < 1; i++) {
-  WRITE_UE(stream, 1, "vps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "vps_num_reorder_pics");
-  WRITE_UE(stream, 0, "vps_max_latency_increase");
-  }
-
-  WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
-  WRITE_UE(stream, 0, "vps_max_op_sets_minus1");
-  WRITE_U(stream, 0, 1, "vps_timing_info_present_flag");
-
-  //IF timing info
-  //END IF
-
-  WRITE_U(stream, 0, 1, "vps_extension_flag");
-}
-
-static void encode_VUI(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  const encoder_control * const encoder = encoder_state->encoder_control;
-#ifdef _DEBUG
-  printf("=========== VUI Set ID: 0 ===========\n");
-#endif
-  if (encoder->vui.sar_width > 0 && encoder->vui.sar_height > 0) {
-    int i;
-    static const struct
-    {
-      uint8_t width;
-      uint8_t height;
-      uint8_t idc;
-    } sar[] = {
-      // aspect_ratio_idc = 0 -> unspecified
-      {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
-      { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
-      { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
-      {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
-      // aspect_ratio_idc = [17..254] -> reserved
-      { 0, 0, 255 }
-    };
-
-    for (i = 0; sar[i].idc != 255; i++)
-      if (sar[i].width  == encoder->vui.sar_width &&
-          sar[i].height == encoder->vui.sar_height)
-        break;
-
-    WRITE_U(stream, 1, 1, "aspect_ratio_info_present_flag");
-    WRITE_U(stream, sar[i].idc, 8, "aspect_ratio_idc");
-    if (sar[i].idc == 255) {
-      // EXTENDED_SAR
-      WRITE_U(stream, encoder->vui.sar_width, 16, "sar_width");
-      WRITE_U(stream, encoder->vui.sar_height, 16, "sar_height");
-    }
-  } else
-    WRITE_U(stream, 0, 1, "aspect_ratio_info_present_flag");
-
-  //IF aspect ratio info
-  //ENDIF
-
-  if (encoder->vui.overscan > 0) {
-    WRITE_U(stream, 1, 1, "overscan_info_present_flag");
-    WRITE_U(stream, encoder->vui.overscan - 1, 1, "overscan_appropriate_flag");
-  } else
-    WRITE_U(stream, 0, 1, "overscan_info_present_flag");
-
-  //IF overscan info
-  //ENDIF
-
-  if (encoder->vui.videoformat != 5 || encoder->vui.fullrange ||
-      encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
-      encoder->vui.colormatrix != 2) {
-    WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
-    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
-    WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
-
-    if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
-        encoder->vui.colormatrix != 2) {
-      WRITE_U(stream, 1, 1, "colour_description_present_flag");
-      WRITE_U(stream, encoder->vui.colorprim, 8, "colour_primaries");
-      WRITE_U(stream, encoder->vui.transfer, 8, "transfer_characteristics");
-      WRITE_U(stream, encoder->vui.colormatrix, 8, "matrix_coeffs");
-    } else
-      WRITE_U(stream, 0, 1, "colour_description_present_flag");
-  } else
-    WRITE_U(stream, 0, 1, "video_signal_type_present_flag");
-
-  //IF video type
-  //ENDIF
-
-  if (encoder->vui.chroma_loc > 0) {
-    WRITE_U(stream, 1, 1, "chroma_loc_info_present_flag");
-    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_top_field");
-    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_bottom_field");
-  } else
-    WRITE_U(stream, 0, 1, "chroma_loc_info_present_flag");
-
-  //IF chroma loc info
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "neutral_chroma_indication_flag");
-  WRITE_U(stream, 0, 1, "field_seq_flag");
-  WRITE_U(stream, 0, 1, "frame_field_info_present_flag");
-  WRITE_U(stream, 0, 1, "default_display_window_flag");
-
-  //IF default display window
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "vui_timing_info_present_flag");
-
-  //IF timing info
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "bitstream_restriction_flag");
-
-  //IF bitstream restriction
-  //ENDIF
-}
-
-void encoder_next_frame(encoder_state *encoder_state) {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  picture *old_pic;
-  
-  // Remove the ref pic (if present)
-  if (encoder_state->global->ref->used_size == (uint32_t)encoder->cfg->ref_frames) {
-    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size-1);
-  }
-  // Add current picture as reference
-  picture_list_add(encoder_state->global->ref, encoder_state->tile->cur_pic);
-  // Allocate new memory to current picture
-  old_pic = encoder_state->tile->cur_pic;
-  // TODO: reuse memory from old reference
-  encoder_state->tile->cur_pic = picture_alloc(encoder_state->tile->cur_pic->width, encoder_state->tile->cur_pic->height, encoder_state->tile->cur_pic->width_in_lcu, encoder_state->tile->cur_pic->height_in_lcu);
-
-  //FIXME: does the coeff_* really belongs to cur_pic?
-  // Copy pointer from the last cur_pic because we don't want to reallocate it
-  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_y,old_pic->coeff_y);
-  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_u,old_pic->coeff_u);
-  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_v,old_pic->coeff_v);
-  
-  picture_free(old_pic);
-
-  encoder_state->global->frame++;
-  encoder_state->global->poc++;
-}
-
-static void encoder_state_entry_points_explore(const encoder_state * const encoder_state, int * const r_count, int * const r_max_length) {
-  int i;
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    if (encoder_state->children[i].is_leaf) {
-      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
-      ++(*r_count);
-      if (my_length > *r_max_length) {
-        *r_max_length = my_length;
-      }
-    } else {
-      encoder_state_entry_points_explore(&encoder_state->children[i], r_count, r_max_length);
-    }
-  }
-}
-
-static void encoder_state_entry_points_write(bitstream * const stream, const encoder_state * const encoder_state, const int num_entry_points, const int write_length, int * const r_count) {
-  int i;
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    if (encoder_state->children[i].is_leaf) {
-      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
-      ++(*r_count);
-      //Don't write the last one
-      if (*r_count < num_entry_points) {
-        WRITE_U(stream, my_length - 1, write_length, "entry_point_offset-minus1")
-      }
-    } else {
-      encoder_state_entry_points_write(stream, &encoder_state->children[i], num_entry_points, write_length, r_count);
-    }
-  }
-}
-
-static int num_bitcount(unsigned int n) {
-  int pos = 0;
-  if (n >= 1<<16) { n >>= 16; pos += 16; }
-  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
-  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
-  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
-  if (n >= 1<< 1) {           pos +=  1; }
-  return ((n == 0) ? (-1) : pos);
-}
-
-void encode_slice_header(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-
-#ifdef _DEBUG
-  printf("=========== Slice ===========\n");
-#endif
-  WRITE_U(stream, (encoder_state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
-
-  if (encoder_state->global->pictype >= NAL_BLA_W_LP
-      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
-    WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
-  }
-
-  WRITE_UE(stream, 0, "slice_pic_parameter_set_id");
-  if (encoder_state->slice->start_in_rs > 0) {
-    //For now, we don't support dependent slice segments
-    //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
-    WRITE_UE(stream, encoder_state->slice->start_in_rs, "slice_segment_address");
-  }
-
-  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
-
-  // if !entropy_slice_flag
-
-    //if output_flag_present_flag
-      //WRITE_U(stream, 1, 1, "pic_output_flag");
-    //end if
-    //if( IdrPicFlag ) <- nal_unit_type == 5
-  if (encoder_state->global->pictype != NAL_IDR_W_RADL
-      && encoder_state->global->pictype != NAL_IDR_N_LP) {
-      int j;
-      int ref_negative = encoder_state->global->ref->used_size;
-      int ref_positive = 0;
-      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
-      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
-      WRITE_UE(stream, ref_negative, "num_negative_pics");
-      WRITE_UE(stream, ref_positive, "num_positive_pics");
-
-    for (j = 0; j < ref_negative; j++) {
-      int32_t delta_poc_minus1 = 0;
-      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
-      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
-    }
-
-    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
-  }
-
-    //end if
-  //end if
-  if (encoder->sao_enable) {
-    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
-    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
-  }
-
-  if (encoder_state->global->slicetype != SLICE_I) {
-      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
-        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
-      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
-  }
-
-  if (encoder_state->global->slicetype == SLICE_B) {
-      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
-  }
-
-  // Skip flags that are not present
-  // if !entropy_slice_flag
-    WRITE_SE(stream, 0, "slice_qp_delta");
-    //WRITE_U(stream, 1, 1, "alignment");
-   
-  if (encoder->tiles_enable || encoder->wpp) {
-    int num_entry_points = 0;
-    int max_length_seen = 0;
-    
-    encoder_state_entry_points_explore(encoder_state, &num_entry_points, &max_length_seen);
-    
-    WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets");
-    if (num_entry_points > 0) {
-      int entry_points_written = 0;
-      int offset_len = num_bitcount(max_length_seen) + 1;
-      WRITE_UE(stream, offset_len - 1, "offset_len_minus1");
-      encoder_state_entry_points_write(stream, encoder_state, num_entry_points, offset_len, &entry_points_written); 
-    }
-  }
-}
-
-
-static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
-                             color_index color_i)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  sao_eo_cat i;
-
-  // Skip colors with no SAO.
-  //FIXME: for now, we always have SAO for all channels
-  if (color_i == COLOR_Y && 0) return;
-  if (color_i != COLOR_Y && 0) return;
-
-  /// sao_type_idx_luma:   TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
-  /// sao_type_idx_chroma: TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
-  // Encode sao_type_idx for Y and U+V.
-  if (color_i != COLOR_V) {
-    cabac->ctx = &(cabac->ctx_sao_type_idx_model);;
-    CABAC_BIN(cabac, sao->type != SAO_TYPE_NONE, "sao_type_idx");
-    if (sao->type == SAO_TYPE_BAND) {
-      CABAC_BIN_EP(cabac, 0, "sao_type_idx_ep");
-    } else if (sao->type == SAO_TYPE_EDGE) {
-      CABAC_BIN_EP(cabac, 1, "sao_type_idx_ep");
-    }
-  }
-
-  if (sao->type == SAO_TYPE_NONE) return;
-
-  /// sao_offset_abs[][][][]: TR, cMax = (1 << (Min(bitDepth, 10) - 5)) - 1,
-  ///                         cRiceParam = 0, bins = {bypass x N}
-  for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
-    cabac_write_unary_max_symbol_ep(cabac, abs(sao->offsets[i]), SAO_ABS_OFFSET_MAX);
-  }
-
-  /// sao_offset_sign[][][][]: FL, cMax = 1, bins = {bypass}
-  /// sao_band_position[][][]: FL, cMax = 31, bins = {bypass x N}
-  /// sao_eo_class_luma:       FL, cMax = 3, bins = {bypass x 3}
-  /// sao_eo_class_chroma:     FL, cMax = 3, bins = {bypass x 3}
-  if (sao->type == SAO_TYPE_BAND) {
-    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
-      // Positive sign is coded as 0.
-      if(sao->offsets[i] != 0) {
-        CABAC_BIN_EP(cabac, sao->offsets[i] < 0 ? 1 : 0, "sao_offset_sign");
-      }
-    }
-    // TODO: sao_band_position
-    // FL cMax=31 (5 bits)
-    CABAC_BINS_EP(cabac, sao->band_position, 5, "sao_band_position");
-  } else if (color_i != COLOR_V) {
-    CABAC_BINS_EP(cabac, sao->eo_class, 2, "sao_eo_class");
-  }
-}
-
-static void encode_sao_merge_flags(encoder_state * const encoder_state, sao_info *sao, unsigned x_ctb, unsigned y_ctb)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  // SAO merge flags are not present for the first row and column.
-  if (x_ctb > 0) {
-    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
-    CABAC_BIN(cabac, sao->merge_left_flag, "sao_merge_left_flag");
-  }
-  if (y_ctb > 0 && !sao->merge_left_flag) {
-    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
-    CABAC_BIN(cabac, sao->merge_up_flag, "sao_merge_up_flag");
-  }
-}
-
-/**
- * \brief Encode SAO information.
- */
-static void encode_sao(encoder_state * const encoder_state,
-                       unsigned x_lcu, uint16_t y_lcu,
-                       sao_info *sao_luma, sao_info *sao_chroma)
-{
-  // TODO: transmit merge flags outside sao_info
-  encode_sao_merge_flags(encoder_state, sao_luma, x_lcu, y_lcu);
-
-  // If SAO is merged, nothing else needs to be coded.
-  if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) {
-    encode_sao_color(encoder_state, sao_luma, COLOR_Y);
-    encode_sao_color(encoder_state, sao_chroma, COLOR_U);
-    encode_sao_color(encoder_state, sao_chroma, COLOR_V);
-  }
-}
-
-
-void encode_coding_tree(encoder_state * const encoder_state,
-                        uint16_t x_ctb, uint16_t y_ctb, uint8_t depth)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-  cu_info *cur_cu = &cur_pic->cu_array[x_ctb + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)];
-  uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
-  uint8_t split_model = 0;
-  
-  //Absolute ctb
-  uint16_t abs_x_ctb = x_ctb + (encoder_state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
-  uint16_t abs_y_ctb = y_ctb + (encoder_state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
-
-  // Check for slice border FIXME
-  uint8_t border_x = ((encoder_state->encoder_control->in.width) < (abs_x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
-  uint8_t border_y = ((encoder_state->encoder_control->in.height) < (abs_y_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
-  uint8_t border_split_x = ((encoder_state->encoder_control->in.width)  < ((abs_x_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
-  uint8_t border_split_y = ((encoder_state->encoder_control->in.height) < ((abs_y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
-  uint8_t border = border_x | border_y; /*!< are we in any border CU */
-
-  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
-  if (depth != MAX_DEPTH) {
-    // Implisit split flag when on border
-    if (!border) {
-      // Get left and top block split_flags and if they are present and true, increase model number
-      if (x_ctb > 0 && GET_SPLITDATA(&(cur_pic->cu_array[x_ctb - 1 + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)]), depth) == 1) {
-        split_model++;
-      }
-
-      if (y_ctb > 0 && GET_SPLITDATA(&(cur_pic->cu_array[x_ctb + (y_ctb - 1) * (cur_pic->width_in_lcu << MAX_DEPTH)]), depth) == 1) {
-        split_model++;
-      }
-
-      cabac->ctx = &(cabac->ctx_split_flag_model[split_model]);
-      CABAC_BIN(cabac, split_flag, "SplitFlag");
-    }
-
-    if (split_flag || border) {
-      // Split blocks and remember to change x and y block positions
-      uint8_t change = 1<<(MAX_DEPTH-1-depth);
-      encode_coding_tree(encoder_state, x_ctb, y_ctb, depth + 1); // x,y
-
-      // TODO: fix when other half of the block would not be completely over the border
-      if (!border_x || border_split_x) {
-        encode_coding_tree(encoder_state, x_ctb + change, y_ctb, depth + 1);
-      }
-      if (!border_y || border_split_y) {
-        encode_coding_tree(encoder_state, x_ctb, y_ctb + change, depth + 1);
-      }
-      if (!border || (border_split_x && border_split_y)) {
-        encode_coding_tree(encoder_state, x_ctb + change, y_ctb + change, depth + 1);
-      }
-      return;
-    }
-  }
-
-
-
-    // Encode skip flag
-  if (encoder_state->global->slicetype != SLICE_I) {
-    int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
-    int ui;
-    int16_t num_cand = MRG_MAX_NUM_CANDS;
-    // Get left and top skipped flags and if they are present and true, increase context number
-    if (x_ctb > 0 && (&cur_pic->cu_array[x_ctb - 1 + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)])->skipped) {
-      ctx_skip++;
-    }
-
-    if (y_ctb > 0 && (&cur_pic->cu_array[x_ctb + (y_ctb - 1) * (cur_pic->width_in_lcu << MAX_DEPTH)])->skipped) {
-      ctx_skip++;
-    }
-
-    cabac->ctx = &(cabac->ctx_cu_skip_flag_model[ctx_skip]);
-    CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag");
-
-    // IF SKIP
-    if (cur_cu->skipped) {
-      if (num_cand > 1) {
-        for (ui = 0; ui < num_cand - 1; ui++) {
-          int32_t symbol = (ui != cur_cu->merge_idx);
-          if (ui == 0) {
-            cabac->ctx = &(cabac->ctx_cu_merge_idx_ext_model);
-            CABAC_BIN(cabac, symbol, "MergeIndex");
-          } else {
-            CABAC_BIN_EP(cabac,symbol,"MergeIndex");
-          }
-          if (symbol == 0) {
-            break;
-          }
-        }
-      }
-      return;
-    }
-  }
-
-  // ENDIF SKIP
-
-  // Prediction mode
-  if (encoder_state->global->slicetype != SLICE_I) {
-    cabac->ctx = &(cabac->ctx_cu_pred_mode_model);
-    CABAC_BIN(cabac, (cur_cu->type == CU_INTRA), "PredMode");
-  }
-
-  // part_mode
-  if (cur_cu->type == CU_INTRA) {
-    if (depth == MAX_DEPTH) {
-      cabac->ctx = &(cabac->ctx_part_size_model[0]);
-      if (cur_cu->part_size == SIZE_2Nx2N) {
-        CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
-      } else {
-        CABAC_BIN(cabac, 0, "part_mode NxN");
-      }
-    }
-  } else {
-    // TODO: Handle inter sizes other than 2Nx2N
-    cabac->ctx = &(cabac->ctx_part_size_model[0]);
-    CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
-  }
-
-  //end partsize
-  if (cur_cu->type == CU_INTER) {
-    // FOR each part
-    // Mergeflag
-    int16_t num_cand = 0;
-    cabac->ctx = &(cabac->ctx_cu_merge_flag_ext_model);
-    CABAC_BIN(cabac, cur_cu->merged, "MergeFlag");
-    num_cand = MRG_MAX_NUM_CANDS;
-    if (cur_cu->merged) { //merge
-      if (num_cand > 1) {
-        int32_t ui;
-        for (ui = 0; ui < num_cand - 1; ui++) {
-          int32_t symbol = (ui != cur_cu->merge_idx);
-          if (ui == 0) {
-            cabac->ctx = &(cabac->ctx_cu_merge_idx_ext_model);
-            CABAC_BIN(cabac, symbol, "MergeIndex");
-          } else {
-            CABAC_BIN_EP(cabac,symbol,"MergeIndex");
-          }
-          if (symbol == 0) break;
-        }
-      }
-    } else {
-      uint32_t ref_list_idx;
-      /*
-      // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
-      if(cur_pic->slicetype == SLICE_B)
-      {
-        // Code Inter Dir
-        const UInt uiInterDir = pcCU->getInterDir( uiAbsPartIdx ) - 1;
-        const UInt uiCtx      = pcCU->getCtxInterDir( uiAbsPartIdx );
-        ContextModel *pCtx    = m_cCUInterDirSCModel.get( 0 );
-        if (pcCU->getPartitionSize(uiAbsPartIdx) == SIZE_2Nx2N || pcCU->getHeight(uiAbsPartIdx) != 8 )
-        {
-          m_pcBinIf->encodeBin( uiInterDir == 2 ? 1 : 0, *( pCtx + uiCtx ) );
-        }
-        if (uiInterDir < 2)
-        {
-          m_pcBinIf->encodeBin( uiInterDir, *( pCtx + 4 ) );
-        }
-      }
-      */
-
-      for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
-            //if(encoder_state->ref_idx_num[uiRefListIdx] > 0)
-            {
-          if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) {
-            if (encoder_state->global->ref->used_size != 1) { //encoder_state->ref_idx_num[uiRefListIdx] != 1)//NumRefIdx != 1)
-              // parseRefFrmIdx
-              int32_t ref_frame = cur_cu->inter.mv_ref;
-
-              cabac->ctx = &(cabac->ctx_cu_ref_pic_model[0]);
-              CABAC_BIN(cabac, (ref_frame != 0), "ref_frame_flag");
-
-              if (ref_frame > 0) {
-                int32_t i;
-                int32_t ref_num = encoder_state->global->ref->used_size - 2;
-
-                cabac->ctx = &(cabac->ctx_cu_ref_pic_model[1]);
-                ref_frame--;
-
-                for (i = 0; i < ref_num; ++i) {
-                  const uint32_t symbol = (i == ref_frame) ? 0 : 1;
-
-                  if (i == 0) {
-                    CABAC_BIN(cabac, symbol, "ref_frame_flag2");
-                  } else {
-                    CABAC_BIN_EP(cabac, symbol, "ref_frame_flag2");
-                  }
-                  if (symbol == 0) break;
-                }
-              }
-            }
-
-            if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ encoder_state->global->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
-              const int32_t mvd_hor = cur_cu->inter.mvd[0];
-              const int32_t mvd_ver = cur_cu->inter.mvd[1];
-              const int8_t hor_abs_gr0 = mvd_hor != 0;
-              const int8_t ver_abs_gr0 = mvd_ver != 0;
-              const uint32_t mvd_hor_abs = abs(mvd_hor);
-              const uint32_t mvd_ver_abs = abs(mvd_ver);
-
-              cabac->ctx = &(cabac->ctx_cu_mvd_model[0]);
-              CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
-              CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");
-
-              cabac->ctx = &(cabac->ctx_cu_mvd_model[1]);
-
-              if (hor_abs_gr0) {
-                CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
-              }
-
-              if (ver_abs_gr0) {
-                CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
-              }
-
-              if (hor_abs_gr0) {
-                if (mvd_hor_abs > 1) {
-                  cabac_write_ep_ex_golomb(cabac,mvd_hor_abs-2, 1);
-                }
-
-                CABAC_BIN_EP(cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor");
-              }
-
-              if (ver_abs_gr0) {
-                if (mvd_ver_abs > 1) {
-                  cabac_write_ep_ex_golomb(cabac,mvd_ver_abs-2, 1);
-                }
-
-                CABAC_BIN_EP(cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver");
-              }
-            }
-
-            // Signal which candidate MV to use
-            cabac_write_unary_max_symbol(cabac, cabac->ctx_mvp_idx_model, cur_cu->inter.mv_cand, 1,
-                                        AMVP_MAX_NUM_CANDS - 1);
-          }
-          }
-        } // for ref_list
-    } // if !merge
-
-    {
-      int cbf = (cbf_is_set(cur_cu->cbf.y, depth) ||
-                 cbf_is_set(cur_cu->cbf.u, depth) ||
-                 cbf_is_set(cur_cu->cbf.v, depth));
-
-      // Only need to signal coded block flag if not skipped or merged
-      // skip = no coded residual, merge = coded residual
-      if (!cur_cu->merged) {
-        cabac->ctx = &(cabac->ctx_cu_qt_root_cbf_model);
-        CABAC_BIN(cabac, cbf, "rqt_root_cbf");
-      }
-      // Code (possible) coeffs to bitstream
-
-      if (cbf) {
-        encode_transform_coeff(encoder_state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
-      }
-    }
-
-    // END for each part
-  } else if (cur_cu->type == CU_INTRA) {
-    uint8_t intra_pred_mode[4] = {
-      cur_cu->intra[0].mode, cur_cu->intra[1].mode,
-      cur_cu->intra[2].mode, cur_cu->intra[3].mode };
-      uint8_t intra_pred_mode_chroma = cur_cu->intra[0].mode_chroma;
-    int8_t intra_preds[4][3] = {{-1, -1, -1},{-1, -1, -1},{-1, -1, -1},{-1, -1, -1}};
-    int8_t mpm_preds[4] = {-1, -1, -1, -1};
-    int i, j;
-    uint32_t flag[4];
-    int num_pred_units = (cur_cu->part_size == SIZE_2Nx2N ? 1 : 4);
-
-    #if ENABLE_PCM == 1
-    // Code must start after variable initialization
-    cabac_encode_bin_trm(cabac, 0); // IPCMFlag == 0
-    #endif
-
-    // PREDINFO CODING
-    // If intra prediction mode is found from the predictors,
-    // it can be signaled with two EP's. Otherwise we can send
-    // 5 EP bins with the full predmode
-    for (j = 0; j < num_pred_units; ++j) {
-      static const vector2d offset[4] = {{0,0},{1,0},{0,1},{1,1}};
-      cu_info *left_cu = 0;
-      cu_info *above_cu = 0;
-
-      if (x_ctb > 0) {
-        left_cu = &cur_pic->cu_array[x_ctb - 1 + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)];
-      }
-      // Don't take the above CU across the LCU boundary.
-      if (y_ctb > 0 && (y_ctb & 7) != 0) {
-        above_cu = &cur_pic->cu_array[x_ctb + (y_ctb - 1) * (cur_pic->width_in_lcu << MAX_DEPTH)];
-      }
-
-      intra_get_dir_luma_predictor((x_ctb<<3) + (offset[j].x<<2),
-                                   (y_ctb<<3) + (offset[j].y<<2),
-                                   intra_preds[j], cur_cu,
-                                   left_cu, above_cu);
-      for (i = 0; i < 3; i++) {
-        if (intra_preds[j][i] == intra_pred_mode[j]) {
-          mpm_preds[j] = (int8_t)i;
-          break;
-        }
-      }
-      flag[j] = (mpm_preds[j] == -1) ? 0 : 1;
-    }
-
-    cabac->ctx = &(cabac->ctx_intra_mode_model);
-    for (j = 0; j < num_pred_units; ++j) {
-      CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag");
-    }
-
-    for (j = 0; j < num_pred_units; ++j) {
-      // Signal index of the prediction mode in the prediction list.
-      if (flag[j]) {
-        CABAC_BIN_EP(cabac, (mpm_preds[j] == 0 ? 0 : 1), "mpm_idx");
-        if (mpm_preds[j] != 0) {
-          CABAC_BIN_EP(cabac, (mpm_preds[j] == 1 ? 0 : 1), "mpm_idx");
-        }
-      } else {
-        // Signal the actual prediction mode.
-        int32_t tmp_pred = intra_pred_mode[j];
-
-        // Sort prediction list from lowest to highest.
-        if (intra_preds[j][0] > intra_preds[j][1]) SWAP(intra_preds[j][0], intra_preds[j][1], int8_t);
-        if (intra_preds[j][0] > intra_preds[j][2]) SWAP(intra_preds[j][0], intra_preds[j][2], int8_t);
-        if (intra_preds[j][1] > intra_preds[j][2]) SWAP(intra_preds[j][1], intra_preds[j][2], int8_t);
-
-        // Reduce the index of the signaled prediction mode according to the
-        // prediction list, as it has been already signaled that it's not one
-        // of the prediction modes.
-        for (i = 2; i >= 0; i--) {
-          tmp_pred = (tmp_pred > intra_preds[j][i] ? tmp_pred - 1 : tmp_pred);
-        }
-
-        CABAC_BINS_EP(cabac, tmp_pred, 5, "rem_intra_luma_pred_mode");
-      }
-    }
-
-    {  // start intra chroma pred mode coding
-      unsigned pred_mode = 5;
-      unsigned chroma_pred_modes[4] = {0, 26, 10, 1};
-
-      if (intra_pred_mode_chroma == intra_pred_mode[0]) {
-        pred_mode = 4;
-      } else if (intra_pred_mode_chroma == 34) {
-        // Angular 34 mode is possible only if intra pred mode is one of the
-        // possible chroma pred modes, in which case it is signaled with that
-        // duplicate mode.
-        for (i = 0; i < 4; ++i) {
-          if (intra_pred_mode[0] == chroma_pred_modes[i]) pred_mode = i;
-        }
-      } else {
-        for (i = 0; i < 4; ++i) {
-          if (intra_pred_mode_chroma == chroma_pred_modes[i]) pred_mode = i;
-        }
-      }
-
-      // pred_mode == 5 mean intra_pred_mode_chroma is something that can't
-      // be coded.
-      assert(pred_mode != 5);
-
-      /**
-       * Table 9-35 - Binarization for intra_chroma_pred_mode
-       *   intra_chroma_pred_mode  bin_string
-       *                        4           0
-       *                        0         100
-       *                        1         101
-       *                        2         110
-       *                        3         111
-       * Table 9-37 - Assignment of ctxInc to syntax elements with context coded bins
-       *   intra_chroma_pred_mode[][] = 0, bypass, bypass
-       */
-      cabac->ctx = &(cabac->ctx_chroma_pred_model[0]);
-      if (pred_mode == 4) {
-        CABAC_BIN(cabac, 0, "intra_chroma_pred_mode");
-      } else {
-        CABAC_BIN(cabac, 1, "intra_chroma_pred_mode");
-        CABAC_BINS_EP(cabac, pred_mode, 2, "intra_chroma_pred_mode");
-      }
-    }  // end intra chroma pred mode coding
-
-    encode_transform_coeff(encoder_state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
-  }
-
-    #if ENABLE_PCM == 1
-  // Code IPCM block
-  if (cur_cu->type == CU_PCM) {
-    cabac_encode_bin_trm(cabac, 1); // IPCMFlag == 1
-      cabac_finish(cabac);
-      bitstream_align(cabac.stream);
-    // PCM sample
-      {
-      unsigned y, x;
-
-      pixel *base_y = &cur_pic->y_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))    + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH))) * encoder->in.width];
-      pixel *base_u = &cur_pic->u_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
-      pixel *base_v = &cur_pic->v_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
-
-      // Luma
-      for (y = 0; y < LCU_WIDTH >> depth; y++) {
-        for (x = 0; x < LCU_WIDTH >> depth; x++) {
-          bitstream_put(cabac.stream, base_y[x + y * encoder->in.width], 8);
-          }
-        }
-
-      // Chroma
-      if (encoder->in.video_format != FORMAT_400) {
-        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
-          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
-            bitstream_put(cabac.stream, base_u[x + y * (encoder->in.width >> 1)], 8);
-          }
-        }
-        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
-          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
-            bitstream_put(cabac.stream, base_v[x + y * (encoder->in.width >> 1)], 8);
-          }
-        }
-      }
-    }
-    // end PCM sample
-      cabac_start(cabac);
-  } // end Code IPCM block
-#endif /* END ENABLE_PCM */
-  else { /* Should not happend */
-    printf("UNHANDLED TYPE!\r\n");
-    assert(0);
-    exit(1);
-  }
-
-   /* end prediction unit */
-  /* end coding_unit */
-}
-
-
-coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth)
-{
-  // Scan mode is diagonal, except for 4x4+8x8 luma and 4x4 chroma, where:
-  // - angular 6-14 = vertical
-  // - angular 22-30 = horizontal
-  if (cu_type == CU_INTRA && depth >= 3) {
-    if (intra_mode >= 6 && intra_mode <= 14) {
-      return SCAN_VER;
-    } else if (intra_mode >= 22 && intra_mode <= 30) {
-      return SCAN_HOR;
-    }
-  }
-
-  return SCAN_DIAG;
-}
-
-
-static void encode_transform_unit(encoder_state * const encoder_state,
-                                  int x_pu, int y_pu, int depth)
-{
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-  uint8_t width = LCU_WIDTH >> depth;
-  uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
-
-  int x_cu = x_pu / 2;
-  int y_cu = y_pu / 2;
-  cu_info *cur_cu = &cur_pic->cu_array[x_cu + y_cu * (cur_pic->width_in_lcu << MAX_DEPTH)];
-
-  coefficient coeff_y[LCU_WIDTH*LCU_WIDTH+1];
-  coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
-  coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
-  int32_t coeff_stride = cur_pic->width;
-
-  int8_t scan_idx = get_scan_order(cur_cu->type, cur_cu->intra[PU_INDEX(x_pu, y_pu)].mode, depth);
-
-  int cbf_y = cbf_is_set(cur_cu->cbf.y, depth + PU_INDEX(x_pu, y_pu));
-
-  if (cbf_y) {
-    int x = x_pu * (LCU_WIDTH >> MAX_PU_DEPTH);
-    int y = y_pu * (LCU_WIDTH >> MAX_PU_DEPTH);
-    coefficient *orig_pos = &cur_pic->coeff_y[x + y * cur_pic->width];
-    for (y = 0; y < width; y++) {
-      for (x = 0; x < width; x++) {
-        coeff_y[x+y*width] = orig_pos[x];
-      }
-      orig_pos += coeff_stride;
-    }
-  }
-
-  // CoeffNxN
-  // Residual Coding
-  if (cbf_y) {
-    encode_coeff_nxn(encoder_state, coeff_y, width, 0, scan_idx, cur_cu->intra[PU_INDEX(x_pu, y_pu)].tr_skip);
-  }
-
-  if (depth == MAX_DEPTH + 1 && !(x_pu % 2 && y_pu % 2)) {
-    // For size 4x4 luma transform the corresponding chroma transforms are
-    // also of size 4x4 covering 8x8 luma pixels. The residual is coded
-    // in the last transform unit so for the other ones, don't do anything.
-    return;
-  }
-
-  if (cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth)) {
-    int x, y;
-    coefficient *orig_pos_u, *orig_pos_v;
-
-    if (depth <= MAX_DEPTH) {
-      x = x_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
-      y = y_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
-    } else {
-      // for 4x4 select top left pixel of the CU.
-      x = x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
-      y = y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
-    }
-    orig_pos_u = &cur_pic->coeff_u[x + y * (cur_pic->width >> 1)];
-    orig_pos_v = &cur_pic->coeff_v[x + y * (cur_pic->width >> 1)];
-    for (y = 0; y < (width_c); y++) {
-      for (x = 0; x < (width_c); x++) {
-        coeff_u[x+y*(width_c)] = orig_pos_u[x];
-        coeff_v[x+y*(width_c)] = orig_pos_v[x];
-      }
-      orig_pos_u += coeff_stride>>1;
-      orig_pos_v += coeff_stride>>1;
-    }
-
-    scan_idx = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth);
-
-    if (cbf_is_set(cur_cu->cbf.u, depth)) {
-      encode_coeff_nxn(encoder_state, coeff_u, width_c, 2, scan_idx, 0);
-    }
-
-    if (cbf_is_set(cur_cu->cbf.v, depth)) {
-      encode_coeff_nxn(encoder_state, coeff_v, width_c, 2, scan_idx, 0);
-    }
-  }
-}
-
-/**
- * \param encoder
- * \param x_pu            Prediction units' x coordinate.
- * \param y_pu            Prediction units' y coordinate.
- * \param depth           Depth from LCU.
- * \param tr_depth        Depth from last CU.
- * \param parent_coeff_u  What was signaled at previous level for cbf_cb.
- * \param parent_coeff_v  What was signlaed at previous level for cbf_cr.
- */
-void encode_transform_coeff(encoder_state * const encoder_state, int32_t x_pu,int32_t y_pu,
-                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  int32_t x_cu = x_pu / 2;
-  int32_t y_cu = y_pu / 2;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-  cu_info *cur_cu = &cur_pic->cu_array[x_cu + y_cu * (cur_pic->width_in_lcu << MAX_DEPTH)];
-
-  // NxN signifies implicit transform split at the first transform level.
-  // There is a similar implicit split for inter, but it is only used when
-  // transform hierarchy is not in use.
-  int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN);
-
-  // The implicit split by intra NxN is not counted towards max_tr_depth.
-  int max_tr_depth = (cur_cu->type == CU_INTRA ? TR_DEPTH_INTRA + intra_split_flag : TR_DEPTH_INTER);
-
-  int8_t split = (cur_cu->tr_depth > depth);
-
-  const int cb_flag_y = cbf_is_set(cur_cu->cbf.y, depth + PU_INDEX(x_pu, y_pu));
-  const int cb_flag_u = cbf_is_set(cur_cu->cbf.u, depth);
-  const int cb_flag_v = cbf_is_set(cur_cu->cbf.v, depth);
-
-  // The split_transform_flag is not signaled when:
-  // - transform size is greater than 32 (depth == 0)
-  // - transform size is 4 (depth == MAX_PU_DEPTH)
-  // - transform depth is max
-  // - cu is intra NxN and it's the first split
-  if (depth > 0 &&
-      depth < MAX_PU_DEPTH &&
-      tr_depth < max_tr_depth &&
-      !(intra_split_flag && tr_depth == 0))
-  {
-    cabac->ctx = &(cabac->ctx_trans_subdiv_model[5 - ((g_convert_to_bit[LCU_WIDTH] + 2) - depth)]);
-    CABAC_BIN(cabac, split, "split_transform_flag");
-  }
-
-  // Chroma cb flags are not signaled when one of the following:
-  // - transform size is 4 (2x2 chroma transform doesn't exist)
-  // - they have already been signaled to 0 previously
-  // When they are not present they are inferred to be 0, except for size 4
-  // when the flags from previous level are used.
-  if (depth < MAX_PU_DEPTH) {
-    cabac->ctx = &(cabac->ctx_qt_cbf_model_chroma[tr_depth]);
-    if (tr_depth == 0 || parent_coeff_u) {
-      CABAC_BIN(cabac, cb_flag_u, "cbf_cb");
-    }
-    if (tr_depth == 0 || parent_coeff_v) {
-      CABAC_BIN(cabac, cb_flag_v, "cbf_cr");
-    }
-  }
-
-  if (split) {
-    uint8_t pu_offset = 1 << (MAX_PU_DEPTH - (depth + 1));
-    encode_transform_coeff(encoder_state, x_pu, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    encode_transform_coeff(encoder_state, x_pu + pu_offset, y_pu,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    encode_transform_coeff(encoder_state, x_pu, y_pu + pu_offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    encode_transform_coeff(encoder_state, x_pu + pu_offset, y_pu + pu_offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    return;
-  }
-
-  // Luma coded block flag is signaled when one of the following:
-  // - prediction mode is intra
-  // - transform depth > 0
-  // - we have chroma coefficients at this level
-  // When it is not present, it is inferred to be 1.
-  if(cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) {
-      cabac->ctx = &(cabac->ctx_qt_cbf_model_luma[!tr_depth]);
-      CABAC_BIN(cabac, cb_flag_y, "cbf_luma");
-  }
-
-  if (cb_flag_y | cb_flag_u | cb_flag_v) {
-    encode_transform_unit(encoder_state, x_pu, y_pu, depth);
-  }
-}
-
-void encode_coeff_nxn(encoder_state * const encoder_state, coefficient *coeff, uint8_t width,
-                      uint8_t type, int8_t scan_mode, int8_t tr_skip)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  cabac_data * const cabac = &encoder_state->cabac;
-  int c1 = 1;
-  uint8_t last_coeff_x = 0;
-  uint8_t last_coeff_y = 0;
-  int32_t i;
-  uint32_t sig_coeffgroup_flag[64];
-
-  uint32_t num_nonzero = 0;
-  int32_t scan_pos_last = -1;
-  int32_t pos_last = 0;
-  int32_t shift   = 4>>1;
-  int8_t be_valid = ENABLE_SIGN_HIDING;
-  int32_t scan_pos_sig;
-  int32_t last_scan_set;
-  uint32_t go_rice_param = 0;
-  uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
-
-  // CONSTANTS
-  const uint32_t num_blk_side    = width >> shift;
-  const uint32_t log2_block_size = g_convert_to_bit[width] + 2;
-  const uint32_t *scan           =
-    g_sig_last_scan[scan_mode][log2_block_size - 1];
-  const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
-
-  // Init base contexts according to block type
-  cabac_ctx *base_coeff_group_ctx = &(cabac->ctx_cu_sig_coeff_group_model[type]);
-  cabac_ctx *baseCtx           = (type == 0) ? &(cabac->ctx_cu_sig_model_luma[0]) :
-                                 &(cabac->ctx_cu_sig_model_chroma[0]);
-  memset(sig_coeffgroup_flag,0,sizeof(uint32_t)*64);
-
-  // Count non-zero coeffs
-  for (i = 0; i < width * width; i++) {
-    if (coeff[i] != 0) {
-      num_nonzero++;
-    }
-  }
-
-  // Transforms with no non-zero coefficients are indicated with CBFs.
-  assert(num_nonzero != 0);
-
-  // transform skip flag
-  if(width == 4 && encoder->trskip_enable) {
-    cabac->ctx = (type == 0) ? &(cabac->ctx_transform_skip_model_luma) : &(cabac->ctx_transform_skip_model_chroma);
-    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
-  }
-
-  scan_pos_last = -1;
-
-  // Significance mapping
-  while (num_nonzero > 0) {
-    pos_last = scan[++scan_pos_last];
-#define POSY (pos_last >> log2_block_size)
-#define POSX (pos_last - ( POSY << log2_block_size ))
-
-    if (coeff[pos_last] != 0) {
-      sig_coeffgroup_flag[(num_blk_side * (POSY >> shift) + (POSX >> shift))] = 1;
-    }
-
-    num_nonzero -= (coeff[pos_last] != 0) ? 1 : 0;
-    #undef POSY
-    #undef POSX
-  }
-
-  last_coeff_x = pos_last & (width - 1);
-  last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
-
-  // Code last_coeff_x and last_coeff_y
-  encode_last_significant_xy(encoder_state, last_coeff_x, last_coeff_y, width, width,
-                             type, scan_mode);
-
-  scan_pos_sig  = scan_pos_last;
-  last_scan_set = (scan_pos_last >> 4);
-
-  // significant_coeff_flag
-  for (i = last_scan_set; i >= 0; i--) {
-    int32_t sub_pos        = i << 4; // LOG2_SCAN_SET_SIZE;
-    int32_t abs_coeff[16];
-    int32_t cg_blk_pos     = scan_cg[i];
-    int32_t cg_pos_y       = cg_blk_pos / num_blk_side;
-    int32_t cg_pos_x       = cg_blk_pos - (cg_pos_y * num_blk_side);
-
-    uint32_t coeff_signs   = 0;
-    int32_t last_nz_pos_in_cg = -1;
-    int32_t first_nz_pos_in_cg = 16;
-    int32_t num_non_zero = 0;
-    go_rice_param = 0;
-
-    if (scan_pos_sig == scan_pos_last) {
-      abs_coeff[0] = abs(coeff[pos_last]);
-      coeff_signs  = (coeff[pos_last] < 0);
-      num_non_zero = 1;
-      last_nz_pos_in_cg  = scan_pos_sig;
-      first_nz_pos_in_cg = scan_pos_sig;
-      scan_pos_sig--;
-    }
-
-    if (i == last_scan_set || i == 0) {
-      sig_coeffgroup_flag[cg_blk_pos] = 1;
-    } else {
-      uint32_t sig_coeff_group   = (sig_coeffgroup_flag[cg_blk_pos] != 0);
-      uint32_t ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
-                                                      cg_pos_y, width);
-      cabac->ctx = &base_coeff_group_ctx[ctx_sig];
-      CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag");
-    }
-
-    if (sig_coeffgroup_flag[cg_blk_pos]) {
-      int32_t pattern_sig_ctx = context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
-                                                             cg_pos_x, cg_pos_y, width);
-
-      for (; scan_pos_sig >= sub_pos; scan_pos_sig--) {
-        blk_pos = scan[scan_pos_sig];
-        pos_y   = blk_pos >> log2_block_size;
-        pos_x   = blk_pos - (pos_y << log2_block_size);
-        sig    = (coeff[blk_pos] != 0) ? 1 : 0;
-
-        if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) {
-          ctx_sig  = context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
-                                             log2_block_size, type);
-          cabac->ctx = &baseCtx[ctx_sig];
-          CABAC_BIN(cabac, sig, "sig_coeff_flag");
-        }
-
-        if (sig) {
-          abs_coeff[num_non_zero] = abs(coeff[blk_pos]);
-          coeff_signs              = 2 * coeff_signs + (coeff[blk_pos] < 0);
-          num_non_zero++;
-
-          if (last_nz_pos_in_cg == -1) {
-            last_nz_pos_in_cg = scan_pos_sig;
-          }
-
-          first_nz_pos_in_cg  = scan_pos_sig;
-        }
-      }
-    } else {
-      scan_pos_sig = sub_pos - 1;
-    }
-
-    if (num_non_zero > 0) {
-      int8_t sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >=
-                            4 /*SBH_THRESHOLD*/) ? 1 : 0;
-      uint32_t ctx_set  = (i > 0 && type == 0) ? 2 : 0;
-      cabac_ctx *base_ctx_mod;
-      int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2;
-
-      if (c1 == 0) {
-        ctx_set++;
-      }
-
-      c1 = 1;
-
-      base_ctx_mod     = (type == 0) ? &(cabac->ctx_cu_one_model_luma[4 * ctx_set]) :
-                         &(cabac->ctx_cu_one_model_chroma[4 * ctx_set]);
-      num_c1_flag      = MIN(num_non_zero, C1FLAG_NUMBER);
-      first_c2_flag_idx = -1;
-
-      for (idx = 0; idx < num_c1_flag; idx++) {
-        uint32_t symbol = (abs_coeff[idx] > 1) ? 1 : 0;
-        cabac->ctx = &base_ctx_mod[c1];
-        CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag");
-
-        if (symbol) {
-          c1 = 0;
-
-          if (first_c2_flag_idx == -1) {
-            first_c2_flag_idx = idx;
-          }
-        } else if ((c1 < 3) && (c1 > 0)) {
-          c1++;
-        }
-      }
-
-      if (c1 == 0) {
-        base_ctx_mod = (type == 0) ? &(cabac->ctx_cu_abs_model_luma[ctx_set]) :
-                       &(cabac->ctx_cu_abs_model_chroma[ctx_set]);
-
-        if (first_c2_flag_idx != -1) {
-          uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0;
-          cabac->ctx      = &base_ctx_mod[0];
-          CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag");
-        }
-      }
-
-      if (be_valid && sign_hidden) {
-        CABAC_BINS_EP(cabac, (coeff_signs >> 1), (num_non_zero - 1), "coeff_sign_flag");
-      } else {
-        CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag");
-      }
-
-      if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) {
-        first_coeff2 = 1;
-
-        for (idx = 0; idx < num_non_zero; idx++) {
-          int32_t base_level  = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1;
-
-          if (abs_coeff[idx] >= base_level) {
-            cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
-
-            if (abs_coeff[idx] > 3 * (1 << go_rice_param)) {
-              go_rice_param = MIN(go_rice_param + 1, 4);
-            }
-          }
-
-          if (abs_coeff[idx] >= 2) {
-            first_coeff2 = 0;
-          }
-        }
-      }
-    }
-  }
-}
-
-/*!
- \brief Encode (X,Y) position of the last significant coefficient
- \param lastpos_x X component of last coefficient
- \param lastpos_y Y component of last coefficient
- \param width  Block width
- \param height Block height
- \param type plane type / luminance or chrominance
- \param scan scan type (diag, hor, ver)
-
- This method encodes the X and Y component within a block of the last significant coefficient.
-*/
-void encode_last_significant_xy(encoder_state * const encoder_state,
-                                uint8_t lastpos_x, uint8_t lastpos_y,
-                                uint8_t width, uint8_t height,
-                                uint8_t type, uint8_t scan)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  uint8_t offset_x  = type?0:((TOBITS(width)*3) + ((TOBITS(width)+1)>>2)),offset_y = offset_x;
-  uint8_t shift_x   = type?(TOBITS(width)):((TOBITS(width)+3)>>2), shift_y = shift_x;
-  int group_idx_x;
-  int group_idx_y;
-  int last_x,last_y,i;
-  cabac_ctx *base_ctx_x = (type ? cabac->ctx_cu_ctx_last_x_chroma : cabac->ctx_cu_ctx_last_x_luma);
-  cabac_ctx *base_ctx_y = (type ? cabac->ctx_cu_ctx_last_y_chroma : cabac->ctx_cu_ctx_last_y_luma);
-
-  if (scan == SCAN_VER) {
-    SWAP( lastpos_x, lastpos_y,uint8_t );
-  }
-
-  group_idx_x   = g_group_idx[lastpos_x];
-  group_idx_y   = g_group_idx[lastpos_y];
-
-  // Last X binarization
-  for (last_x = 0; last_x < group_idx_x ; last_x++) {
-    cabac->ctx = &base_ctx_x[offset_x + (last_x >> shift_x)];
-    CABAC_BIN(cabac,1,"last_sig_coeff_x_prefix");
-  }
-
-  if (group_idx_x < g_group_idx[width - 1]) {
-    cabac->ctx = &base_ctx_x[offset_x + (last_x >> shift_x)];
-    CABAC_BIN(cabac,0,"last_sig_coeff_x_prefix");
-  }
-
-  // Last Y binarization
-  for (last_y = 0; last_y < group_idx_y ; last_y++) {
-    cabac->ctx = &base_ctx_y[offset_y + (last_y >> shift_y)];
-    CABAC_BIN(cabac,1,"last_sig_coeff_y_prefix");
-  }
-
-  if (group_idx_y < g_group_idx[height - 1]) {
-    cabac->ctx = &base_ctx_y[offset_y + (last_y >> shift_y)];
-    CABAC_BIN(cabac,0,"last_sig_coeff_y_prefix");
-  }
-
-  // Last X
-  if (group_idx_x > 3) {
-    lastpos_x -= g_min_in_group[group_idx_x];
-
-    for (i = ((group_idx_x - 2) >> 1) - 1; i >= 0; i--) {
-      CABAC_BIN_EP(cabac,(lastpos_x>>i) & 1,"last_sig_coeff_x_suffix");
-    }
-  }
-
-  // Last Y
-  if (group_idx_y > 3) {
-    lastpos_y -= g_min_in_group[group_idx_y];
-
-    for (i = ((group_idx_y - 2) >> 1) - 1; i >= 0; i--) {
-      CABAC_BIN_EP(cabac,(lastpos_y>>i) & 1,"last_sig_coeff_y_suffix");
-    }
-  }
-
-  // end LastSignificantXY
-}
diff --git a/src/encoder.h b/src/encoder.h
index a74e20b2..e8e3b761 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -132,175 +132,8 @@ typedef struct
   
 } encoder_control;
 
-typedef enum {
-  ENCODER_STATE_TYPE_INVALID = 'i',
-  ENCODER_STATE_TYPE_MAIN = 'M',
-  ENCODER_STATE_TYPE_SLICE = 'S',
-  ENCODER_STATE_TYPE_TILE = 'T',
-  ENCODER_STATE_TYPE_WAVEFRONT_ROW = 'W',
-} encoder_state_type;
-
-
-
-typedef struct {
-  double cur_lambda_cost;
-  
-  int32_t frame;
-  int32_t poc; /*!< \brief picture order count */
-  
-  int8_t QP;   //!< \brief Quantization parameter
-  
-  //Current picture available references
-  picture_list *ref;
-  int8_t ref_list;
-  //int8_t ref_idx_num[2];
-  
-  int is_radl_frame;
-  uint8_t pictype;
-  uint8_t slicetype;
-  
-} encoder_state_config_global;
-
-typedef struct {
-  //Current picture to encode
-  picture *cur_pic;
-  
-  int32_t id;
-  
-  //Tile: offset in LCU for current encoder_state in global coordinates
-  int32_t lcu_offset_x;
-  int32_t lcu_offset_y;
-  
-  //Position of the first element in tile scan in global coordinates
-  int32_t lcu_offset_in_ts;
-  
-  //Buffer for search
-  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
-  yuv_t *hor_buf_search;
-  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
-  yuv_t *ver_buf_search;
-  
-  yuv_t *hor_buf_before_sao;
-  yuv_t *ver_buf_before_sao;
-  
-  //Job pointers for wavefronts
-  threadqueue_job **wf_jobs;
-} encoder_state_config_tile;
-
-typedef struct {
-  int32_t id;
-  
-  //Global coordinates
-  int32_t start_in_ts;
-  int32_t end_in_ts;
-  
-  //Global coordinates
-  int32_t start_in_rs;
-  int32_t end_in_rs;
-} encoder_state_config_slice;
-
-typedef struct {
-  //Row in tile coordinates of the wavefront
-  int32_t lcu_offset_y;
-} encoder_state_config_wfrow;
-
-typedef struct lcu_order_element {
-  //This it used for leaf of the encoding tree. All is relative to the tile.
-  int id;
-  int index;
-  struct encoder_state *encoder_state;
-  vector2d position;
-  vector2d position_px; //Top-left
-  vector2d size;
-  int first_column;
-  int first_row;
-  int last_column;
-  int last_row;
-  
-  struct lcu_order_element *above;
-  struct lcu_order_element *below;
-  struct lcu_order_element *left;
-  struct lcu_order_element *right;
-} lcu_order_element;
-
-typedef struct encoder_state {
-  const encoder_control *encoder_control;
-  encoder_state_type type;
-
-  //List of children, the last item of this list is a pseudo-encoder with encoder_control = NULL
-  //Use for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-  struct encoder_state *children;
-  struct encoder_state *parent;
-  
-  encoder_state_config_global *global;
-  encoder_state_config_tile   *tile;
-  encoder_state_config_slice  *slice;
-  encoder_state_config_wfrow  *wfrow;
-  
-  int is_leaf; //A leaf encoder state is one which should encode LCUs...
-  lcu_order_element *lcu_order;
-  uint32_t lcu_order_count;
-  
-  bitstream stream;
-  cabac_data cabac;
-} encoder_state;
-
 int encoder_control_init(encoder_control *encoder, const config *cfg);
 int encoder_control_finalize(encoder_control *encoder);
 
 void encoder_control_input_init(encoder_control *encoder, int32_t width, int32_t height);
-
-int encoder_state_init(encoder_state * child_state, encoder_state * parent_state);
-void encoder_state_finalize(encoder_state *encoder_state);
-void encoder_state_init_lambda(encoder_state *encoder_state);
-
-void encode_one_frame(encoder_state *encoder_state);
-int read_one_frame(FILE* file, const encoder_state *encoder);
-
-void encoder_next_frame(encoder_state *encoder_state);
-
-void encode_seq_parameter_set(encoder_state *encoder);
-void encode_pic_parameter_set(encoder_state *encoder);
-void encode_vid_parameter_set(encoder_state *encoder);
-void encode_slice_header(encoder_state * encoder);
-void encode_access_unit_delimiter(encoder_state *encoder);
-void encode_prefix_sei_version(encoder_state *encoder);
-void encode_coding_tree(encoder_state *encoder, uint16_t x_ctb,
-                        uint16_t y_ctb, uint8_t depth);
-
-void encode_last_significant_xy(encoder_state *encoder,
-                                uint8_t lastpos_x, uint8_t lastpos_y,
-                                uint8_t width, uint8_t height,
-                                uint8_t type, uint8_t scan);
-void encode_coeff_nxn(encoder_state *encoder, int16_t *coeff, uint8_t width,
-                      uint8_t type, int8_t scan_mode, int8_t tr_skip);
-void encode_transform_coeff(encoder_state *encoder_state, int32_t x_cu, int32_t y_cu,
-                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
-void encode_block_residual(const encoder_control * const encoder,
-                           uint16_t x_ctb, uint16_t y_ctb, uint8_t depth);
-
-coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth);
-
-static const uint8_t g_group_idx[32] = {
-  0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
-  6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
-  8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
-  9, 9 };
-
-static const uint8_t g_min_in_group[10] = {
-  0, 1, 2, 3, 4, 6, 8, 12, 16, 24 };
-
-
-#define C1FLAG_NUMBER 8 // maximum number of largerThan1 flag coded in one chunk
-#define C2FLAG_NUMBER 1 // maximum number of largerThan2 flag coded in one chunk
-
-//Get the data for vertical buffer position at the left of LCU identified by the position in pixel
-#define OFFSET_VER_BUF(position_x, position_y, cur_pic, i) ((position_y) + i + ((position_x)/LCU_WIDTH - 1) * (cur_pic)->height)
-#define OFFSET_VER_BUF_C(position_x, position_y, cur_pic, i) ((position_y/2) + i + ((position_x)/LCU_WIDTH - 1) * (cur_pic)->height / 2)
-
-//Get the data for horizontal buffer position at the top of LCU identified by the position in pixel
-#define OFFSET_HOR_BUF(position_x, position_y, cur_pic, i) ((position_x) + i + ((position_y)/LCU_WIDTH - 1) * (cur_pic)->width)
-#define OFFSET_HOR_BUF_C(position_x, position_y, cur_pic, i) ((position_x/2) + i + ((position_y)/LCU_WIDTH - 1) * (cur_pic)->width / 2)
-  
-
 #endif
diff --git a/src/encoderstate.c b/src/encoderstate.c
new file mode 100644
index 00000000..54a26979
--- /dev/null
+++ b/src/encoderstate.c
@@ -0,0 +1,3236 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ */
+
+#include "encoderstate.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "tables.h"
+#include "config.h"
+#include "cabac.h"
+#include "picture.h"
+#include "nal.h"
+#include "context.h"
+#include "transform.h"
+#include "intra.h"
+#include "inter.h"
+#include "filter.h"
+#include "search.h"
+#include "sao.h"
+#include "rdo.h"
+
+/* Local functions. */
+static void add_checksum(encoder_state *encoder);
+static void encode_VUI(encoder_state *encoder);
+static void encode_sao(encoder_state *encoder,
+                       unsigned x_lcu, uint16_t y_lcu,
+                       sao_info *sao_luma, sao_info *sao_chroma);
+
+static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state);
+static void worker_encoder_state_write_bitstream_leaf(void * opaque);
+
+/*!
+  \brief Initializes lambda-value for current QP
+
+  Implementation closer to HM (Used HM12 as reference)
+   - Still missing functionality when GOP and B-pictures are used
+ */
+void encoder_state_init_lambda(encoder_state * const encoder_state)
+{
+  double qp = encoder_state->global->QP;
+  double lambda_scale = 1.0;
+  double qp_temp      = qp - 12;
+  double lambda;
+
+  // Default QP-factor from HM config
+  double qp_factor = 0.4624;
+
+  if (encoder_state->global->slicetype == SLICE_I) {
+    qp_factor=0.57*lambda_scale;
+  }
+
+  lambda = qp_factor*pow( 2.0, qp_temp/3.0 );
+
+  if (encoder_state->global->slicetype != SLICE_I ) {
+    lambda *= 0.95;
+  }
+
+  encoder_state->global->cur_lambda_cost = lambda;
+}
+
+static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  int i;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  int i;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
+  if (encoder->tiles_tile_id[lcu_addr_in_ts - 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
+    return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  if (encoder->tiles_tile_id[lcu_addr_in_ts + 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
+    return 1;
+  }
+  return 0;
+}
+
+//Return 1 if the LCU is at the first row of a structure (tile or slice)
+static int lcu_in_first_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y) {
+    return 1;
+  }
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->start_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
+    return 1;
+  }
+  
+  //One row above is before the start of the slice => it's also a boundary
+  if (lcu_addr_in_rs - encoder_state->encoder_control->in.width_in_lcu < encoder_state->slice->start_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+//Return 1 if the LCU is at the first row of a structure (tile or slice)
+static int lcu_in_last_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y + encoder_state->tile->cur_pic->height_in_lcu - 1) {
+    return 1;
+  }
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->end_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
+    return 1;
+  }
+  
+  //One row below is before the end of the slice => it's also a boundary
+  if (lcu_addr_in_rs + encoder_state->encoder_control->in.width_in_lcu > encoder_state->slice->end_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+
+//Return 1 if the LCU is at the first column of a structure (tile or slice)
+static int lcu_in_first_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  //First column of tile?
+  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x) {
+    return 1;
+  }
+  
+  //Slice start may not be aligned with the tile, so we need to allow this
+  if (lcu_addr_in_rs == encoder_state->slice->start_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+//Return 1 if the LCU is at the last column of a structure (tile or slice)
+static int lcu_in_last_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  //First column of tile?
+  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x + encoder_state->tile->cur_pic->width_in_lcu - 1) {
+    return 1;
+  }
+  
+  //Slice start may not be aligned with the tile, so we need to allow this
+  if (lcu_addr_in_rs == encoder_state->slice->end_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+
+static int encoder_state_config_global_init(encoder_state * const encoder_state) {
+  encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
+  if(!encoder_state->global->ref) {
+    fprintf(stderr, "Failed to allocate the picture list!\n");
+    return 0;
+  }
+  encoder_state->global->ref_list = REF_PIC_LIST_0;
+  encoder_state->global->frame = 0;
+  encoder_state->global->poc = 0;
+  return 1;
+}
+
+static void encoder_state_config_global_finalize(encoder_state * const encoder_state) {
+  picture_list_destroy(encoder_state->global->ref);
+}
+
+static int encoder_state_config_tile_init(encoder_state * const encoder_state, 
+                                          const int lcu_offset_x, const int lcu_offset_y,
+                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
+  
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  encoder_state->tile->cur_pic = picture_alloc(width, height, width_in_lcu, height_in_lcu);
+
+  if (!encoder_state->tile->cur_pic) {
+    printf("Error allocating picture!\r\n");
+    return 0;
+  }
+  
+  // Init coeff data table
+  //FIXME: move them
+  encoder_state->tile->cur_pic->coeff_y = MALLOC(coefficient, width * height);
+  encoder_state->tile->cur_pic->coeff_u = MALLOC(coefficient, (width * height) >> 2);
+  encoder_state->tile->cur_pic->coeff_v = MALLOC(coefficient, (width * height) >> 2);
+  
+  encoder_state->tile->lcu_offset_x = lcu_offset_x;
+  encoder_state->tile->lcu_offset_y = lcu_offset_y;
+  
+  encoder_state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
+  
+  //Allocate buffers
+  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
+  encoder_state->tile->hor_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
+  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
+  encoder_state->tile->ver_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->height_in_lcu * encoder_state->tile->cur_pic->width_in_lcu);
+  
+  if (encoder->sao_enable) {
+    encoder_state->tile->hor_buf_before_sao = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
+  } else {
+    encoder_state->tile->hor_buf_before_sao = NULL;
+  }
+  
+  if (encoder->wpp) {
+    encoder_state->tile->wf_jobs = MALLOC(threadqueue_job*, encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
+    if (!encoder_state->tile->wf_jobs) {
+      printf("Error allocating wf_jobs array!\n");
+      return 0;
+    }
+  } else {
+    encoder_state->tile->wf_jobs = NULL;
+  }
+  
+  encoder_state->tile->id = encoder->tiles_tile_id[encoder_state->tile->lcu_offset_in_ts];
+  return 1;
+}
+
+static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) {
+  if (encoder_state->tile->hor_buf_before_sao) yuv_t_free(encoder_state->tile->hor_buf_before_sao);
+  
+  yuv_t_free(encoder_state->tile->hor_buf_search);
+  yuv_t_free(encoder_state->tile->ver_buf_search);
+  
+  picture_free(encoder_state->tile->cur_pic);
+  encoder_state->tile->cur_pic = NULL;
+  
+  FREE_POINTER(encoder_state->tile->wf_jobs);
+}
+
+static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
+                                          const int start_address_in_ts, const int end_address_in_ts) {
+  int i = 0, slice_found=0;
+  for (i = 0; i < encoder_state->encoder_control->slice_count; ++i) {
+    if (encoder_state->encoder_control->slice_addresses_in_ts[i] == start_address_in_ts) {
+      encoder_state->slice->id = i;
+      slice_found = 1;
+      break;
+    }
+  }
+  assert(slice_found);
+  encoder_state->slice->start_in_ts = start_address_in_ts;
+  encoder_state->slice->end_in_ts = end_address_in_ts;
+  
+  encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
+  encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
+  return 1;
+}
+
+static void encoder_state_config_slice_finalize(encoder_state * const encoder_state) {
+  //Nothing to do (yet?)
+}
+
+static int encoder_state_config_wfrow_init(encoder_state * const encoder_state, 
+                                          const int lcu_offset_y) {
+  
+  encoder_state->wfrow->lcu_offset_y = lcu_offset_y;
+  return 1;
+}
+
+static void encoder_state_config_wfrow_finalize(encoder_state * const encoder_state) {
+  //Nothing to do (yet?)
+}
+
+#ifdef _DEBUG
+static void encoder_state_dump_graphviz(const encoder_state * const encoder_state) {
+  int i;
+  
+  if (!encoder_state->parent) {
+    const encoder_control * const encoder = encoder_state->encoder_control;
+    int y,x;
+    //Empty lines (easier to copy-paste)
+    printf("\n\n\n\n\n");
+    //Some styling...
+    printf("digraph EncoderStates {\n");
+    printf(" fontname = \"Bitstream Vera Sans\"\n");
+    printf(" fontsize = 8\n\n");
+    printf(" node [\n");
+    printf("  fontname = \"Bitstream Vera Sans\"\n");
+    printf("  fontsize = 8\n");
+    printf("  shape = \"record\"\n");
+    printf(" ]\n\n");
+    printf(" edge [\n");
+    printf("  arrowtail = \"empty\"\n");
+    printf(" ]\n\n");
+    
+    printf(" \"Map\" [\n");
+    printf("  shape=plaintext\n");
+    printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        
+        printf("<td>%d</td>", lcu_id_rs);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>TS Map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        
+        printf("<td>%d</td>", lcu_id_ts);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Tile map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        
+        printf("<td>%d</td>", encoder->tiles_tile_id[lcu_id_ts]);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Slice map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        int slice_id = 0;
+        
+        //Not efficient, but who cares
+        for (i=0; i < encoder->slice_count; ++i) {
+          if (encoder->slice_addresses_in_ts[i] <= lcu_id_ts) {
+            slice_id = i;
+          }
+        }
+        
+        printf("<td>%d</td>", slice_id);
+      }
+      printf("</tr>");
+    }
+    printf("</table>>\n ]\n");
+  }
+  
+  printf(" \"%p\" [\n", encoder_state);
+  printf("  label = \"{encoder_state|");
+  printf("+ type=%c\\l", encoder_state->type);
+  if (!encoder_state->parent || encoder_state->global != encoder_state->parent->global) {
+    printf("|+ global\\l");
+  }
+  if (!encoder_state->parent || encoder_state->tile != encoder_state->parent->tile) {
+    printf("|+ tile\\l");
+    printf(" - id = %d\\l", encoder_state->tile->id);
+    printf(" - lcu_offset_x = %d\\l", encoder_state->tile->lcu_offset_x);
+    printf(" - lcu_offset_y = %d\\l", encoder_state->tile->lcu_offset_y);
+    printf(" - lcu_offset_in_ts = %d\\l", encoder_state->tile->lcu_offset_in_ts);
+  }
+  if (!encoder_state->parent || encoder_state->slice != encoder_state->parent->slice) {
+    printf("|+ slice\\l");
+    printf(" - id = %d\\l", encoder_state->slice->id);
+    printf(" - start_in_ts = %d\\l", encoder_state->slice->start_in_ts);
+    printf(" - end_in_ts = %d\\l", encoder_state->slice->end_in_ts);
+    printf(" - start_in_rs = %d\\l", encoder_state->slice->start_in_rs);
+    printf(" - end_in_rs = %d\\l", encoder_state->slice->end_in_rs);
+  }
+  if (!encoder_state->parent || encoder_state->wfrow != encoder_state->parent->wfrow) {
+    printf("|+ wfrow\\l");
+    printf(" - lcu_offset_y = %d\\l", encoder_state->wfrow->lcu_offset_y);
+  }
+  printf("}\"\n");
+  printf(" ]\n");
+  
+  if (encoder_state->parent) {
+    printf(" \"%p\" -> \"%p\"\n", encoder_state->parent, encoder_state);
+  }
+  
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    encoder_state_dump_graphviz(&encoder_state->children[i]);
+  }
+  
+  if (!encoder_state->parent) {
+    printf("}\n");
+    //Empty lines (easier to copy-paste)
+    printf("\n\n\n\n\n");
+  }
+}
+#endif //_DEBUG
+
+int encoder_state_init(encoder_state * const child_state, encoder_state * const parent_state) {
+  //We require that, if parent_state is NULL:
+  //child_state->encoder_control is set
+  //
+  //If parent_state is not NULL, the following variable should either be set to NULL,
+  //in order to inherit from parent, or should point to a valid structure:
+  //child_state->global
+  //child_state->tile
+  //child_state->slice
+  //child_state->wfrow
+  
+  child_state->parent = parent_state;
+  child_state->children = MALLOC(encoder_state, 1);
+  child_state->children[0].encoder_control = NULL;
+  
+  if (!parent_state) {
+    const encoder_control * const encoder = child_state->encoder_control;
+    child_state->type = ENCODER_STATE_TYPE_MAIN;
+    assert(child_state->encoder_control);
+    child_state->global = MALLOC(encoder_state_config_global, 1);
+    if (!child_state->global || !encoder_state_config_global_init(child_state)) {
+      fprintf(stderr, "Could not initialize encoder_state->global!\n");
+      return 0;
+    }
+    child_state->tile = MALLOC(encoder_state_config_tile, 1);
+    if (!child_state->tile || !encoder_state_config_tile_init(child_state, 0, 0, encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu)) {
+      fprintf(stderr, "Could not initialize encoder_state->tile!\n");
+      return 0;
+    }
+    child_state->slice = MALLOC(encoder_state_config_slice, 1);
+    if (!child_state->slice || !encoder_state_config_slice_init(child_state, 0, encoder->in.width_in_lcu * encoder->in.height_in_lcu - 1)) {
+      fprintf(stderr, "Could not initialize encoder_state->slice!\n");
+      return 0;
+    }
+    child_state->wfrow = MALLOC(encoder_state_config_wfrow, 1);
+    if (!child_state->wfrow || !encoder_state_config_wfrow_init(child_state, 0)) {
+      fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
+      return 0;
+    }
+  } else {
+    child_state->encoder_control = parent_state->encoder_control;
+    if (!child_state->global) child_state->global = parent_state->global;
+    if (!child_state->tile) child_state->tile = parent_state->tile;
+    if (!child_state->slice) child_state->slice = parent_state->slice;
+    if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow;
+  }
+  
+  //Allocate bitstream
+  if (child_state->type == ENCODER_STATE_TYPE_MAIN) {
+    //Main encoder outputs to file
+    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_FILE)) {
+      fprintf(stderr, "Could not initialize stream!\n");
+      return 0;
+    }
+    child_state->stream.file.output = child_state->encoder_control->out.file;
+  } else {
+    //Other encoders use a memory bitstream
+    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_MEMORY)) {
+      fprintf(stderr, "Could not initialize stream!\n");
+      return 0;
+    }
+  }
+  
+  // Set CABAC output bitstream
+  child_state->cabac.stream = &child_state->stream;
+  
+  //Create sub-encoders
+  {
+    const encoder_control * const encoder = child_state->encoder_control;
+    int child_count = 0;
+    //We first check the type of this element.
+    //If it's a MAIN, it can allow both slices or tiles as child
+    //If it's a TILE, it can allow slices as child, if its parent is not a slice, or wavefront rows if there is no other children
+    //If it's a SLICE, it can allow tiles as child, if its parent is not a tile, or wavefront rows if there is no other children
+    //If it's a WAVEFRONT_ROW, it doesn't allow any children
+    int children_allow_wavefront_row = 0;
+    int children_allow_slice = 0;
+    int children_allow_tile = 0;
+    int range_start;
+    
+    int start_in_ts, end_in_ts;
+    
+    switch(child_state->type) {
+      case ENCODER_STATE_TYPE_MAIN:
+        children_allow_slice = 1;
+        children_allow_tile = 1;
+        start_in_ts = 0;
+        end_in_ts = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
+        break;
+      case ENCODER_STATE_TYPE_SLICE:
+        assert(child_state->parent);
+        if (child_state->parent->type != ENCODER_STATE_TYPE_TILE) children_allow_tile = 1;
+        children_allow_wavefront_row = encoder->wpp;
+        start_in_ts = child_state->slice->start_in_ts;
+        end_in_ts = child_state->slice->end_in_ts;
+        break;
+      case ENCODER_STATE_TYPE_TILE:
+        assert(child_state->parent);
+        if (child_state->parent->type != ENCODER_STATE_TYPE_SLICE) children_allow_slice = 1;
+        children_allow_wavefront_row = encoder->wpp;
+        start_in_ts = child_state->tile->lcu_offset_in_ts;
+        end_in_ts = child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
+        break;
+      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
+        //GCC tries to be too clever...
+        start_in_ts = -1;
+        end_in_ts = -1;
+        break;
+      default:
+        fprintf(stderr, "Invalid encoder_state->type %d!\n", child_state->type);
+        assert(0);
+        return 0;
+    }
+    
+    range_start = start_in_ts;
+    //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
+    while (range_start < end_in_ts && (children_allow_slice || children_allow_tile)) {
+      encoder_state *new_child = NULL;
+      int range_end_slice = range_start; //Will be incremented to get the range of the "thing"
+      int range_end_tile = range_start; //Will be incremented to get the range of the "thing"
+      
+      int tile_allowed = lcu_at_tile_start(encoder, range_start) && children_allow_tile;
+      int slice_allowed = lcu_at_slice_start(encoder, range_start) && children_allow_slice;
+      
+      //Find the smallest structure following the cursor
+      if (slice_allowed) {
+        while(!lcu_at_slice_end(encoder, range_end_slice)) {
+          ++range_end_slice;
+        }
+      }
+      
+      if (tile_allowed) {
+        while(!lcu_at_tile_end(encoder, range_end_tile)) {
+          ++range_end_tile;
+        }
+      }
+      
+      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d end_in_ts=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed,end_in_ts);
+      
+      if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
+        //Create a slice
+        new_child = &child_state->children[child_count];
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_SLICE;
+        new_child->global = child_state->global;
+        new_child->tile = child_state->tile;
+        new_child->wfrow = child_state->wfrow;
+        new_child->slice = MALLOC(encoder_state_config_slice, 1);
+        if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) {
+          fprintf(stderr, "Could not initialize encoder_state->slice!\n");
+          return 0;
+        }
+      }
+      
+      if ((!slice_allowed || (range_end_slice < range_end_tile)) && !new_child && tile_allowed) {
+        //Create a tile
+        int tile_id = encoder->tiles_tile_id[range_start];
+        int tile_x = tile_id % encoder->tiles_num_tile_columns;
+        int tile_y = tile_id / encoder->tiles_num_tile_columns;
+        
+        int lcu_offset_x = encoder->tiles_col_bd[tile_x];
+        int lcu_offset_y = encoder->tiles_row_bd[tile_y];
+        int width_in_lcu = encoder->tiles_col_bd[tile_x+1]-encoder->tiles_col_bd[tile_x];
+        int height_in_lcu = encoder->tiles_row_bd[tile_y+1]-encoder->tiles_row_bd[tile_y];
+        int width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - lcu_offset_x * LCU_WIDTH);
+        int height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - lcu_offset_y * LCU_WIDTH);
+        
+        new_child = &child_state->children[child_count];
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_TILE;
+        new_child->global = child_state->global;
+        new_child->tile = MALLOC(encoder_state_config_tile, 1);
+        new_child->slice = child_state->slice;
+        new_child->wfrow = child_state->wfrow;
+        
+        if (!new_child->tile || !encoder_state_config_tile_init(new_child, lcu_offset_x, lcu_offset_y, width, height, width_in_lcu, height_in_lcu)) {
+          fprintf(stderr, "Could not initialize encoder_state->tile!\n");
+          return 0;
+        }
+      }
+      
+      if (new_child) {
+        child_state->children = realloc(child_state->children, sizeof(encoder_state) * (2+child_count));
+        child_state->children[1+child_count].encoder_control = NULL;
+        if (!child_state->children) {
+          fprintf(stderr, "Failed to allocate memory for children...\n");
+          return 0;
+        }
+
+        //Fix children parent (since we changed the address), except for the last one which is not ready yet
+        {
+          int i, j;
+          for (i = 0; child_state->children[i].encoder_control && i < child_count; ++i) {
+            for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
+              child_state->children[i].children[j].parent = &child_state->children[i];
+            }
+            for (j = 0; j < child_state->children[i].lcu_order_count; ++j) {
+              child_state->children[i].lcu_order[j].encoder_state = &child_state->children[i];
+            }
+            child_state->children[i].cabac.stream = &child_state->children[i].stream;
+          }
+        }
+        
+        if (!encoder_state_init(&child_state->children[child_count], child_state)) {
+          fprintf(stderr, "Unable to init child...\n");
+          return 0;
+        }
+        child_count += 1;
+      }
+      
+      range_start = MAX(range_end_slice, range_end_tile) + 1;
+    }
+    
+    //We create wavefronts only if we have no children
+    if (children_allow_wavefront_row && child_count == 0) {
+      int first_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
+      int last_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
+      int num_rows;
+      int i;
+      
+      assert(!(children_allow_slice || children_allow_tile));
+      assert(child_count == 0);
+      
+      for (i=start_in_ts; i<end_in_ts; ++i) {
+        const int row = encoder->tiles_ctb_addr_ts_to_rs[i] / encoder->in.width_in_lcu;
+        if (row < first_row) first_row = row;
+        if (row > last_row) last_row = row;
+      }
+      
+      num_rows = last_row - first_row + 1;
+      
+      //When entropy_coding_sync_enabled_flag is equal to 1 and the first coding tree block in a slice is not the first coding
+      //tree block of a row of coding tree blocks in a tile, it is a requirement of bitstream conformance that the last coding tree
+      //block in the slice shall belong to the same row of coding tree blocks as the first coding tree block in the slice.
+      
+      if (encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] % encoder->in.width_in_lcu != child_state->tile->lcu_offset_x) {
+        if (num_rows > 1) {
+          fprintf(stderr, "Invalid: first CTB in slice %d is not at the tile %d edge, and the slice spans on more than one row.\n", child_state->slice->id, child_state->tile->id);
+          return 0;
+        }
+      }
+      
+      //FIXME Do the same kind of check if we implement slice segments
+    
+      child_count = num_rows;
+      child_state->children = realloc(child_state->children, sizeof(encoder_state) * (num_rows + 1));
+      child_state->children[num_rows].encoder_control = NULL;
+      
+      for (i=0; i < num_rows; ++i) {
+        encoder_state *new_child = &child_state->children[i];
+        
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
+        new_child->global = child_state->global;
+        new_child->tile = child_state->tile;
+        new_child->slice = child_state->slice;
+        new_child->wfrow = MALLOC(encoder_state_config_wfrow, 1);
+        
+        if (!new_child->wfrow || !encoder_state_config_wfrow_init(new_child, i)) {
+          fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
+          return 0;
+        }
+        
+        if (!encoder_state_init(new_child, child_state)) {
+          fprintf(stderr, "Unable to init child...\n");
+          return 0;
+        }
+      }
+    }
+    
+    child_state->is_leaf = (child_count == 0);
+    //This node is a leaf, compute LCU-order
+    if (child_state->is_leaf) {
+      //All LCU computations are relative to the tile
+      //Remark: this could be optimized, but since it's run only once, it's better to do it in a understandable way.
+      
+      //By default, the full tile
+      int i;
+      int lcu_id;
+      int lcu_start = 0;
+      //End is the element AFTER the end (iterate < lcu_end)
+      int lcu_end = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
+      
+      //Restrict to the current slice if needed
+      lcu_start = MAX(lcu_start, child_state->slice->start_in_ts - child_state->tile->lcu_offset_in_ts);
+      lcu_end = MIN(lcu_end, child_state->slice->end_in_ts - child_state->tile->lcu_offset_in_ts + 1);
+      
+      //Restrict to the current wavefront row if needed
+      if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+        lcu_start = MAX(lcu_start, (child_state->wfrow->lcu_offset_y) * child_state->tile->cur_pic->width_in_lcu);
+        lcu_end = MIN(lcu_end, (child_state->wfrow->lcu_offset_y + 1) * child_state->tile->cur_pic->width_in_lcu);
+      }
+      
+      child_state->lcu_order_count = lcu_end - lcu_start;
+      child_state->lcu_order = MALLOC(lcu_order_element, child_state->lcu_order_count);
+      assert(child_state->lcu_order);
+      
+      for (i = 0; i < child_state->lcu_order_count; ++i) {
+        lcu_id = lcu_start + i;
+        child_state->lcu_order[i].encoder_state = child_state;
+        child_state->lcu_order[i].id = lcu_id;
+        child_state->lcu_order[i].index = i;
+        child_state->lcu_order[i].position.x = lcu_id % child_state->tile->cur_pic->width_in_lcu;
+        child_state->lcu_order[i].position.y = lcu_id / child_state->tile->cur_pic->width_in_lcu;
+        child_state->lcu_order[i].position_px.x = child_state->lcu_order[i].position.x * LCU_WIDTH;
+        child_state->lcu_order[i].position_px.y = child_state->lcu_order[i].position.y * LCU_WIDTH;
+        child_state->lcu_order[i].size.x = MIN(LCU_WIDTH, encoder->in.width - (child_state->tile->lcu_offset_x * LCU_WIDTH + child_state->lcu_order[i].position_px.x));
+        child_state->lcu_order[i].size.y = MIN(LCU_WIDTH, encoder->in.height - (child_state->tile->lcu_offset_y * LCU_WIDTH + child_state->lcu_order[i].position_px.y));
+        child_state->lcu_order[i].first_row = lcu_in_first_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        child_state->lcu_order[i].last_row = lcu_in_last_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        child_state->lcu_order[i].first_column = lcu_in_first_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        child_state->lcu_order[i].last_column = lcu_in_last_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        
+        child_state->lcu_order[i].above = NULL;
+        child_state->lcu_order[i].below = NULL;
+        child_state->lcu_order[i].left = NULL;
+        child_state->lcu_order[i].right = NULL;
+        
+        if (!child_state->lcu_order[i].first_row) {
+          //Find LCU above
+          if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+            int j;
+            //For all previous wavefront rows
+            for (j=0; &child_state->parent->children[j] != child_state && child_state->parent->children[j].encoder_control; ++j) {
+              if (child_state->parent->children[j].wfrow->lcu_offset_y == child_state->wfrow->lcu_offset_y - 1) {
+                int k;
+                for (k=0; k < child_state->parent->children[j].lcu_order_count; ++k) {
+                  if (child_state->parent->children[j].lcu_order[k].position.x == child_state->lcu_order[i].position.x) {
+                    assert(child_state->parent->children[j].lcu_order[k].position.y == child_state->lcu_order[i].position.y - 1);
+                    child_state->lcu_order[i].above = &child_state->parent->children[j].lcu_order[k];
+                  }
+                }
+              }
+            }
+          } else {
+            child_state->lcu_order[i].above = &child_state->lcu_order[i-child_state->tile->cur_pic->width_in_lcu];
+          }
+          assert(child_state->lcu_order[i].above);
+          child_state->lcu_order[i].above->below = &child_state->lcu_order[i];
+        }
+        if (!child_state->lcu_order[i].first_column) {
+          child_state->lcu_order[i].left = &child_state->lcu_order[i-1];
+          assert(child_state->lcu_order[i].left->position.x == child_state->lcu_order[i].position.x - 1);
+          child_state->lcu_order[i].left->right = &child_state->lcu_order[i];
+        }
+      }
+    } else {
+      child_state->lcu_order_count = 0;
+      child_state->lcu_order = NULL;
+    }
+  }
+  
+  //Validate the structure
+  if (child_state->type == ENCODER_STATE_TYPE_TILE) {
+    if (child_state->tile->lcu_offset_in_ts < child_state->slice->start_in_ts) {
+      fprintf(stderr, "Tile %d starts before slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
+      return 0;
+    }
+    if (child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1 > child_state->slice->end_in_ts) {
+      fprintf(stderr, "Tile %d ends after slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
+      return 0;
+    }
+  }
+  
+  if (child_state->type == ENCODER_STATE_TYPE_SLICE) {
+    if (child_state->slice->start_in_ts < child_state->tile->lcu_offset_in_ts) {
+      fprintf(stderr, "Slice %d starts before tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
+      return 0;
+    }
+    if (child_state->slice->end_in_ts > child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1) {
+      fprintf(stderr, "Slice %d ends after tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
+      return 0;
+    }
+  }
+  
+  
+#ifdef _DEBUG
+  if (!parent_state) encoder_state_dump_graphviz(child_state);
+#endif //_DEBUG
+  return 1;
+}
+
+void encoder_state_finalize(encoder_state * const encoder_state) {
+  if (encoder_state->children) {
+    int i=0;
+    for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+      encoder_state_finalize(&encoder_state->children[i]);
+    }
+    
+    FREE_POINTER(encoder_state->children);
+  }
+  
+  FREE_POINTER(encoder_state->lcu_order);
+  encoder_state->lcu_order_count = 0;
+  
+  if (!encoder_state->parent || (encoder_state->parent->wfrow != encoder_state->wfrow)) {
+    encoder_state_config_wfrow_finalize(encoder_state);
+    FREE_POINTER(encoder_state->wfrow);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->slice != encoder_state->slice)) {
+    encoder_state_config_slice_finalize(encoder_state);
+    FREE_POINTER(encoder_state->slice);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->tile != encoder_state->tile)) {
+    encoder_state_config_tile_finalize(encoder_state);
+    FREE_POINTER(encoder_state->tile);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->global != encoder_state->global)) {
+    encoder_state_config_global_finalize(encoder_state);
+    FREE_POINTER(encoder_state->global);
+  }
+  
+  bitstream_finalize(&encoder_state->stream);
+}
+
+
+static void encoder_state_clear_refs(encoder_state *encoder_state) {
+  //FIXME: Do we need to handle children? At present they all share the same global
+  while (encoder_state->global->ref->used_size) {
+    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
+  }
+
+  encoder_state->global->poc = 0;
+}
+
+static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
+  const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
+  const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
+  
+  const int target_offset_x = target_enc->tile->lcu_offset_x * LCU_WIDTH;
+  const int target_offset_y = target_enc->tile->lcu_offset_y * LCU_WIDTH;
+  
+  int source_stride = source_enc->tile->cur_pic->width;
+  int target_stride = target_enc->tile->cur_pic->width;
+  
+  int width;
+  int height;
+  
+  int source_offset;
+  int target_offset;
+  
+  //Do nothing if the source and the destination is the same!
+  if (source_enc->tile == target_enc->tile) return;
+
+  if (is_y_channel) {
+    target_offset = source_offset_x + source_offset_y * target_enc->tile->cur_pic->width;
+    source_offset = target_offset_x + target_offset_y * source_enc->tile->cur_pic->width;
+  } else {
+    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->tile->cur_pic->width/2;
+    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->tile->cur_pic->width/2;
+  }
+  
+  if (target_enc->children) {
+    //Use information from the source
+    width = MIN(source_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->width - source_offset_x);
+    height = MIN(source_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, target_enc->tile->cur_pic->height - source_offset_y);
+  } else {
+    //Use information from the target
+    width = MIN(target_enc->tile->cur_pic->width_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->width - target_offset_x);
+    height = MIN(target_enc->tile->cur_pic->height_in_lcu * LCU_WIDTH, source_enc->tile->cur_pic->height - target_offset_y);
+  }
+  
+  if (!is_y_channel) {
+    width /= 2;
+    height /= 2;
+    
+    source_stride /= 2;
+    target_stride /= 2;
+  }
+  
+  //picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_enc->cur_pic->width, target_enc->cur_pic->width);
+  picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_stride, target_stride);
+}
+
+
+
+static void encoder_state_write_aud(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  encode_access_unit_delimiter(encoder_state);
+  nal_write(stream, AUD_NUT, 0, 1);
+  bitstream_align(stream);
+}
+
+static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, const lcu_order_element * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) {
+  picture* const cur_pic = encoder_state->tile->cur_pic;
+  
+  if (hor_buf) {
+    const int rdpx = lcu->position_px.x;
+    const int rdpy = lcu->position_px.y + lcu->size.y - 1;
+    const int by = lcu->position.y;
+    
+    //Copy the bottom row of this LCU to the horizontal buffer
+    picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx],
+                        &hor_buf->y[lcu->position_px.x + by * cur_pic->width],
+                        lcu->size.x, 1, cur_pic->width, cur_pic->width);
+    picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
+                        &hor_buf->u[lcu->position_px.x / 2 + by * cur_pic->width / 2],
+                        lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2);
+    picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
+                        &hor_buf->v[lcu->position_px.x / 2 + by * cur_pic->width / 2],
+                        lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2);
+  }
+  
+  if (ver_buf) {
+    const int rdpx = lcu->position_px.x + lcu->size.x - 1;
+    const int rdpy = lcu->position_px.y;
+    const int bx = lcu->position.x;
+    
+    
+    //Copy the right row of this LCU to the vertical buffer.
+    picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx],
+                        &ver_buf->y[lcu->position_px.y + bx * cur_pic->height],
+                        1, lcu->size.y, cur_pic->width, 1);
+    picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
+                        &ver_buf->u[lcu->position_px.y / 2 + bx * cur_pic->height / 2],
+                        1, lcu->size.y / 2, cur_pic->width / 2, 1);
+    picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
+                        &ver_buf->v[lcu->position_px.y / 2 + bx * cur_pic->height / 2],
+                        1, lcu->size.y / 2, cur_pic->width / 2, 1);
+  }
+  
+}
+
+
+static void worker_encoder_state_encode_lcu(void * opaque) {
+  const lcu_order_element * const lcu = opaque;
+  encoder_state *encoder_state = lcu->encoder_state;
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  picture* const cur_pic = encoder_state->tile->cur_pic;
+  
+  //This part doesn't write to bitstream, it's only search, deblock and sao
+  
+  search_lcu(encoder_state, lcu->position_px.x, lcu->position_px.y, encoder_state->tile->hor_buf_search, encoder_state->tile->ver_buf_search);
+    
+  encoder_state_recdata_to_bufs(encoder_state, lcu, encoder_state->tile->hor_buf_search, encoder_state->tile->ver_buf_search);
+
+  if (encoder->deblock_enable) {
+    filter_deblock_lcu(encoder_state, lcu->position_px.x, lcu->position_px.y);
+  }
+
+  if (encoder->sao_enable) {
+    const int stride = cur_pic->width_in_lcu;
+    sao_info *sao_luma = &cur_pic->sao_luma[lcu->position.y * stride + lcu->position.x];
+    sao_info *sao_chroma = &cur_pic->sao_chroma[lcu->position.y * stride + lcu->position.x];
+    init_sao_info(sao_luma);
+    init_sao_info(sao_chroma);
+
+    {
+      sao_info *sao_top =  lcu->position.y != 0 ? &cur_pic->sao_luma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL;
+      sao_info *sao_left = lcu->position.x != 0 ? &cur_pic->sao_luma[lcu->position.y * stride + lcu->position.x -1] : NULL;
+      sao_search_luma(encoder_state, cur_pic, lcu->position.x, lcu->position.y, sao_luma, sao_top, sao_left);
+    }
+
+    {
+      sao_info *sao_top =  lcu->position.y != 0 ? &cur_pic->sao_chroma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL;
+      sao_info *sao_left = lcu->position.x != 0 ? &cur_pic->sao_chroma[lcu->position.y * stride + lcu->position.x - 1] : NULL;
+      sao_search_chroma(encoder_state, cur_pic, lcu->position.x, lcu->position.y, sao_chroma, sao_top, sao_left);
+    }
+
+    // Merge only if both luma and chroma can be merged
+    sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag;
+    sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag;
+  }
+  
+  
+  //Now write data to bitstream (required to have a correct CABAC state)
+  
+  //First LCU, and we are in a slice. We need a slice header
+  if (encoder_state->type == ENCODER_STATE_TYPE_SLICE && lcu->index == 0) {
+    encode_slice_header(encoder_state);
+    bitstream_align(&encoder_state->stream); 
+  }
+  
+  //Encode SAO
+  if (encoder->sao_enable) {
+    encode_sao(encoder_state, lcu->position.x, lcu->position.y, &cur_pic->sao_luma[lcu->position.y * cur_pic->width_in_lcu + lcu->position.x], &cur_pic->sao_chroma[lcu->position.y * cur_pic->width_in_lcu + lcu->position.x]);
+  }
+  
+  //Encode coding tree
+  encode_coding_tree(encoder_state, lcu->position.x << MAX_DEPTH, lcu->position.y << MAX_DEPTH, 0);
+
+  //Terminator
+  if (lcu->index < encoder_state->lcu_order_count - 1) {
+    //Since we don't handle slice segments, end of slice segment == end of slice
+    //Always 0 since otherwise it would be split
+    cabac_encode_bin_trm(&encoder_state->cabac, 0);  // end_of_slice_segment_flag
+  }
+  
+  //Wavefronts need the context to be copied to the next row
+  if (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && lcu->index == 1) {
+    int j;
+    //Find next encoder (next row)
+    for (j=0; encoder_state->parent->children[j].encoder_control; ++j) {
+      if (encoder_state->parent->children[j].wfrow->lcu_offset_y == encoder_state->wfrow->lcu_offset_y + 1) {
+        //And copy context
+        context_copy(&encoder_state->parent->children[j], encoder_state);
+      }
+    }
+  }
+  
+  if (encoder->sao_enable && lcu->above) {
+    //If we're not the first in the row
+    if (lcu->above->left) {
+      encoder_state_recdata_to_bufs(encoder_state, lcu->above->left, encoder_state->tile->hor_buf_before_sao, NULL);
+    }
+    //Latest LCU in the row, copy the data from the one above also
+    if (!lcu->right) {
+      encoder_state_recdata_to_bufs(encoder_state, lcu->above, encoder_state->tile->hor_buf_before_sao, NULL);
+    }
+  }
+}
+
+static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  
+  int i = 0;
+  
+  assert(encoder_state->is_leaf);
+  assert(encoder_state->lcu_order_count > 0);
+  
+  //If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, than we should not use parallelism
+  if (encoder_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (encoder_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !encoder_state->parent->children[1].encoder_control)) {
+    for (i = 0; i < encoder_state->lcu_order_count; ++i) {
+      PERFORMANCE_MEASURE_START();
+
+      worker_encoder_state_encode_lcu(&encoder_state->lcu_order[i]);
+
+#ifdef _DEBUG
+      {
+        const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
+        PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=search_lcu,frame=%d,tile=%d,slice=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
+      }
+#endif //_DEBUG
+    }
+    
+    if (encoder->sao_enable) {
+      PERFORMANCE_MEASURE_START();
+      sao_reconstruct_frame(encoder_state);
+      PERFORMANCE_MEASURE_END(encoder_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->lcu_order[0].position.y + encoder_state->tile->lcu_offset_y, encoder_state->lcu_order[encoder_state->lcu_order_count-1].position.y + encoder_state->tile->lcu_offset_y);
+    }
+  } else {
+    for (i = 0; i < encoder_state->lcu_order_count; ++i) {
+      const lcu_order_element * const lcu = &encoder_state->lcu_order[i];
+#ifdef _DEBUG
+      char job_description[256];
+      sprintf(job_description, "type=search_lcu,frame=%d,tile=%d,slice=%d,row=%d,position_x=%d,position_y=%d", encoder_state->global->frame, encoder_state->tile->id, encoder_state->slice->id, encoder_state->wfrow->lcu_offset_y, lcu->position.x + encoder_state->tile->lcu_offset_x, lcu->position.y + encoder_state->tile->lcu_offset_y);
+#else
+      char* job_description = NULL;
+#endif
+      encoder_state->tile->wf_jobs[lcu->id] = threadqueue_submit(encoder_state->encoder_control->threadqueue, worker_encoder_state_encode_lcu, (void*)lcu, 1, job_description);
+      if (encoder_state->tile->wf_jobs[lcu->id]) {
+        if (lcu->position.x > 0) {
+          // Wait for the LCU on the left.
+          threadqueue_job_dep_add(encoder_state->tile->wf_jobs[lcu->id], encoder_state->tile->wf_jobs[lcu->id - 1]);
+        }
+        if (lcu->position.y > 0) {
+          if (lcu->position.x < encoder_state->tile->cur_pic->width_in_lcu - 1) {
+            // Wait for the LCU to the top-right of this one.
+            threadqueue_job_dep_add(encoder_state->tile->wf_jobs[lcu->id], encoder_state->tile->wf_jobs[lcu->id - encoder_state->tile->cur_pic->width_in_lcu + 1]);
+          } else {
+            // If there is no top-right LCU, wait for the one above.
+            threadqueue_job_dep_add(encoder_state->tile->wf_jobs[lcu->id], encoder_state->tile->wf_jobs[lcu->id - encoder_state->tile->cur_pic->width_in_lcu]);
+          }
+        }
+        threadqueue_job_unwait_job(encoder_state->encoder_control->threadqueue, encoder_state->tile->wf_jobs[lcu->id]);
+      }
+    }
+  }
+}
+
+static void encoder_state_encode(encoder_state * const main_state);
+
+static void worker_encoder_state_encode_children(void * opaque) {
+  encoder_state *sub_state = opaque;
+  encoder_state_encode(sub_state);
+  if (sub_state->is_leaf) {
+    if (sub_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+      PERFORMANCE_MEASURE_START();
+      encoder_state_write_bitstream_leaf(sub_state);
+      PERFORMANCE_MEASURE_END(sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position.y + sub_state->tile->lcu_offset_y, sub_state->lcu_order[sub_state->lcu_order_count-1].position.y + sub_state->tile->lcu_offset_y);
+    } else {
+      threadqueue_job *job;
+#ifdef _DEBUG
+      char job_description[256];
+      sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,row=%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->wfrow->lcu_offset_y);
+#else
+      char* job_description = NULL;
+#endif
+      job = threadqueue_submit(sub_state->encoder_control->threadqueue, worker_encoder_state_write_bitstream_leaf, sub_state, 1, job_description);
+      threadqueue_job_dep_add(job, sub_state->tile->wf_jobs[sub_state->wfrow->lcu_offset_y * sub_state->tile->cur_pic->width_in_lcu + sub_state->lcu_order_count - 1]);
+      threadqueue_job_unwait_job(sub_state->encoder_control->threadqueue, job);
+      return;
+    }
+  }
+}
+
+typedef struct {
+  int y;
+  const encoder_state * encoder_state;
+} worker_sao_reconstruct_lcu_data;
+
+static void worker_sao_reconstruct_lcu(void *opaque) {
+  worker_sao_reconstruct_lcu_data *data = opaque;
+  picture * const cur_pic = data->encoder_state->tile->cur_pic;
+  unsigned stride = cur_pic->width_in_lcu;
+  int x;
+  
+  //TODO: copy only needed data
+  pixel *new_y_data = MALLOC(pixel, cur_pic->width * cur_pic->height);
+  pixel *new_u_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2);
+  pixel *new_v_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2);
+  
+  const int offset = cur_pic->width * (data->y*LCU_WIDTH);
+  const int offset_c = cur_pic->width/2 * (data->y*LCU_WIDTH_C);
+  int num_pixels = cur_pic->width * (LCU_WIDTH + 2);
+  
+  if (num_pixels + offset > cur_pic->width * cur_pic->height) {
+    num_pixels = cur_pic->width * cur_pic->height - offset;
+  }
+  
+  memcpy(&new_y_data[offset], &cur_pic->y_recdata[offset], sizeof(pixel) * num_pixels);
+  memcpy(&new_u_data[offset_c], &cur_pic->u_recdata[offset_c], sizeof(pixel) * num_pixels >> 2);
+  memcpy(&new_v_data[offset_c], &cur_pic->v_recdata[offset_c], sizeof(pixel) * num_pixels >> 2);
+  
+  if (data->y>0) {
+    //copy first row from buffer
+    memcpy(&new_y_data[cur_pic->width * (data->y*LCU_WIDTH-1)], &data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y-1)], cur_pic->width * sizeof(pixel));
+    memcpy(&new_u_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel));
+    memcpy(&new_v_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel));
+  }
+
+  for (x = 0; x < cur_pic->width_in_lcu; x++) {
+  // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
+    sao_info *sao_luma = &cur_pic->sao_luma[data->y * stride + x];
+    sao_info *sao_chroma = &cur_pic->sao_chroma[data->y * stride + x];
+    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_y_data, x, data->y, sao_luma, COLOR_Y);
+    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_u_data, x, data->y, sao_chroma, COLOR_U);
+    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_v_data, x, data->y, sao_chroma, COLOR_V);
+  }
+  
+  free(new_y_data);
+  free(new_u_data);
+  free(new_v_data);
+
+  free(opaque);
+}
+
+
+static int encoder_state_tree_is_a_chain(const encoder_state * const encoder_state) {
+  if (!encoder_state->children[0].encoder_control) return 1;
+  if (encoder_state->children[1].encoder_control) return 0;
+  return encoder_state_tree_is_a_chain(&encoder_state->children[0]);
+}
+
+static void encoder_state_encode(encoder_state * const main_state) {
+  //If we have children, encode at child level
+  if (main_state->children[0].encoder_control) {
+    int i=0;
+    //If we have only one child, than it cannot be the last split in tree
+    int node_is_the_last_split_in_tree = (main_state->children[1].encoder_control != 0);
+    
+    for (i=0; main_state->children[i].encoder_control; ++i) {
+      encoder_state *sub_state = &(main_state->children[i]);
+      
+      if (sub_state->tile != main_state->tile) {
+        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->y_data, main_state, main_state->tile->cur_pic->y_data, 1);
+        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->u_data, main_state, main_state->tile->cur_pic->u_data, 0);
+        encoder_state_blit_pixels(sub_state, sub_state->tile->cur_pic->v_data, main_state, main_state->tile->cur_pic->v_data, 0);
+      }
+      
+      //To be the last split, we require that every child is a chain
+      node_is_the_last_split_in_tree = node_is_the_last_split_in_tree && encoder_state_tree_is_a_chain(&main_state->children[i]);
+    }
+    //If it's the latest split point
+    if (node_is_the_last_split_in_tree) {
+      for (i=0; main_state->children[i].encoder_control; ++i) {
+        //If we don't have wavefronts, parallelize encoding of children.
+        if (main_state->children[i].type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+#ifdef _DEBUG
+          char job_description[256];
+          switch (main_state->children[i].type) {
+            case ENCODER_STATE_TYPE_TILE: 
+              sprintf(job_description, "frame=%d,tile=%d,row=%d-%d,position_x=%d,position_y=%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].tile->lcu_offset_x, main_state->children[i].tile->lcu_offset_y);
+              break;
+            case ENCODER_STATE_TYPE_SLICE:
+              sprintf(job_description, "frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
+              break;
+            default:
+              sprintf(job_description, "frame=%d,invalid", main_state->children[i].global->frame);
+              break;
+          }
+#else
+          char* job_description = NULL;
+#endif
+          threadqueue_submit(main_state->encoder_control->threadqueue, worker_encoder_state_encode_children, &(main_state->children[i]), 0, job_description);
+        } else {
+          //Wavefront rows have parallelism at LCU level, so we should not launch multiple threads here!
+          //FIXME: add an assert: we can only have wavefront children
+          worker_encoder_state_encode_children(&(main_state->children[i]));
+        }
+      }
+      
+      //If children are wavefront, we need to reconstruct SAO
+      if (main_state->encoder_control->sao_enable && main_state->children[0].type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+        int y;
+        picture * const cur_pic = main_state->tile->cur_pic;
+        threadqueue_job *previous_job = NULL;
+        
+        for (y = 0; y < cur_pic->height_in_lcu; ++y) {
+          worker_sao_reconstruct_lcu_data *data = MALLOC(worker_sao_reconstruct_lcu_data, 1);
+          threadqueue_job *job;
+#ifdef _DEBUG
+          char job_description[256];
+          sprintf(job_description, "frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y);
+#else
+          char* job_description = NULL;
+#endif
+          data->y = y;
+          data->encoder_state = main_state;
+          
+          job = threadqueue_submit(main_state->encoder_control->threadqueue, worker_sao_reconstruct_lcu, data, 1, job_description);
+          
+          if (previous_job) {
+            threadqueue_job_dep_add(job, previous_job);
+          }
+          previous_job = job;
+          
+          if (y < cur_pic->height_in_lcu - 1) {
+            //Not last row: depend on the last LCU of the row below
+            threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 1) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]);
+          } else {
+            //Last row: depend on the last LCU of the row
+            threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 0) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]);
+          }
+          threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, job);
+          
+        }
+      }
+      threadqueue_flush(main_state->encoder_control->threadqueue);
+    } else {
+      for (i=0; main_state->children[i].encoder_control; ++i) {
+        worker_encoder_state_encode_children(&(main_state->children[i]));
+      }
+    }
+    
+    for (i=0; main_state->children[i].encoder_control; ++i) {
+      encoder_state *sub_state = &(main_state->children[i]);
+      if (sub_state->tile != main_state->tile) {
+        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->y_recdata, sub_state, sub_state->tile->cur_pic->y_recdata, 1);
+        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->u_recdata, sub_state, sub_state->tile->cur_pic->u_recdata, 0);
+        encoder_state_blit_pixels(main_state, main_state->tile->cur_pic->v_recdata, sub_state, sub_state->tile->cur_pic->v_recdata, 0);
+      }
+    }
+  } else {
+    switch (main_state->type) {
+      case ENCODER_STATE_TYPE_TILE:
+      case ENCODER_STATE_TYPE_SLICE:
+      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
+        encoder_state_encode_leaf(main_state);
+        break;
+      default:
+        fprintf(stderr, "Unsupported leaf type %c!\n", main_state->type);
+        assert(0);
+    }
+  }
+}
+
+static void encoder_state_new_frame(encoder_state * const main_state) {
+  int i;
+  //FIXME Move this somewhere else!
+  if (main_state->type == ENCODER_STATE_TYPE_MAIN) {
+    const encoder_control * const encoder = main_state->encoder_control;
+    
+    const int is_first_frame = (main_state->global->frame == 0);
+    const int is_i_radl = (encoder->cfg->intra_period == 1 && main_state->global->frame % 2 == 0);
+    const int is_p_radl = (encoder->cfg->intra_period > 1 && (main_state->global->frame % encoder->cfg->intra_period) == 0);
+    main_state->global->is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
+    
+    if (main_state->global->is_radl_frame) {
+      // Clear the reference list
+      encoder_state_clear_refs(main_state);
+
+      main_state->global->slicetype = SLICE_I;
+      main_state->global->pictype = NAL_IDR_W_RADL;
+    } else {
+      main_state->global->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
+      main_state->global->pictype = NAL_TRAIL_R;
+    }
+  } else {
+    //Clear the bitstream if it's not the main encoder
+    bitstream_clear(&main_state->stream);
+  }
+  
+  if (main_state->is_leaf) {
+    //Leaf states have cabac and context
+    cabac_start(&main_state->cabac);
+    init_contexts(main_state, main_state->global->QP, main_state->global->slicetype);
+
+    // Initialize lambda value(s) to use in search
+    encoder_state_init_lambda(main_state);
+  }
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    encoder_state_new_frame(&main_state->children[i]);
+  }
+  
+
+}
+
+static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
+  const encoder_control * const encoder = main_state->encoder_control;
+  bitstream * const stream = &main_state->stream;
+
+  int i;
+
+  if (main_state->global->is_radl_frame) {
+    // Access Unit Delimiter (AUD)
+    if (encoder->aud_enable)
+      encoder_state_write_aud(main_state);
+
+    // Video Parameter Set (VPS)
+    nal_write(stream, NAL_VPS_NUT, 0, 1);
+    encode_vid_parameter_set(main_state);
+    bitstream_align(stream);
+
+    // Sequence Parameter Set (SPS)
+    nal_write(stream, NAL_SPS_NUT, 0, 1);
+    encode_seq_parameter_set(main_state);
+    bitstream_align(stream);
+
+    // Picture Parameter Set (PPS)
+    nal_write(stream, NAL_PPS_NUT, 0, 1);
+    encode_pic_parameter_set(main_state);
+    bitstream_align(stream);
+
+    if (main_state->global->frame == 0) {
+      // Prefix SEI
+      nal_write(stream, PREFIX_SEI_NUT, 0, 0);
+      encode_prefix_sei_version(main_state);
+      bitstream_align(stream);
+    }
+  } else {
+    // Access Unit Delimiter (AUD)
+    if (encoder->aud_enable)
+      encoder_state_write_aud(main_state);
+  }
+
+  {
+    // Not quite sure if this is correct, but it seems to have worked so far
+    // so I tried to not change it's behavior.
+    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
+
+    nal_write(stream,
+              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
+  }
+  {
+    PERFORMANCE_MEASURE_START();
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+    //FIXME: Move this...
+    bitstream_clear(&main_state->children[i].stream);
+  }
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  {
+    PERFORMANCE_MEASURE_START();
+    // Calculate checksum
+    add_checksum(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  //FIXME: Why is this needed?
+  main_state->tile->cur_pic->poc = main_state->global->poc;
+}
+
+static void worker_encoder_state_write_bitstream_leaf(void * opaque) {
+  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
+}
+
+static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  //Write terminator of the leaf
+  assert(encoder_state->is_leaf);
+  
+  //Last LCU
+  {
+    const lcu_order_element * const lcu = &encoder_state->lcu_order[encoder_state->lcu_order_count - 1];
+    const int lcu_addr_in_ts = lcu->id + encoder_state->tile->lcu_offset_in_ts;
+    const int end_of_slice_segment_flag = lcu_at_slice_end(encoder, lcu_addr_in_ts);
+  
+    cabac_encode_bin_trm(&encoder_state->cabac, end_of_slice_segment_flag);  // end_of_slice_segment_flag
+  
+    if (!end_of_slice_segment_flag) {
+      assert(lcu_at_tile_end(encoder, lcu_addr_in_ts) || lcu->position.x == (encoder_state->tile->cur_pic->width_in_lcu - 1));
+      cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
+      cabac_flush(&encoder_state->cabac);
+    } else {
+      cabac_flush(&encoder_state->cabac);
+      bitstream_align(&encoder_state->stream);
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
+  //If it's not a leaf, a tile is "nothing". We only have to write sub elements
+  int i;
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+  }
+}
+
+static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
+  int i;
+  encode_slice_header(main_state);
+  bitstream_align(&main_state->stream); 
+  
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+  }
+}
+
+
+static void encoder_state_write_bitstream(encoder_state * const main_state) {
+  int i;
+  if (!main_state->is_leaf) {
+    for (i=0; main_state->children[i].encoder_control; ++i) {
+      encoder_state *sub_state = &(main_state->children[i]);
+      encoder_state_write_bitstream(sub_state);
+    }
+    
+    switch (main_state->type) {
+      case ENCODER_STATE_TYPE_MAIN:
+        encoder_state_write_bitstream_main(main_state);
+        break;
+      case ENCODER_STATE_TYPE_TILE:
+        encoder_state_write_bitstream_tile(main_state);
+        break;
+      case ENCODER_STATE_TYPE_SLICE:
+        encoder_state_write_bitstream_slice(main_state);
+        break;
+      default:
+        fprintf(stderr, "Unsupported node type %c!\n", main_state->type);
+        assert(0);
+    }
+  }
+}
+
+void encode_one_frame(encoder_state * const main_state)
+{
+  {
+    PERFORMANCE_MEASURE_START();
+    encoder_state_new_frame(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=new_frame,frame=%d", main_state->global->frame);
+  }
+  {
+    PERFORMANCE_MEASURE_START();
+    encoder_state_encode(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=encode,frame=%d", main_state->global->frame);
+  }
+  {
+    encoder_state_write_bitstream(main_state);
+  }
+}
+
+static void fill_after_frame(unsigned height, unsigned array_width,
+                             unsigned array_height, pixel *data)
+{
+  pixel* p = data + height * array_width;
+  pixel* end = data + array_width * array_height;
+
+  while (p < end) {
+    // Fill the line by copying the line above.
+    memcpy(p, p - array_width, array_width);
+    p += array_width;
+  }
+}
+
+static int read_and_fill_frame_data(FILE *file,
+                                    unsigned width, unsigned height,
+                                    unsigned array_width, pixel *data)
+{
+  pixel* p = data;
+  pixel* end = data + array_width * height;
+  pixel fill_char;
+  unsigned i;
+
+  while (p < end) {
+    // Read the beginning of the line from input.
+    if (width != fread(p, sizeof(unsigned char), width, file))
+      return 0;
+
+    // Fill the rest with the last pixel value.
+    fill_char = p[width - 1];
+
+    for (i = width; i < array_width; ++i) {
+      p[i] = fill_char;
+    }
+
+    p += array_width;
+  }
+  return 1;
+}
+
+int read_one_frame(FILE* file, const encoder_state * const encoder_state)
+{
+  unsigned width = encoder_state->encoder_control->in.real_width;
+  unsigned height = encoder_state->encoder_control->in.real_height;
+  unsigned array_width = encoder_state->tile->cur_pic->width;
+  unsigned array_height = encoder_state->tile->cur_pic->height;
+
+  if (width != array_width) {
+    // In the case of frames not being aligned on 8 bit borders, bits need to be copied to fill them in.
+    if (!read_and_fill_frame_data(file, width, height, array_width,
+                                  encoder_state->tile->cur_pic->y_data) ||
+        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
+                                  encoder_state->tile->cur_pic->u_data) ||
+        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
+                                  encoder_state->tile->cur_pic->v_data))
+      return 0;
+  } else {
+    // Otherwise the data can be read directly to the array.
+    unsigned y_size = width * height;
+    unsigned uv_size = (width >> 1) * (height >> 1);
+    if (y_size  != fread(encoder_state->tile->cur_pic->y_data, sizeof(unsigned char),
+                         y_size, file) ||
+        uv_size != fread(encoder_state->tile->cur_pic->u_data, sizeof(unsigned char),
+                         uv_size, file) ||
+        uv_size != fread(encoder_state->tile->cur_pic->v_data, sizeof(unsigned char),
+                         uv_size, file))
+      return 0;
+  }
+
+  if (height != array_height) {
+    fill_after_frame(height, array_width, array_height,
+                     encoder_state->tile->cur_pic->y_data);
+    fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
+                     encoder_state->tile->cur_pic->u_data);
+    fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
+                     encoder_state->tile->cur_pic->v_data);
+  }
+  return 1;
+}
+
+/**
+ * \brief Add a checksum SEI message to the bitstream.
+ * \param encoder The encoder.
+ * \returns Void
+ */
+static void add_checksum(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+  unsigned char checksum[3][SEI_HASH_MAX_LENGTH];
+  uint32_t checksum_val;
+  unsigned int i;
+
+  nal_write(stream, NAL_SUFFIT_SEI_NUT, 0, 0);
+
+  picture_checksum(cur_pic, checksum);
+
+  WRITE_U(stream, 132, 8, "sei_type");
+  WRITE_U(stream, 13, 8, "size");
+  WRITE_U(stream, 2, 8, "hash_type"); // 2 = checksum
+
+  for (i = 0; i < 3; ++i) {
+    // Pack bits into a single 32 bit uint instead of pushing them one byte
+    // at a time.
+    checksum_val = (checksum[i][0] << 24) + (checksum[i][1] << 16) +
+                   (checksum[i][2] << 8) + (checksum[i][3]);
+    WRITE_U(stream, checksum_val, 32, "picture_checksum");
+  }
+
+  bitstream_align(stream);
+}
+
+void encode_access_unit_delimiter(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
+                   : encoder_state->global->slicetype == SLICE_P ? 1
+                   :                                             2;
+  WRITE_U(stream, pic_type, 3, "pic_type");
+}
+
+void encode_prefix_sei_version(encoder_state * const encoder_state)
+{
+#define STR_BUF_LEN 1000
+  bitstream * const stream = &encoder_state->stream;
+  int i, length;
+  char buf[STR_BUF_LEN] = { 0 };
+  char *s = buf + 16;
+  const config * const cfg = encoder_state->encoder_control->cfg;
+
+  // random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
+  static const uint8_t uuid[16] = {
+    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
+    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1
+  };
+  memcpy(buf, uuid, 16);
+
+  // user_data_payload_byte
+  s += sprintf(s, "Kvazaar HEVC Encoder v. " VERSION_STRING " - "
+                  "Copyleft 2012-2014 - http://ultravideo.cs.tut.fi/ - options:");
+  s += sprintf(s, " %dx%d", cfg->width, cfg->height);
+  s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
+               cfg->deblock_beta, cfg->deblock_tc);
+  s += sprintf(s, " sao=%d", cfg->sao_enable);
+  s += sprintf(s, " intra_period=%d", cfg->intra_period);
+  s += sprintf(s, " qp=%d", cfg->qp);
+  s += sprintf(s, " ref=%d", cfg->ref_frames);
+
+  length = (int)(s - buf + 1);  // length, +1 for \0
+
+  // Assert this so that in the future if the message gets longer, we remember
+  // to increase the buf len. Divide by 2 for margin.
+  assert(length < STR_BUF_LEN / 2);
+
+  // payloadType = 5 -> user_data_unregistered
+  WRITE_U(stream, 5, 8, "last_payload_type_byte");
+
+  // payloadSize
+  for (i = 0; i <= length - 255; i += 255)
+    WRITE_U(stream, 255, 8, "ff_byte");
+  WRITE_U(stream, length - i, 8, "last_payload_size_byte");
+
+  for (i = 0; i < length; i++)
+    WRITE_U(stream, ((uint8_t *)buf)[i], 8, "sei_payload");
+
+#undef STR_BUF_LEN
+}
+
+void encode_pic_parameter_set(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+#ifdef _DEBUG
+  printf("=========== Picture Parameter Set ID: 0 ===========\n");
+#endif
+  WRITE_UE(stream, 0, "pic_parameter_set_id");
+  WRITE_UE(stream, 0, "seq_parameter_set_id");
+  WRITE_U(stream, 0, 1, "dependent_slice_segments_enabled_flag");
+  WRITE_U(stream, 0, 1, "output_flag_present_flag");
+  WRITE_U(stream, 0, 3, "num_extra_slice_header_bits");
+  WRITE_U(stream, ENABLE_SIGN_HIDING, 1, "sign_data_hiding_flag");
+  WRITE_U(stream, 0, 1, "cabac_init_present_flag");
+
+  WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
+  WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
+  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
+  WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
+  WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
+  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
+  //if cu_qp_delta_enabled_flag
+  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
+
+  //TODO: add QP offsets
+  WRITE_SE(stream, 0, "pps_cb_qp_offset");
+  WRITE_SE(stream, 0, "pps_cr_qp_offset");
+  WRITE_U(stream, 0, 1, "pps_slice_chroma_qp_offsets_present_flag");
+  WRITE_U(stream, 0, 1, "weighted_pred_flag");
+  WRITE_U(stream, 0, 1, "weighted_bipred_idc");
+
+  //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
+  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
+  WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
+  //wavefronts
+  WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
+
+  if (encoder->tiles_enable) {
+    WRITE_UE(stream, encoder->tiles_num_tile_columns - 1, "num_tile_columns_minus1");
+    WRITE_UE(stream, encoder->tiles_num_tile_rows - 1, "num_tile_rows_minus1");
+    
+    WRITE_U(stream, encoder->tiles_uniform_spacing_flag, 1, "uniform_spacing_flag");
+    
+    if (!encoder->tiles_uniform_spacing_flag) {
+      int i;
+      for (i = 0; i < encoder->tiles_num_tile_columns - 1; ++i) {
+        WRITE_UE(stream, encoder->tiles_col_width[i] - 1, "column_width_minus1[...]");
+      }
+      for (i = 0; i < encoder->tiles_num_tile_rows - 1; ++i) {
+        WRITE_UE(stream, encoder->tiles_row_height[i] - 1, "row_height_minus1[...]");
+      }
+    }
+    WRITE_U(stream, 0, 1, "loop_filter_across_tiles_enabled_flag");
+    
+  }
+  
+  WRITE_U(stream, 0, 1, "loop_filter_across_slice_flag");
+  WRITE_U(stream, 1, 1, "deblocking_filter_control_present_flag");
+
+  //IF deblocking_filter
+    WRITE_U(stream, 0, 1, "deblocking_filter_override_enabled_flag");
+  WRITE_U(stream, encoder_state->encoder_control->deblock_enable ? 0 : 1, 1,
+          "pps_disable_deblocking_filter_flag");
+
+    //IF !disabled
+  if (encoder_state->encoder_control->deblock_enable) {
+     WRITE_SE(stream, encoder_state->encoder_control->beta_offset_div2, "beta_offset_div2");
+     WRITE_SE(stream, encoder_state->encoder_control->tc_offset_div2, "tc_offset_div2");
+    }
+
+    //ENDIF
+  //ENDIF
+  WRITE_U(stream, 0, 1, "pps_scaling_list_data_present_flag");
+  //IF scaling_list
+  //ENDIF
+  WRITE_U(stream, 0, 1, "lists_modification_present_flag");
+  WRITE_UE(stream, 0, "log2_parallel_merge_level_minus2");
+  WRITE_U(stream, 0, 1, "slice_segment_header_extension_present_flag");
+  WRITE_U(stream, 0, 1, "pps_extension_flag");
+}
+
+static void encode_PTL(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  int i;
+  // PTL
+  // Profile Tier
+  WRITE_U(stream, 0, 2, "general_profile_space");
+  WRITE_U(stream, 0, 1, "general_tier_flag");
+  // Main Profile == 1
+  WRITE_U(stream, 1, 5, "general_profile_idc");
+  /* Compatibility flags should be set at general_profile_idc
+   *  (so with general_profile_idc = 1, compatibility_flag[1] should be 1)
+   * According to specification, when compatibility_flag[1] is set,
+   *  compatibility_flag[2] should be set too.
+   */
+  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag[]");
+
+  WRITE_U(stream, 1, 1, "general_progressive_source_flag");
+  WRITE_U(stream, 0, 1, "general_interlaced_source_flag");
+  WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
+  WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
+
+  WRITE_U(stream, 0, 32, "XXX_reserved_zero_44bits[0..31]");
+  WRITE_U(stream, 0, 12, "XXX_reserved_zero_44bits[32..43]");
+
+  // end Profile Tier
+
+  // Level 6.2 (general_level_idc is 30 * 6.2)
+  WRITE_U(stream, 186, 8, "general_level_idc");
+
+  WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
+  WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
+
+  for (i = 1; i < 8; i++) {
+    WRITE_U(stream, 0, 2, "reserved_zero_2bits");
+  }
+
+  // end PTL
+}
+
+static void encode_scaling_list(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+  uint32_t size_id;
+  for (size_id = 0; size_id < SCALING_LIST_SIZE_NUM; size_id++) {
+    int32_t list_id;
+    for (list_id = 0; list_id < g_scaling_list_num[size_id]; list_id++) {
+      uint8_t scaling_list_pred_mode_flag = 1;
+      int32_t pred_list_idx;
+      int32_t i;
+      uint32_t ref_matrix_id = UINT32_MAX;
+
+      for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) {
+        const int32_t * const pred_list  = (list_id == pred_list_idx) ?
+                                     scalinglist_get_default(size_id, pred_list_idx) :
+                                     encoder->scaling_list.scaling_list_coeff[size_id][pred_list_idx];
+
+        if (!memcmp(encoder->scaling_list.scaling_list_coeff[size_id][list_id], pred_list, sizeof(int32_t) * MIN(8, g_scaling_list_size[size_id])) &&
+            ((size_id < SCALING_LIST_16x16) ||
+             (encoder->scaling_list.scaling_list_dc[size_id][list_id] == encoder->scaling_list.scaling_list_dc[size_id][pred_list_idx]))) {
+          ref_matrix_id = pred_list_idx;
+          scaling_list_pred_mode_flag = 0;
+          break;
+        }
+      }
+      WRITE_U(stream, scaling_list_pred_mode_flag, 1, "scaling_list_pred_mode_flag" );
+
+      if (!scaling_list_pred_mode_flag) {
+        WRITE_UE(stream, list_id - ref_matrix_id, "scaling_list_pred_matrix_id_delta");
+      } else {
+        int32_t delta;
+        const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, g_scaling_list_size[size_id]);
+        const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32;
+        int32_t next_coef = 8;
+        const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeff[size_id][list_id];
+
+        if (size_id >= SCALING_LIST_16x16) {
+          WRITE_SE(stream, encoder->scaling_list.scaling_list_dc[size_id][list_id] - 8, "scaling_list_dc_coef_minus8");
+          next_coef = encoder->scaling_list.scaling_list_dc[size_id][list_id];
+        }
+
+        for (i = 0; i < coef_num; i++) {
+          delta     = coef_list[scan_cg[i]] - next_coef;
+          next_coef = coef_list[scan_cg[i]];
+          if (delta > 127)
+            delta -= 256;
+          if (delta < -128)
+            delta += 256;
+
+          WRITE_SE(stream, delta, "scaling_list_delta_coef");
+        }
+      }
+    }
+  }
+}
+
+void encode_seq_parameter_set(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  //FIXME: use encoder_control instead of cur_pic
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+
+#ifdef _DEBUG
+  printf("=========== Sequence Parameter Set ID: 0 ===========\n");
+#endif
+
+  // TODO: profile IDC and level IDC should be defined later on
+  WRITE_U(stream, 0, 4, "sps_video_parameter_set_id");
+  WRITE_U(stream, 1, 3, "sps_max_sub_layers_minus1");
+  WRITE_U(stream, 0, 1, "sps_temporal_id_nesting_flag");
+
+  encode_PTL(encoder_state);
+
+  WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
+  WRITE_UE(stream, encoder_state->encoder_control->in.video_format,
+           "chroma_format_idc");
+
+  if (encoder_state->encoder_control->in.video_format == 3) {
+    WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
+  }
+
+  WRITE_UE(stream, cur_pic->width, "pic_width_in_luma_samples");
+  WRITE_UE(stream, cur_pic->height, "pic_height_in_luma_samples");
+
+  if (cur_pic->width != encoder_state->encoder_control->in.real_width || cur_pic->height != encoder_state->encoder_control->in.real_height) {
+    // The standard does not seem to allow setting conf_win values such that
+    // the number of luma samples is not a multiple of 2. Options are to either
+    // hide one line or show an extra line of non-video. Neither seems like a
+    // very good option, so let's not even try.
+    assert(!(cur_pic->width % 2));
+    WRITE_U(stream, 1, 1, "conformance_window_flag");
+    WRITE_UE(stream, 0, "conf_win_left_offset");
+    WRITE_UE(stream, (cur_pic->width - encoder_state->encoder_control->in.real_width) >> 1,
+             "conf_win_right_offset");
+    WRITE_UE(stream, 0, "conf_win_top_offset");
+    WRITE_UE(stream, (cur_pic->height - encoder_state->encoder_control->in.real_height) >> 1,
+             "conf_win_bottom_offset");
+  } else {
+    WRITE_U(stream, 0, 1, "conformance_window_flag");
+  }
+
+  //IF window flag
+  //END IF
+
+  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_luma_minus8");
+  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_chroma_minus8");
+  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
+  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
+
+  //for each layer
+  WRITE_UE(stream, 0, "sps_max_dec_pic_buffering");
+  WRITE_UE(stream, 0, "sps_num_reorder_pics");
+  WRITE_UE(stream, 0, "sps_max_latency_increase");
+  //end for
+
+  WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3");
+  WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
+  WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
+  WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
+  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
+  WRITE_UE(stream, TR_DEPTH_INTRA, "max_transform_hierarchy_depth_intra");
+
+  // scaling list
+  WRITE_U(stream, encoder_state->encoder_control->scaling_list.enable, 1, "scaling_list_enable_flag");
+  if (encoder_state->encoder_control->scaling_list.enable) {
+    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
+    encode_scaling_list(encoder_state);
+  }
+
+  WRITE_U(stream, 0, 1, "amp_enabled_flag");
+  WRITE_U(stream, encoder_state->encoder_control->sao_enable ? 1 : 0, 1,
+          "sample_adaptive_offset_enabled_flag");
+  WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
+  #if ENABLE_PCM == 1
+    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_luma_minus1");
+    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_chroma_minus1");
+    WRITE_UE(stream, 0, "log2_min_pcm_coding_block_size_minus3");
+    WRITE_UE(stream, 2, "log2_diff_max_min_pcm_coding_block_size");
+    WRITE_U(stream, 1, 1, "pcm_loop_filter_disable_flag");
+  #endif
+
+  WRITE_UE(stream, 0, "num_short_term_ref_pic_sets");
+
+  //IF num short term ref pic sets
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "long_term_ref_pics_present_flag");
+
+  //IF long_term_ref_pics_present
+  //ENDIF
+
+  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
+          "sps_temporal_mvp_enable_flag");
+  WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
+  WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
+
+  encode_VUI(encoder_state);
+
+  WRITE_U(stream, 0, 1, "sps_extension_flag");
+}
+
+void encode_vid_parameter_set(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  int i;
+#ifdef _DEBUG
+  printf("=========== Video Parameter Set ID: 0 ===========\n");
+#endif
+
+  WRITE_U(stream, 0, 4, "vps_video_parameter_set_id");
+  WRITE_U(stream, 3, 2, "vps_reserved_three_2bits" );
+  WRITE_U(stream, 0, 6, "vps_reserved_zero_6bits" );
+  WRITE_U(stream, 1, 3, "vps_max_sub_layers_minus1");
+  WRITE_U(stream, 0, 1, "vps_temporal_id_nesting_flag");
+  WRITE_U(stream, 0xffff, 16, "vps_reserved_ffff_16bits");
+
+  encode_PTL(encoder_state);
+
+  WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
+
+  //for each layer
+  for (i = 0; i < 1; i++) {
+  WRITE_UE(stream, 1, "vps_max_dec_pic_buffering");
+  WRITE_UE(stream, 0, "vps_num_reorder_pics");
+  WRITE_UE(stream, 0, "vps_max_latency_increase");
+  }
+
+  WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
+  WRITE_UE(stream, 0, "vps_max_op_sets_minus1");
+  WRITE_U(stream, 0, 1, "vps_timing_info_present_flag");
+
+  //IF timing info
+  //END IF
+
+  WRITE_U(stream, 0, 1, "vps_extension_flag");
+}
+
+static void encode_VUI(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  const encoder_control * const encoder = encoder_state->encoder_control;
+#ifdef _DEBUG
+  printf("=========== VUI Set ID: 0 ===========\n");
+#endif
+  if (encoder->vui.sar_width > 0 && encoder->vui.sar_height > 0) {
+    int i;
+    static const struct
+    {
+      uint8_t width;
+      uint8_t height;
+      uint8_t idc;
+    } sar[] = {
+      // aspect_ratio_idc = 0 -> unspecified
+      {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+      { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+      { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+      {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
+      // aspect_ratio_idc = [17..254] -> reserved
+      { 0, 0, 255 }
+    };
+
+    for (i = 0; sar[i].idc != 255; i++)
+      if (sar[i].width  == encoder->vui.sar_width &&
+          sar[i].height == encoder->vui.sar_height)
+        break;
+
+    WRITE_U(stream, 1, 1, "aspect_ratio_info_present_flag");
+    WRITE_U(stream, sar[i].idc, 8, "aspect_ratio_idc");
+    if (sar[i].idc == 255) {
+      // EXTENDED_SAR
+      WRITE_U(stream, encoder->vui.sar_width, 16, "sar_width");
+      WRITE_U(stream, encoder->vui.sar_height, 16, "sar_height");
+    }
+  } else
+    WRITE_U(stream, 0, 1, "aspect_ratio_info_present_flag");
+
+  //IF aspect ratio info
+  //ENDIF
+
+  if (encoder->vui.overscan > 0) {
+    WRITE_U(stream, 1, 1, "overscan_info_present_flag");
+    WRITE_U(stream, encoder->vui.overscan - 1, 1, "overscan_appropriate_flag");
+  } else
+    WRITE_U(stream, 0, 1, "overscan_info_present_flag");
+
+  //IF overscan info
+  //ENDIF
+
+  if (encoder->vui.videoformat != 5 || encoder->vui.fullrange ||
+      encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
+      encoder->vui.colormatrix != 2) {
+    WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
+    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
+    WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
+
+    if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
+        encoder->vui.colormatrix != 2) {
+      WRITE_U(stream, 1, 1, "colour_description_present_flag");
+      WRITE_U(stream, encoder->vui.colorprim, 8, "colour_primaries");
+      WRITE_U(stream, encoder->vui.transfer, 8, "transfer_characteristics");
+      WRITE_U(stream, encoder->vui.colormatrix, 8, "matrix_coeffs");
+    } else
+      WRITE_U(stream, 0, 1, "colour_description_present_flag");
+  } else
+    WRITE_U(stream, 0, 1, "video_signal_type_present_flag");
+
+  //IF video type
+  //ENDIF
+
+  if (encoder->vui.chroma_loc > 0) {
+    WRITE_U(stream, 1, 1, "chroma_loc_info_present_flag");
+    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_top_field");
+    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_bottom_field");
+  } else
+    WRITE_U(stream, 0, 1, "chroma_loc_info_present_flag");
+
+  //IF chroma loc info
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "neutral_chroma_indication_flag");
+  WRITE_U(stream, 0, 1, "field_seq_flag");
+  WRITE_U(stream, 0, 1, "frame_field_info_present_flag");
+  WRITE_U(stream, 0, 1, "default_display_window_flag");
+
+  //IF default display window
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "vui_timing_info_present_flag");
+
+  //IF timing info
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "bitstream_restriction_flag");
+
+  //IF bitstream restriction
+  //ENDIF
+}
+
+void encoder_next_frame(encoder_state *encoder_state) {
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  picture *old_pic;
+  
+  // Remove the ref pic (if present)
+  if (encoder_state->global->ref->used_size == (uint32_t)encoder->cfg->ref_frames) {
+    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size-1);
+  }
+  // Add current picture as reference
+  picture_list_add(encoder_state->global->ref, encoder_state->tile->cur_pic);
+  // Allocate new memory to current picture
+  old_pic = encoder_state->tile->cur_pic;
+  // TODO: reuse memory from old reference
+  encoder_state->tile->cur_pic = picture_alloc(encoder_state->tile->cur_pic->width, encoder_state->tile->cur_pic->height, encoder_state->tile->cur_pic->width_in_lcu, encoder_state->tile->cur_pic->height_in_lcu);
+
+  //FIXME: does the coeff_* really belongs to cur_pic?
+  // Copy pointer from the last cur_pic because we don't want to reallocate it
+  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_y,old_pic->coeff_y);
+  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_u,old_pic->coeff_u);
+  MOVE_POINTER(encoder_state->tile->cur_pic->coeff_v,old_pic->coeff_v);
+  
+  picture_free(old_pic);
+
+  encoder_state->global->frame++;
+  encoder_state->global->poc++;
+}
+
+static void encoder_state_entry_points_explore(const encoder_state * const encoder_state, int * const r_count, int * const r_max_length) {
+  int i;
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    if (encoder_state->children[i].is_leaf) {
+      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
+      ++(*r_count);
+      if (my_length > *r_max_length) {
+        *r_max_length = my_length;
+      }
+    } else {
+      encoder_state_entry_points_explore(&encoder_state->children[i], r_count, r_max_length);
+    }
+  }
+}
+
+static void encoder_state_entry_points_write(bitstream * const stream, const encoder_state * const encoder_state, const int num_entry_points, const int write_length, int * const r_count) {
+  int i;
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    if (encoder_state->children[i].is_leaf) {
+      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
+      ++(*r_count);
+      //Don't write the last one
+      if (*r_count < num_entry_points) {
+        WRITE_U(stream, my_length - 1, write_length, "entry_point_offset-minus1")
+      }
+    } else {
+      encoder_state_entry_points_write(stream, &encoder_state->children[i], num_entry_points, write_length, r_count);
+    }
+  }
+}
+
+static int num_bitcount(unsigned int n) {
+  int pos = 0;
+  if (n >= 1<<16) { n >>= 16; pos += 16; }
+  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
+  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
+  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
+  if (n >= 1<< 1) {           pos +=  1; }
+  return ((n == 0) ? (-1) : pos);
+}
+
+void encode_slice_header(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+
+#ifdef _DEBUG
+  printf("=========== Slice ===========\n");
+#endif
+  WRITE_U(stream, (encoder_state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
+
+  if (encoder_state->global->pictype >= NAL_BLA_W_LP
+      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
+    WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
+  }
+
+  WRITE_UE(stream, 0, "slice_pic_parameter_set_id");
+  if (encoder_state->slice->start_in_rs > 0) {
+    //For now, we don't support dependent slice segments
+    //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
+    WRITE_UE(stream, encoder_state->slice->start_in_rs, "slice_segment_address");
+  }
+
+  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
+
+  // if !entropy_slice_flag
+
+    //if output_flag_present_flag
+      //WRITE_U(stream, 1, 1, "pic_output_flag");
+    //end if
+    //if( IdrPicFlag ) <- nal_unit_type == 5
+  if (encoder_state->global->pictype != NAL_IDR_W_RADL
+      && encoder_state->global->pictype != NAL_IDR_N_LP) {
+      int j;
+      int ref_negative = encoder_state->global->ref->used_size;
+      int ref_positive = 0;
+      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
+      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
+      WRITE_UE(stream, ref_negative, "num_negative_pics");
+      WRITE_UE(stream, ref_positive, "num_positive_pics");
+
+    for (j = 0; j < ref_negative; j++) {
+      int32_t delta_poc_minus1 = 0;
+      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
+      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
+    }
+
+    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
+  }
+
+    //end if
+  //end if
+  if (encoder->sao_enable) {
+    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
+    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
+  }
+
+  if (encoder_state->global->slicetype != SLICE_I) {
+      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
+        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
+      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
+  }
+
+  if (encoder_state->global->slicetype == SLICE_B) {
+      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
+  }
+
+  // Skip flags that are not present
+  // if !entropy_slice_flag
+    WRITE_SE(stream, 0, "slice_qp_delta");
+    //WRITE_U(stream, 1, 1, "alignment");
+   
+  if (encoder->tiles_enable || encoder->wpp) {
+    int num_entry_points = 0;
+    int max_length_seen = 0;
+    
+    encoder_state_entry_points_explore(encoder_state, &num_entry_points, &max_length_seen);
+    
+    WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets");
+    if (num_entry_points > 0) {
+      int entry_points_written = 0;
+      int offset_len = num_bitcount(max_length_seen) + 1;
+      WRITE_UE(stream, offset_len - 1, "offset_len_minus1");
+      encoder_state_entry_points_write(stream, encoder_state, num_entry_points, offset_len, &entry_points_written); 
+    }
+  }
+}
+
+
+static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
+                             color_index color_i)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  sao_eo_cat i;
+
+  // Skip colors with no SAO.
+  //FIXME: for now, we always have SAO for all channels
+  if (color_i == COLOR_Y && 0) return;
+  if (color_i != COLOR_Y && 0) return;
+
+  /// sao_type_idx_luma:   TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
+  /// sao_type_idx_chroma: TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
+  // Encode sao_type_idx for Y and U+V.
+  if (color_i != COLOR_V) {
+    cabac->ctx = &(cabac->ctx_sao_type_idx_model);;
+    CABAC_BIN(cabac, sao->type != SAO_TYPE_NONE, "sao_type_idx");
+    if (sao->type == SAO_TYPE_BAND) {
+      CABAC_BIN_EP(cabac, 0, "sao_type_idx_ep");
+    } else if (sao->type == SAO_TYPE_EDGE) {
+      CABAC_BIN_EP(cabac, 1, "sao_type_idx_ep");
+    }
+  }
+
+  if (sao->type == SAO_TYPE_NONE) return;
+
+  /// sao_offset_abs[][][][]: TR, cMax = (1 << (Min(bitDepth, 10) - 5)) - 1,
+  ///                         cRiceParam = 0, bins = {bypass x N}
+  for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
+    cabac_write_unary_max_symbol_ep(cabac, abs(sao->offsets[i]), SAO_ABS_OFFSET_MAX);
+  }
+
+  /// sao_offset_sign[][][][]: FL, cMax = 1, bins = {bypass}
+  /// sao_band_position[][][]: FL, cMax = 31, bins = {bypass x N}
+  /// sao_eo_class_luma:       FL, cMax = 3, bins = {bypass x 3}
+  /// sao_eo_class_chroma:     FL, cMax = 3, bins = {bypass x 3}
+  if (sao->type == SAO_TYPE_BAND) {
+    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
+      // Positive sign is coded as 0.
+      if(sao->offsets[i] != 0) {
+        CABAC_BIN_EP(cabac, sao->offsets[i] < 0 ? 1 : 0, "sao_offset_sign");
+      }
+    }
+    // TODO: sao_band_position
+    // FL cMax=31 (5 bits)
+    CABAC_BINS_EP(cabac, sao->band_position, 5, "sao_band_position");
+  } else if (color_i != COLOR_V) {
+    CABAC_BINS_EP(cabac, sao->eo_class, 2, "sao_eo_class");
+  }
+}
+
+static void encode_sao_merge_flags(encoder_state * const encoder_state, sao_info *sao, unsigned x_ctb, unsigned y_ctb)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  // SAO merge flags are not present for the first row and column.
+  if (x_ctb > 0) {
+    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
+    CABAC_BIN(cabac, sao->merge_left_flag, "sao_merge_left_flag");
+  }
+  if (y_ctb > 0 && !sao->merge_left_flag) {
+    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
+    CABAC_BIN(cabac, sao->merge_up_flag, "sao_merge_up_flag");
+  }
+}
+
+/**
+ * \brief Encode SAO information.
+ */
+static void encode_sao(encoder_state * const encoder_state,
+                       unsigned x_lcu, uint16_t y_lcu,
+                       sao_info *sao_luma, sao_info *sao_chroma)
+{
+  // TODO: transmit merge flags outside sao_info
+  encode_sao_merge_flags(encoder_state, sao_luma, x_lcu, y_lcu);
+
+  // If SAO is merged, nothing else needs to be coded.
+  if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) {
+    encode_sao_color(encoder_state, sao_luma, COLOR_Y);
+    encode_sao_color(encoder_state, sao_chroma, COLOR_U);
+    encode_sao_color(encoder_state, sao_chroma, COLOR_V);
+  }
+}
+
+
+void encode_coding_tree(encoder_state * const encoder_state,
+                        uint16_t x_ctb, uint16_t y_ctb, uint8_t depth)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+  cu_info *cur_cu = &cur_pic->cu_array[x_ctb + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)];
+  uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
+  uint8_t split_model = 0;
+  
+  //Absolute ctb
+  uint16_t abs_x_ctb = x_ctb + (encoder_state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
+  uint16_t abs_y_ctb = y_ctb + (encoder_state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
+
+  // Check for slice border FIXME
+  uint8_t border_x = ((encoder_state->encoder_control->in.width) < (abs_x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
+  uint8_t border_y = ((encoder_state->encoder_control->in.height) < (abs_y_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
+  uint8_t border_split_x = ((encoder_state->encoder_control->in.width)  < ((abs_x_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
+  uint8_t border_split_y = ((encoder_state->encoder_control->in.height) < ((abs_y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
+  uint8_t border = border_x | border_y; /*!< are we in any border CU */
+
+  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
+  if (depth != MAX_DEPTH) {
+    // Implisit split flag when on border
+    if (!border) {
+      // Get left and top block split_flags and if they are present and true, increase model number
+      if (x_ctb > 0 && GET_SPLITDATA(&(cur_pic->cu_array[x_ctb - 1 + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)]), depth) == 1) {
+        split_model++;
+      }
+
+      if (y_ctb > 0 && GET_SPLITDATA(&(cur_pic->cu_array[x_ctb + (y_ctb - 1) * (cur_pic->width_in_lcu << MAX_DEPTH)]), depth) == 1) {
+        split_model++;
+      }
+
+      cabac->ctx = &(cabac->ctx_split_flag_model[split_model]);
+      CABAC_BIN(cabac, split_flag, "SplitFlag");
+    }
+
+    if (split_flag || border) {
+      // Split blocks and remember to change x and y block positions
+      uint8_t change = 1<<(MAX_DEPTH-1-depth);
+      encode_coding_tree(encoder_state, x_ctb, y_ctb, depth + 1); // x,y
+
+      // TODO: fix when other half of the block would not be completely over the border
+      if (!border_x || border_split_x) {
+        encode_coding_tree(encoder_state, x_ctb + change, y_ctb, depth + 1);
+      }
+      if (!border_y || border_split_y) {
+        encode_coding_tree(encoder_state, x_ctb, y_ctb + change, depth + 1);
+      }
+      if (!border || (border_split_x && border_split_y)) {
+        encode_coding_tree(encoder_state, x_ctb + change, y_ctb + change, depth + 1);
+      }
+      return;
+    }
+  }
+
+
+
+    // Encode skip flag
+  if (encoder_state->global->slicetype != SLICE_I) {
+    int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
+    int ui;
+    int16_t num_cand = MRG_MAX_NUM_CANDS;
+    // Get left and top skipped flags and if they are present and true, increase context number
+    if (x_ctb > 0 && (&cur_pic->cu_array[x_ctb - 1 + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)])->skipped) {
+      ctx_skip++;
+    }
+
+    if (y_ctb > 0 && (&cur_pic->cu_array[x_ctb + (y_ctb - 1) * (cur_pic->width_in_lcu << MAX_DEPTH)])->skipped) {
+      ctx_skip++;
+    }
+
+    cabac->ctx = &(cabac->ctx_cu_skip_flag_model[ctx_skip]);
+    CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag");
+
+    // IF SKIP
+    if (cur_cu->skipped) {
+      if (num_cand > 1) {
+        for (ui = 0; ui < num_cand - 1; ui++) {
+          int32_t symbol = (ui != cur_cu->merge_idx);
+          if (ui == 0) {
+            cabac->ctx = &(cabac->ctx_cu_merge_idx_ext_model);
+            CABAC_BIN(cabac, symbol, "MergeIndex");
+          } else {
+            CABAC_BIN_EP(cabac,symbol,"MergeIndex");
+          }
+          if (symbol == 0) {
+            break;
+          }
+        }
+      }
+      return;
+    }
+  }
+
+  // ENDIF SKIP
+
+  // Prediction mode
+  if (encoder_state->global->slicetype != SLICE_I) {
+    cabac->ctx = &(cabac->ctx_cu_pred_mode_model);
+    CABAC_BIN(cabac, (cur_cu->type == CU_INTRA), "PredMode");
+  }
+
+  // part_mode
+  if (cur_cu->type == CU_INTRA) {
+    if (depth == MAX_DEPTH) {
+      cabac->ctx = &(cabac->ctx_part_size_model[0]);
+      if (cur_cu->part_size == SIZE_2Nx2N) {
+        CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
+      } else {
+        CABAC_BIN(cabac, 0, "part_mode NxN");
+      }
+    }
+  } else {
+    // TODO: Handle inter sizes other than 2Nx2N
+    cabac->ctx = &(cabac->ctx_part_size_model[0]);
+    CABAC_BIN(cabac, 1, "part_mode 2Nx2N");
+  }
+
+  //end partsize
+  if (cur_cu->type == CU_INTER) {
+    // FOR each part
+    // Mergeflag
+    int16_t num_cand = 0;
+    cabac->ctx = &(cabac->ctx_cu_merge_flag_ext_model);
+    CABAC_BIN(cabac, cur_cu->merged, "MergeFlag");
+    num_cand = MRG_MAX_NUM_CANDS;
+    if (cur_cu->merged) { //merge
+      if (num_cand > 1) {
+        int32_t ui;
+        for (ui = 0; ui < num_cand - 1; ui++) {
+          int32_t symbol = (ui != cur_cu->merge_idx);
+          if (ui == 0) {
+            cabac->ctx = &(cabac->ctx_cu_merge_idx_ext_model);
+            CABAC_BIN(cabac, symbol, "MergeIndex");
+          } else {
+            CABAC_BIN_EP(cabac,symbol,"MergeIndex");
+          }
+          if (symbol == 0) break;
+        }
+      }
+    } else {
+      uint32_t ref_list_idx;
+      /*
+      // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
+      if(cur_pic->slicetype == SLICE_B)
+      {
+        // Code Inter Dir
+        const UInt uiInterDir = pcCU->getInterDir( uiAbsPartIdx ) - 1;
+        const UInt uiCtx      = pcCU->getCtxInterDir( uiAbsPartIdx );
+        ContextModel *pCtx    = m_cCUInterDirSCModel.get( 0 );
+        if (pcCU->getPartitionSize(uiAbsPartIdx) == SIZE_2Nx2N || pcCU->getHeight(uiAbsPartIdx) != 8 )
+        {
+          m_pcBinIf->encodeBin( uiInterDir == 2 ? 1 : 0, *( pCtx + uiCtx ) );
+        }
+        if (uiInterDir < 2)
+        {
+          m_pcBinIf->encodeBin( uiInterDir, *( pCtx + 4 ) );
+        }
+      }
+      */
+
+      for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
+            //if(encoder_state->ref_idx_num[uiRefListIdx] > 0)
+            {
+          if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) {
+            if (encoder_state->global->ref->used_size != 1) { //encoder_state->ref_idx_num[uiRefListIdx] != 1)//NumRefIdx != 1)
+              // parseRefFrmIdx
+              int32_t ref_frame = cur_cu->inter.mv_ref;
+
+              cabac->ctx = &(cabac->ctx_cu_ref_pic_model[0]);
+              CABAC_BIN(cabac, (ref_frame != 0), "ref_frame_flag");
+
+              if (ref_frame > 0) {
+                int32_t i;
+                int32_t ref_num = encoder_state->global->ref->used_size - 2;
+
+                cabac->ctx = &(cabac->ctx_cu_ref_pic_model[1]);
+                ref_frame--;
+
+                for (i = 0; i < ref_num; ++i) {
+                  const uint32_t symbol = (i == ref_frame) ? 0 : 1;
+
+                  if (i == 0) {
+                    CABAC_BIN(cabac, symbol, "ref_frame_flag2");
+                  } else {
+                    CABAC_BIN_EP(cabac, symbol, "ref_frame_flag2");
+                  }
+                  if (symbol == 0) break;
+                }
+              }
+            }
+
+            if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ encoder_state->global->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
+              const int32_t mvd_hor = cur_cu->inter.mvd[0];
+              const int32_t mvd_ver = cur_cu->inter.mvd[1];
+              const int8_t hor_abs_gr0 = mvd_hor != 0;
+              const int8_t ver_abs_gr0 = mvd_ver != 0;
+              const uint32_t mvd_hor_abs = abs(mvd_hor);
+              const uint32_t mvd_ver_abs = abs(mvd_ver);
+
+              cabac->ctx = &(cabac->ctx_cu_mvd_model[0]);
+              CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
+              CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");
+
+              cabac->ctx = &(cabac->ctx_cu_mvd_model[1]);
+
+              if (hor_abs_gr0) {
+                CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
+              }
+
+              if (ver_abs_gr0) {
+                CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
+              }
+
+              if (hor_abs_gr0) {
+                if (mvd_hor_abs > 1) {
+                  cabac_write_ep_ex_golomb(cabac,mvd_hor_abs-2, 1);
+                }
+
+                CABAC_BIN_EP(cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor");
+              }
+
+              if (ver_abs_gr0) {
+                if (mvd_ver_abs > 1) {
+                  cabac_write_ep_ex_golomb(cabac,mvd_ver_abs-2, 1);
+                }
+
+                CABAC_BIN_EP(cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver");
+              }
+            }
+
+            // Signal which candidate MV to use
+            cabac_write_unary_max_symbol(cabac, cabac->ctx_mvp_idx_model, cur_cu->inter.mv_cand, 1,
+                                        AMVP_MAX_NUM_CANDS - 1);
+          }
+          }
+        } // for ref_list
+    } // if !merge
+
+    {
+      int cbf = (cbf_is_set(cur_cu->cbf.y, depth) ||
+                 cbf_is_set(cur_cu->cbf.u, depth) ||
+                 cbf_is_set(cur_cu->cbf.v, depth));
+
+      // Only need to signal coded block flag if not skipped or merged
+      // skip = no coded residual, merge = coded residual
+      if (!cur_cu->merged) {
+        cabac->ctx = &(cabac->ctx_cu_qt_root_cbf_model);
+        CABAC_BIN(cabac, cbf, "rqt_root_cbf");
+      }
+      // Code (possible) coeffs to bitstream
+
+      if (cbf) {
+        encode_transform_coeff(encoder_state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
+      }
+    }
+
+    // END for each part
+  } else if (cur_cu->type == CU_INTRA) {
+    uint8_t intra_pred_mode[4] = {
+      cur_cu->intra[0].mode, cur_cu->intra[1].mode,
+      cur_cu->intra[2].mode, cur_cu->intra[3].mode };
+      uint8_t intra_pred_mode_chroma = cur_cu->intra[0].mode_chroma;
+    int8_t intra_preds[4][3] = {{-1, -1, -1},{-1, -1, -1},{-1, -1, -1},{-1, -1, -1}};
+    int8_t mpm_preds[4] = {-1, -1, -1, -1};
+    int i, j;
+    uint32_t flag[4];
+    int num_pred_units = (cur_cu->part_size == SIZE_2Nx2N ? 1 : 4);
+
+    #if ENABLE_PCM == 1
+    // Code must start after variable initialization
+    cabac_encode_bin_trm(cabac, 0); // IPCMFlag == 0
+    #endif
+
+    // PREDINFO CODING
+    // If intra prediction mode is found from the predictors,
+    // it can be signaled with two EP's. Otherwise we can send
+    // 5 EP bins with the full predmode
+    for (j = 0; j < num_pred_units; ++j) {
+      static const vector2d offset[4] = {{0,0},{1,0},{0,1},{1,1}};
+      cu_info *left_cu = 0;
+      cu_info *above_cu = 0;
+
+      if (x_ctb > 0) {
+        left_cu = &cur_pic->cu_array[x_ctb - 1 + y_ctb * (cur_pic->width_in_lcu << MAX_DEPTH)];
+      }
+      // Don't take the above CU across the LCU boundary.
+      if (y_ctb > 0 && (y_ctb & 7) != 0) {
+        above_cu = &cur_pic->cu_array[x_ctb + (y_ctb - 1) * (cur_pic->width_in_lcu << MAX_DEPTH)];
+      }
+
+      intra_get_dir_luma_predictor((x_ctb<<3) + (offset[j].x<<2),
+                                   (y_ctb<<3) + (offset[j].y<<2),
+                                   intra_preds[j], cur_cu,
+                                   left_cu, above_cu);
+      for (i = 0; i < 3; i++) {
+        if (intra_preds[j][i] == intra_pred_mode[j]) {
+          mpm_preds[j] = (int8_t)i;
+          break;
+        }
+      }
+      flag[j] = (mpm_preds[j] == -1) ? 0 : 1;
+    }
+
+    cabac->ctx = &(cabac->ctx_intra_mode_model);
+    for (j = 0; j < num_pred_units; ++j) {
+      CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag");
+    }
+
+    for (j = 0; j < num_pred_units; ++j) {
+      // Signal index of the prediction mode in the prediction list.
+      if (flag[j]) {
+        CABAC_BIN_EP(cabac, (mpm_preds[j] == 0 ? 0 : 1), "mpm_idx");
+        if (mpm_preds[j] != 0) {
+          CABAC_BIN_EP(cabac, (mpm_preds[j] == 1 ? 0 : 1), "mpm_idx");
+        }
+      } else {
+        // Signal the actual prediction mode.
+        int32_t tmp_pred = intra_pred_mode[j];
+
+        // Sort prediction list from lowest to highest.
+        if (intra_preds[j][0] > intra_preds[j][1]) SWAP(intra_preds[j][0], intra_preds[j][1], int8_t);
+        if (intra_preds[j][0] > intra_preds[j][2]) SWAP(intra_preds[j][0], intra_preds[j][2], int8_t);
+        if (intra_preds[j][1] > intra_preds[j][2]) SWAP(intra_preds[j][1], intra_preds[j][2], int8_t);
+
+        // Reduce the index of the signaled prediction mode according to the
+        // prediction list, as it has been already signaled that it's not one
+        // of the prediction modes.
+        for (i = 2; i >= 0; i--) {
+          tmp_pred = (tmp_pred > intra_preds[j][i] ? tmp_pred - 1 : tmp_pred);
+        }
+
+        CABAC_BINS_EP(cabac, tmp_pred, 5, "rem_intra_luma_pred_mode");
+      }
+    }
+
+    {  // start intra chroma pred mode coding
+      unsigned pred_mode = 5;
+      unsigned chroma_pred_modes[4] = {0, 26, 10, 1};
+
+      if (intra_pred_mode_chroma == intra_pred_mode[0]) {
+        pred_mode = 4;
+      } else if (intra_pred_mode_chroma == 34) {
+        // Angular 34 mode is possible only if intra pred mode is one of the
+        // possible chroma pred modes, in which case it is signaled with that
+        // duplicate mode.
+        for (i = 0; i < 4; ++i) {
+          if (intra_pred_mode[0] == chroma_pred_modes[i]) pred_mode = i;
+        }
+      } else {
+        for (i = 0; i < 4; ++i) {
+          if (intra_pred_mode_chroma == chroma_pred_modes[i]) pred_mode = i;
+        }
+      }
+
+      // pred_mode == 5 mean intra_pred_mode_chroma is something that can't
+      // be coded.
+      assert(pred_mode != 5);
+
+      /**
+       * Table 9-35 - Binarization for intra_chroma_pred_mode
+       *   intra_chroma_pred_mode  bin_string
+       *                        4           0
+       *                        0         100
+       *                        1         101
+       *                        2         110
+       *                        3         111
+       * Table 9-37 - Assignment of ctxInc to syntax elements with context coded bins
+       *   intra_chroma_pred_mode[][] = 0, bypass, bypass
+       */
+      cabac->ctx = &(cabac->ctx_chroma_pred_model[0]);
+      if (pred_mode == 4) {
+        CABAC_BIN(cabac, 0, "intra_chroma_pred_mode");
+      } else {
+        CABAC_BIN(cabac, 1, "intra_chroma_pred_mode");
+        CABAC_BINS_EP(cabac, pred_mode, 2, "intra_chroma_pred_mode");
+      }
+    }  // end intra chroma pred mode coding
+
+    encode_transform_coeff(encoder_state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
+  }
+
+    #if ENABLE_PCM == 1
+  // Code IPCM block
+  if (cur_cu->type == CU_PCM) {
+    cabac_encode_bin_trm(cabac, 1); // IPCMFlag == 1
+      cabac_finish(cabac);
+      bitstream_align(cabac.stream);
+    // PCM sample
+      {
+      unsigned y, x;
+
+      pixel *base_y = &cur_pic->y_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))    + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH))) * encoder->in.width];
+      pixel *base_u = &cur_pic->u_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
+      pixel *base_v = &cur_pic->v_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
+
+      // Luma
+      for (y = 0; y < LCU_WIDTH >> depth; y++) {
+        for (x = 0; x < LCU_WIDTH >> depth; x++) {
+          bitstream_put(cabac.stream, base_y[x + y * encoder->in.width], 8);
+          }
+        }
+
+      // Chroma
+      if (encoder->in.video_format != FORMAT_400) {
+        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
+          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
+            bitstream_put(cabac.stream, base_u[x + y * (encoder->in.width >> 1)], 8);
+          }
+        }
+        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
+          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
+            bitstream_put(cabac.stream, base_v[x + y * (encoder->in.width >> 1)], 8);
+          }
+        }
+      }
+    }
+    // end PCM sample
+      cabac_start(cabac);
+  } // end Code IPCM block
+#endif /* END ENABLE_PCM */
+  else { /* Should not happend */
+    printf("UNHANDLED TYPE!\r\n");
+    assert(0);
+    exit(1);
+  }
+
+   /* end prediction unit */
+  /* end coding_unit */
+}
+
+
+coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth)
+{
+  // Scan mode is diagonal, except for 4x4+8x8 luma and 4x4 chroma, where:
+  // - angular 6-14 = vertical
+  // - angular 22-30 = horizontal
+  if (cu_type == CU_INTRA && depth >= 3) {
+    if (intra_mode >= 6 && intra_mode <= 14) {
+      return SCAN_VER;
+    } else if (intra_mode >= 22 && intra_mode <= 30) {
+      return SCAN_HOR;
+    }
+  }
+
+  return SCAN_DIAG;
+}
+
+
+static void encode_transform_unit(encoder_state * const encoder_state,
+                                  int x_pu, int y_pu, int depth)
+{
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+  uint8_t width = LCU_WIDTH >> depth;
+  uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
+
+  int x_cu = x_pu / 2;
+  int y_cu = y_pu / 2;
+  cu_info *cur_cu = &cur_pic->cu_array[x_cu + y_cu * (cur_pic->width_in_lcu << MAX_DEPTH)];
+
+  coefficient coeff_y[LCU_WIDTH*LCU_WIDTH+1];
+  coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
+  coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
+  int32_t coeff_stride = cur_pic->width;
+
+  int8_t scan_idx = get_scan_order(cur_cu->type, cur_cu->intra[PU_INDEX(x_pu, y_pu)].mode, depth);
+
+  int cbf_y = cbf_is_set(cur_cu->cbf.y, depth + PU_INDEX(x_pu, y_pu));
+
+  if (cbf_y) {
+    int x = x_pu * (LCU_WIDTH >> MAX_PU_DEPTH);
+    int y = y_pu * (LCU_WIDTH >> MAX_PU_DEPTH);
+    coefficient *orig_pos = &cur_pic->coeff_y[x + y * cur_pic->width];
+    for (y = 0; y < width; y++) {
+      for (x = 0; x < width; x++) {
+        coeff_y[x+y*width] = orig_pos[x];
+      }
+      orig_pos += coeff_stride;
+    }
+  }
+
+  // CoeffNxN
+  // Residual Coding
+  if (cbf_y) {
+    encode_coeff_nxn(encoder_state, coeff_y, width, 0, scan_idx, cur_cu->intra[PU_INDEX(x_pu, y_pu)].tr_skip);
+  }
+
+  if (depth == MAX_DEPTH + 1 && !(x_pu % 2 && y_pu % 2)) {
+    // For size 4x4 luma transform the corresponding chroma transforms are
+    // also of size 4x4 covering 8x8 luma pixels. The residual is coded
+    // in the last transform unit so for the other ones, don't do anything.
+    return;
+  }
+
+  if (cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth)) {
+    int x, y;
+    coefficient *orig_pos_u, *orig_pos_v;
+
+    if (depth <= MAX_DEPTH) {
+      x = x_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
+      y = y_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
+    } else {
+      // for 4x4 select top left pixel of the CU.
+      x = x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
+      y = y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
+    }
+    orig_pos_u = &cur_pic->coeff_u[x + y * (cur_pic->width >> 1)];
+    orig_pos_v = &cur_pic->coeff_v[x + y * (cur_pic->width >> 1)];
+    for (y = 0; y < (width_c); y++) {
+      for (x = 0; x < (width_c); x++) {
+        coeff_u[x+y*(width_c)] = orig_pos_u[x];
+        coeff_v[x+y*(width_c)] = orig_pos_v[x];
+      }
+      orig_pos_u += coeff_stride>>1;
+      orig_pos_v += coeff_stride>>1;
+    }
+
+    scan_idx = get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth);
+
+    if (cbf_is_set(cur_cu->cbf.u, depth)) {
+      encode_coeff_nxn(encoder_state, coeff_u, width_c, 2, scan_idx, 0);
+    }
+
+    if (cbf_is_set(cur_cu->cbf.v, depth)) {
+      encode_coeff_nxn(encoder_state, coeff_v, width_c, 2, scan_idx, 0);
+    }
+  }
+}
+
+/**
+ * \param encoder
+ * \param x_pu            Prediction units' x coordinate.
+ * \param y_pu            Prediction units' y coordinate.
+ * \param depth           Depth from LCU.
+ * \param tr_depth        Depth from last CU.
+ * \param parent_coeff_u  What was signaled at previous level for cbf_cb.
+ * \param parent_coeff_v  What was signlaed at previous level for cbf_cr.
+ */
+void encode_transform_coeff(encoder_state * const encoder_state, int32_t x_pu,int32_t y_pu,
+                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  int32_t x_cu = x_pu / 2;
+  int32_t y_cu = y_pu / 2;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+  cu_info *cur_cu = &cur_pic->cu_array[x_cu + y_cu * (cur_pic->width_in_lcu << MAX_DEPTH)];
+
+  // NxN signifies implicit transform split at the first transform level.
+  // There is a similar implicit split for inter, but it is only used when
+  // transform hierarchy is not in use.
+  int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN);
+
+  // The implicit split by intra NxN is not counted towards max_tr_depth.
+  int max_tr_depth = (cur_cu->type == CU_INTRA ? TR_DEPTH_INTRA + intra_split_flag : TR_DEPTH_INTER);
+
+  int8_t split = (cur_cu->tr_depth > depth);
+
+  const int cb_flag_y = cbf_is_set(cur_cu->cbf.y, depth + PU_INDEX(x_pu, y_pu));
+  const int cb_flag_u = cbf_is_set(cur_cu->cbf.u, depth);
+  const int cb_flag_v = cbf_is_set(cur_cu->cbf.v, depth);
+
+  // The split_transform_flag is not signaled when:
+  // - transform size is greater than 32 (depth == 0)
+  // - transform size is 4 (depth == MAX_PU_DEPTH)
+  // - transform depth is max
+  // - cu is intra NxN and it's the first split
+  if (depth > 0 &&
+      depth < MAX_PU_DEPTH &&
+      tr_depth < max_tr_depth &&
+      !(intra_split_flag && tr_depth == 0))
+  {
+    cabac->ctx = &(cabac->ctx_trans_subdiv_model[5 - ((g_convert_to_bit[LCU_WIDTH] + 2) - depth)]);
+    CABAC_BIN(cabac, split, "split_transform_flag");
+  }
+
+  // Chroma cb flags are not signaled when one of the following:
+  // - transform size is 4 (2x2 chroma transform doesn't exist)
+  // - they have already been signaled to 0 previously
+  // When they are not present they are inferred to be 0, except for size 4
+  // when the flags from previous level are used.
+  if (depth < MAX_PU_DEPTH) {
+    cabac->ctx = &(cabac->ctx_qt_cbf_model_chroma[tr_depth]);
+    if (tr_depth == 0 || parent_coeff_u) {
+      CABAC_BIN(cabac, cb_flag_u, "cbf_cb");
+    }
+    if (tr_depth == 0 || parent_coeff_v) {
+      CABAC_BIN(cabac, cb_flag_v, "cbf_cr");
+    }
+  }
+
+  if (split) {
+    uint8_t pu_offset = 1 << (MAX_PU_DEPTH - (depth + 1));
+    encode_transform_coeff(encoder_state, x_pu, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(encoder_state, x_pu + pu_offset, y_pu,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(encoder_state, x_pu, y_pu + pu_offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(encoder_state, x_pu + pu_offset, y_pu + pu_offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    return;
+  }
+
+  // Luma coded block flag is signaled when one of the following:
+  // - prediction mode is intra
+  // - transform depth > 0
+  // - we have chroma coefficients at this level
+  // When it is not present, it is inferred to be 1.
+  if(cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) {
+      cabac->ctx = &(cabac->ctx_qt_cbf_model_luma[!tr_depth]);
+      CABAC_BIN(cabac, cb_flag_y, "cbf_luma");
+  }
+
+  if (cb_flag_y | cb_flag_u | cb_flag_v) {
+    encode_transform_unit(encoder_state, x_pu, y_pu, depth);
+  }
+}
+
+void encode_coeff_nxn(encoder_state * const encoder_state, coefficient *coeff, uint8_t width,
+                      uint8_t type, int8_t scan_mode, int8_t tr_skip)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  cabac_data * const cabac = &encoder_state->cabac;
+  int c1 = 1;
+  uint8_t last_coeff_x = 0;
+  uint8_t last_coeff_y = 0;
+  int32_t i;
+  uint32_t sig_coeffgroup_flag[64];
+
+  uint32_t num_nonzero = 0;
+  int32_t scan_pos_last = -1;
+  int32_t pos_last = 0;
+  int32_t shift   = 4>>1;
+  int8_t be_valid = ENABLE_SIGN_HIDING;
+  int32_t scan_pos_sig;
+  int32_t last_scan_set;
+  uint32_t go_rice_param = 0;
+  uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
+
+  // CONSTANTS
+  const uint32_t num_blk_side    = width >> shift;
+  const uint32_t log2_block_size = g_convert_to_bit[width] + 2;
+  const uint32_t *scan           =
+    g_sig_last_scan[scan_mode][log2_block_size - 1];
+  const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
+
+  // Init base contexts according to block type
+  cabac_ctx *base_coeff_group_ctx = &(cabac->ctx_cu_sig_coeff_group_model[type]);
+  cabac_ctx *baseCtx           = (type == 0) ? &(cabac->ctx_cu_sig_model_luma[0]) :
+                                 &(cabac->ctx_cu_sig_model_chroma[0]);
+  memset(sig_coeffgroup_flag,0,sizeof(uint32_t)*64);
+
+  // Count non-zero coeffs
+  for (i = 0; i < width * width; i++) {
+    if (coeff[i] != 0) {
+      num_nonzero++;
+    }
+  }
+
+  // Transforms with no non-zero coefficients are indicated with CBFs.
+  assert(num_nonzero != 0);
+
+  // transform skip flag
+  if(width == 4 && encoder->trskip_enable) {
+    cabac->ctx = (type == 0) ? &(cabac->ctx_transform_skip_model_luma) : &(cabac->ctx_transform_skip_model_chroma);
+    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
+  }
+
+  scan_pos_last = -1;
+
+  // Significance mapping
+  while (num_nonzero > 0) {
+    pos_last = scan[++scan_pos_last];
+#define POSY (pos_last >> log2_block_size)
+#define POSX (pos_last - ( POSY << log2_block_size ))
+
+    if (coeff[pos_last] != 0) {
+      sig_coeffgroup_flag[(num_blk_side * (POSY >> shift) + (POSX >> shift))] = 1;
+    }
+
+    num_nonzero -= (coeff[pos_last] != 0) ? 1 : 0;
+    #undef POSY
+    #undef POSX
+  }
+
+  last_coeff_x = pos_last & (width - 1);
+  last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
+
+  // Code last_coeff_x and last_coeff_y
+  encode_last_significant_xy(encoder_state, last_coeff_x, last_coeff_y, width, width,
+                             type, scan_mode);
+
+  scan_pos_sig  = scan_pos_last;
+  last_scan_set = (scan_pos_last >> 4);
+
+  // significant_coeff_flag
+  for (i = last_scan_set; i >= 0; i--) {
+    int32_t sub_pos        = i << 4; // LOG2_SCAN_SET_SIZE;
+    int32_t abs_coeff[16];
+    int32_t cg_blk_pos     = scan_cg[i];
+    int32_t cg_pos_y       = cg_blk_pos / num_blk_side;
+    int32_t cg_pos_x       = cg_blk_pos - (cg_pos_y * num_blk_side);
+
+    uint32_t coeff_signs   = 0;
+    int32_t last_nz_pos_in_cg = -1;
+    int32_t first_nz_pos_in_cg = 16;
+    int32_t num_non_zero = 0;
+    go_rice_param = 0;
+
+    if (scan_pos_sig == scan_pos_last) {
+      abs_coeff[0] = abs(coeff[pos_last]);
+      coeff_signs  = (coeff[pos_last] < 0);
+      num_non_zero = 1;
+      last_nz_pos_in_cg  = scan_pos_sig;
+      first_nz_pos_in_cg = scan_pos_sig;
+      scan_pos_sig--;
+    }
+
+    if (i == last_scan_set || i == 0) {
+      sig_coeffgroup_flag[cg_blk_pos] = 1;
+    } else {
+      uint32_t sig_coeff_group   = (sig_coeffgroup_flag[cg_blk_pos] != 0);
+      uint32_t ctx_sig  = context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
+                                                      cg_pos_y, width);
+      cabac->ctx = &base_coeff_group_ctx[ctx_sig];
+      CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag");
+    }
+
+    if (sig_coeffgroup_flag[cg_blk_pos]) {
+      int32_t pattern_sig_ctx = context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
+                                                             cg_pos_x, cg_pos_y, width);
+
+      for (; scan_pos_sig >= sub_pos; scan_pos_sig--) {
+        blk_pos = scan[scan_pos_sig];
+        pos_y   = blk_pos >> log2_block_size;
+        pos_x   = blk_pos - (pos_y << log2_block_size);
+        sig    = (coeff[blk_pos] != 0) ? 1 : 0;
+
+        if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) {
+          ctx_sig  = context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
+                                             log2_block_size, type);
+          cabac->ctx = &baseCtx[ctx_sig];
+          CABAC_BIN(cabac, sig, "sig_coeff_flag");
+        }
+
+        if (sig) {
+          abs_coeff[num_non_zero] = abs(coeff[blk_pos]);
+          coeff_signs              = 2 * coeff_signs + (coeff[blk_pos] < 0);
+          num_non_zero++;
+
+          if (last_nz_pos_in_cg == -1) {
+            last_nz_pos_in_cg = scan_pos_sig;
+          }
+
+          first_nz_pos_in_cg  = scan_pos_sig;
+        }
+      }
+    } else {
+      scan_pos_sig = sub_pos - 1;
+    }
+
+    if (num_non_zero > 0) {
+      int8_t sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >=
+                            4 /*SBH_THRESHOLD*/) ? 1 : 0;
+      uint32_t ctx_set  = (i > 0 && type == 0) ? 2 : 0;
+      cabac_ctx *base_ctx_mod;
+      int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2;
+
+      if (c1 == 0) {
+        ctx_set++;
+      }
+
+      c1 = 1;
+
+      base_ctx_mod     = (type == 0) ? &(cabac->ctx_cu_one_model_luma[4 * ctx_set]) :
+                         &(cabac->ctx_cu_one_model_chroma[4 * ctx_set]);
+      num_c1_flag      = MIN(num_non_zero, C1FLAG_NUMBER);
+      first_c2_flag_idx = -1;
+
+      for (idx = 0; idx < num_c1_flag; idx++) {
+        uint32_t symbol = (abs_coeff[idx] > 1) ? 1 : 0;
+        cabac->ctx = &base_ctx_mod[c1];
+        CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag");
+
+        if (symbol) {
+          c1 = 0;
+
+          if (first_c2_flag_idx == -1) {
+            first_c2_flag_idx = idx;
+          }
+        } else if ((c1 < 3) && (c1 > 0)) {
+          c1++;
+        }
+      }
+
+      if (c1 == 0) {
+        base_ctx_mod = (type == 0) ? &(cabac->ctx_cu_abs_model_luma[ctx_set]) :
+                       &(cabac->ctx_cu_abs_model_chroma[ctx_set]);
+
+        if (first_c2_flag_idx != -1) {
+          uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0;
+          cabac->ctx      = &base_ctx_mod[0];
+          CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag");
+        }
+      }
+
+      if (be_valid && sign_hidden) {
+        CABAC_BINS_EP(cabac, (coeff_signs >> 1), (num_non_zero - 1), "coeff_sign_flag");
+      } else {
+        CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag");
+      }
+
+      if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) {
+        first_coeff2 = 1;
+
+        for (idx = 0; idx < num_non_zero; idx++) {
+          int32_t base_level  = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1;
+
+          if (abs_coeff[idx] >= base_level) {
+            cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
+
+            if (abs_coeff[idx] > 3 * (1 << go_rice_param)) {
+              go_rice_param = MIN(go_rice_param + 1, 4);
+            }
+          }
+
+          if (abs_coeff[idx] >= 2) {
+            first_coeff2 = 0;
+          }
+        }
+      }
+    }
+  }
+}
+
+/*!
+ \brief Encode (X,Y) position of the last significant coefficient
+ \param lastpos_x X component of last coefficient
+ \param lastpos_y Y component of last coefficient
+ \param width  Block width
+ \param height Block height
+ \param type plane type / luminance or chrominance
+ \param scan scan type (diag, hor, ver)
+
+ This method encodes the X and Y component within a block of the last significant coefficient.
+*/
+void encode_last_significant_xy(encoder_state * const encoder_state,
+                                uint8_t lastpos_x, uint8_t lastpos_y,
+                                uint8_t width, uint8_t height,
+                                uint8_t type, uint8_t scan)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  uint8_t offset_x  = type?0:((TOBITS(width)*3) + ((TOBITS(width)+1)>>2)),offset_y = offset_x;
+  uint8_t shift_x   = type?(TOBITS(width)):((TOBITS(width)+3)>>2), shift_y = shift_x;
+  int group_idx_x;
+  int group_idx_y;
+  int last_x,last_y,i;
+  cabac_ctx *base_ctx_x = (type ? cabac->ctx_cu_ctx_last_x_chroma : cabac->ctx_cu_ctx_last_x_luma);
+  cabac_ctx *base_ctx_y = (type ? cabac->ctx_cu_ctx_last_y_chroma : cabac->ctx_cu_ctx_last_y_luma);
+
+  if (scan == SCAN_VER) {
+    SWAP( lastpos_x, lastpos_y,uint8_t );
+  }
+
+  group_idx_x   = g_group_idx[lastpos_x];
+  group_idx_y   = g_group_idx[lastpos_y];
+
+  // Last X binarization
+  for (last_x = 0; last_x < group_idx_x ; last_x++) {
+    cabac->ctx = &base_ctx_x[offset_x + (last_x >> shift_x)];
+    CABAC_BIN(cabac,1,"last_sig_coeff_x_prefix");
+  }
+
+  if (group_idx_x < g_group_idx[width - 1]) {
+    cabac->ctx = &base_ctx_x[offset_x + (last_x >> shift_x)];
+    CABAC_BIN(cabac,0,"last_sig_coeff_x_prefix");
+  }
+
+  // Last Y binarization
+  for (last_y = 0; last_y < group_idx_y ; last_y++) {
+    cabac->ctx = &base_ctx_y[offset_y + (last_y >> shift_y)];
+    CABAC_BIN(cabac,1,"last_sig_coeff_y_prefix");
+  }
+
+  if (group_idx_y < g_group_idx[height - 1]) {
+    cabac->ctx = &base_ctx_y[offset_y + (last_y >> shift_y)];
+    CABAC_BIN(cabac,0,"last_sig_coeff_y_prefix");
+  }
+
+  // Last X
+  if (group_idx_x > 3) {
+    lastpos_x -= g_min_in_group[group_idx_x];
+
+    for (i = ((group_idx_x - 2) >> 1) - 1; i >= 0; i--) {
+      CABAC_BIN_EP(cabac,(lastpos_x>>i) & 1,"last_sig_coeff_x_suffix");
+    }
+  }
+
+  // Last Y
+  if (group_idx_y > 3) {
+    lastpos_y -= g_min_in_group[group_idx_y];
+
+    for (i = ((group_idx_y - 2) >> 1) - 1; i >= 0; i--) {
+      CABAC_BIN_EP(cabac,(lastpos_y>>i) & 1,"last_sig_coeff_y_suffix");
+    }
+  }
+
+  // end LastSignificantXY
+}
diff --git a/src/encoderstate.h b/src/encoderstate.h
new file mode 100644
index 00000000..515d23d2
--- /dev/null
+++ b/src/encoderstate.h
@@ -0,0 +1,204 @@
+#ifndef ENCODERSTATE_H_
+#define ENCODERSTATE_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ * \brief
+ */
+
+#include "global.h"
+
+#include "encoder.h"
+#include "picture.h"
+#include "bitstream.h"
+#include "cabac.h"
+#include "config.h"
+#include "tables.h"
+#include "scalinglist.h"
+#include "threadqueue.h"
+
+typedef enum {
+  ENCODER_STATE_TYPE_INVALID = 'i',
+  ENCODER_STATE_TYPE_MAIN = 'M',
+  ENCODER_STATE_TYPE_SLICE = 'S',
+  ENCODER_STATE_TYPE_TILE = 'T',
+  ENCODER_STATE_TYPE_WAVEFRONT_ROW = 'W',
+} encoder_state_type;
+
+
+
+typedef struct {
+  double cur_lambda_cost;
+  
+  int32_t frame;
+  int32_t poc; /*!< \brief picture order count */
+  
+  int8_t QP;   //!< \brief Quantization parameter
+  
+  //Current picture available references
+  picture_list *ref;
+  int8_t ref_list;
+  //int8_t ref_idx_num[2];
+  
+  int is_radl_frame;
+  uint8_t pictype;
+  uint8_t slicetype;
+  
+} encoder_state_config_global;
+
+typedef struct {
+  //Current picture to encode
+  picture *cur_pic;
+  
+  int32_t id;
+  
+  //Tile: offset in LCU for current encoder_state in global coordinates
+  int32_t lcu_offset_x;
+  int32_t lcu_offset_y;
+  
+  //Position of the first element in tile scan in global coordinates
+  int32_t lcu_offset_in_ts;
+  
+  //Buffer for search
+  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
+  yuv_t *hor_buf_search;
+  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
+  yuv_t *ver_buf_search;
+  
+  yuv_t *hor_buf_before_sao;
+  yuv_t *ver_buf_before_sao;
+  
+  //Job pointers for wavefronts
+  threadqueue_job **wf_jobs;
+} encoder_state_config_tile;
+
+typedef struct {
+  int32_t id;
+  
+  //Global coordinates
+  int32_t start_in_ts;
+  int32_t end_in_ts;
+  
+  //Global coordinates
+  int32_t start_in_rs;
+  int32_t end_in_rs;
+} encoder_state_config_slice;
+
+typedef struct {
+  //Row in tile coordinates of the wavefront
+  int32_t lcu_offset_y;
+} encoder_state_config_wfrow;
+
+typedef struct lcu_order_element {
+  //This it used for leaf of the encoding tree. All is relative to the tile.
+  int id;
+  int index;
+  struct encoder_state *encoder_state;
+  vector2d position;
+  vector2d position_px; //Top-left
+  vector2d size;
+  int first_column;
+  int first_row;
+  int last_column;
+  int last_row;
+  
+  struct lcu_order_element *above;
+  struct lcu_order_element *below;
+  struct lcu_order_element *left;
+  struct lcu_order_element *right;
+} lcu_order_element;
+
+typedef struct encoder_state {
+  const encoder_control *encoder_control;
+  encoder_state_type type;
+
+  //List of children, the last item of this list is a pseudo-encoder with encoder_control = NULL
+  //Use for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+  struct encoder_state *children;
+  struct encoder_state *parent;
+  
+  encoder_state_config_global *global;
+  encoder_state_config_tile   *tile;
+  encoder_state_config_slice  *slice;
+  encoder_state_config_wfrow  *wfrow;
+  
+  int is_leaf; //A leaf encoder state is one which should encode LCUs...
+  lcu_order_element *lcu_order;
+  uint32_t lcu_order_count;
+  
+  bitstream stream;
+  cabac_data cabac;
+} encoder_state;
+
+int encoder_state_init(encoder_state * child_state, encoder_state * parent_state);
+void encoder_state_finalize(encoder_state *encoder_state);
+void encoder_state_init_lambda(encoder_state *encoder_state);
+
+void encode_one_frame(encoder_state *encoder_state);
+int read_one_frame(FILE* file, const encoder_state *encoder);
+
+void encoder_next_frame(encoder_state *encoder_state);
+
+void encode_seq_parameter_set(encoder_state *encoder);
+void encode_pic_parameter_set(encoder_state *encoder);
+void encode_vid_parameter_set(encoder_state *encoder);
+void encode_slice_header(encoder_state * encoder);
+void encode_access_unit_delimiter(encoder_state *encoder);
+void encode_prefix_sei_version(encoder_state *encoder);
+void encode_coding_tree(encoder_state *encoder, uint16_t x_ctb,
+                        uint16_t y_ctb, uint8_t depth);
+
+void encode_last_significant_xy(encoder_state *encoder,
+                                uint8_t lastpos_x, uint8_t lastpos_y,
+                                uint8_t width, uint8_t height,
+                                uint8_t type, uint8_t scan);
+void encode_coeff_nxn(encoder_state *encoder, int16_t *coeff, uint8_t width,
+                      uint8_t type, int8_t scan_mode, int8_t tr_skip);
+void encode_transform_coeff(encoder_state *encoder_state, int32_t x_cu, int32_t y_cu,
+                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
+void encode_block_residual(const encoder_control * const encoder,
+                           uint16_t x_ctb, uint16_t y_ctb, uint8_t depth);
+
+coeff_scan_order_t get_scan_order(int8_t cu_type, int intra_mode, int depth);
+
+static const uint8_t g_group_idx[32] = {
+  0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
+  6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
+  8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
+  9, 9 };
+
+static const uint8_t g_min_in_group[10] = {
+  0, 1, 2, 3, 4, 6, 8, 12, 16, 24 };
+
+
+#define C1FLAG_NUMBER 8 // maximum number of largerThan1 flag coded in one chunk
+#define C2FLAG_NUMBER 1 // maximum number of largerThan2 flag coded in one chunk
+
+//Get the data for vertical buffer position at the left of LCU identified by the position in pixel
+#define OFFSET_VER_BUF(position_x, position_y, cur_pic, i) ((position_y) + i + ((position_x)/LCU_WIDTH - 1) * (cur_pic)->height)
+#define OFFSET_VER_BUF_C(position_x, position_y, cur_pic, i) ((position_y/2) + i + ((position_x)/LCU_WIDTH - 1) * (cur_pic)->height / 2)
+
+//Get the data for horizontal buffer position at the top of LCU identified by the position in pixel
+#define OFFSET_HOR_BUF(position_x, position_y, cur_pic, i) ((position_x) + i + ((position_y)/LCU_WIDTH - 1) * (cur_pic)->width)
+#define OFFSET_HOR_BUF_C(position_x, position_y, cur_pic, i) ((position_x/2) + i + ((position_y)/LCU_WIDTH - 1) * (cur_pic)->width / 2)
+  
+
+#endif //ENCODERSTATE_H_
diff --git a/src/filter.h b/src/filter.h
index 61606c50..3f7ee130 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -27,6 +27,7 @@
 #include "global.h"
 
 #include "encoder.h"
+#include "encoderstate.h"
 
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/inter.h b/src/inter.h
index d02749aa..c0df8bb1 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -28,6 +28,7 @@
 
 #include "picture.h"
 #include "encoder.h"
+#include "encoderstate.h"
 
 
 void inter_set_block(picture* pic,uint32_t x_cu, uint32_t y_cu, uint8_t depth, cu_info *cur_cu);
diff --git a/src/intra.h b/src/intra.h
index 89369011..7bfe1a40 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -28,6 +28,7 @@
 
 #include "picture.h"
 #include "encoder.h"
+#include "encoderstate.h"
 
 void intra_set_block_mode(picture* pic,uint32_t x_ctb, uint32_t y_ctb, uint8_t depth, uint8_t mode, uint8_t part_mode);
 
diff --git a/src/rdo.h b/src/rdo.h
index 4a118eed..fd073d00 100644
--- a/src/rdo.h
+++ b/src/rdo.h
@@ -27,6 +27,7 @@
 #include "global.h"
 
 #include "encoder.h"
+#include "encoderstate.h"
 
 
 typedef struct
diff --git a/src/sao.h b/src/sao.h
index 93fd7967..35564634 100644
--- a/src/sao.h
+++ b/src/sao.h
@@ -27,6 +27,7 @@
 #include "global.h"
 #include "picture.h"
 #include "encoder.h"
+#include "encoderstate.h"
 #include "math.h"
 
 typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
diff --git a/src/search.h b/src/search.h
index 667f7667..26fb7916 100644
--- a/src/search.h
+++ b/src/search.h
@@ -27,6 +27,7 @@
 #include "global.h"
 
 #include "encoder.h"
+#include "encoderstate.h"
 #include "picture.h"
 
 
diff --git a/src/transform.h b/src/transform.h
index 82ff3969..7ee30735 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -27,6 +27,7 @@
 #include "global.h"
 
 #include "encoder.h"
+#include "encoderstate.h"
 
 #include <math.h>
 

From 3d07f8cc849935dd1143396a079c5285f8e486b8 Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 14:25:16 +0200
Subject: [PATCH 08/15] encoderstate refactor

---
 src/encoderstate.c | 1552 ++++++++++++++++++++++----------------------
 src/encoderstate.h |    7 +-
 2 files changed, 791 insertions(+), 768 deletions(-)

diff --git a/src/encoderstate.c b/src/encoderstate.c
index 54a26979..7640aa3d 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -51,7 +51,7 @@ static void encode_sao(encoder_state *encoder,
                        sao_info *sao_luma, sao_info *sao_chroma);
 
 static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state);
-static void worker_encoder_state_write_bitstream_leaf(void * opaque);
+static void encoder_state_worker_write_bitstream_leaf(void * opaque);
 
 /*!
   \brief Initializes lambda-value for current QP
@@ -82,6 +82,7 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
   encoder_state->global->cur_lambda_cost = lambda;
 }
 
+/******  BEGIN Functions to obtain geometry information from LCU  ******/
 static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
   int i;
   assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
@@ -195,7 +196,10 @@ static int lcu_in_last_column(const encoder_state * const encoder_state, int lcu
   return 0;
 }
 
+/******  END Functions to obtain geometry information from LCU  ******/
 
+
+/******  BEGIN Constructors/destructors  ******/
 static int encoder_state_config_global_init(encoder_state * const encoder_state) {
   encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
   if(!encoder_state->global->ref) {
@@ -848,16 +852,776 @@ void encoder_state_finalize(encoder_state * const encoder_state) {
   bitstream_finalize(&encoder_state->stream);
 }
 
+/******  END Constructors/destructors  ******/
 
-static void encoder_state_clear_refs(encoder_state *encoder_state) {
-  //FIXME: Do we need to handle children? At present they all share the same global
-  while (encoder_state->global->ref->used_size) {
-    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
+/******  BEGIN Functions writing bitstream parts ******/
+
+static void encoder_state_write_bitstream_access_unit_delimiter(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
+                   : encoder_state->global->slicetype == SLICE_P ? 1
+                   :                                             2;
+  WRITE_U(stream, pic_type, 3, "pic_type");
+}
+
+static void encoder_state_write_bitstream_aud(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  encoder_state_write_bitstream_access_unit_delimiter(encoder_state);
+  nal_write(stream, AUD_NUT, 0, 1);
+  bitstream_align(stream);
+}
+
+static void encoder_state_write_bitstream_PTL(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  int i;
+  // PTL
+  // Profile Tier
+  WRITE_U(stream, 0, 2, "general_profile_space");
+  WRITE_U(stream, 0, 1, "general_tier_flag");
+  // Main Profile == 1
+  WRITE_U(stream, 1, 5, "general_profile_idc");
+  /* Compatibility flags should be set at general_profile_idc
+   *  (so with general_profile_idc = 1, compatibility_flag[1] should be 1)
+   * According to specification, when compatibility_flag[1] is set,
+   *  compatibility_flag[2] should be set too.
+   */
+  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag[]");
+
+  WRITE_U(stream, 1, 1, "general_progressive_source_flag");
+  WRITE_U(stream, 0, 1, "general_interlaced_source_flag");
+  WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
+  WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
+
+  WRITE_U(stream, 0, 32, "XXX_reserved_zero_44bits[0..31]");
+  WRITE_U(stream, 0, 12, "XXX_reserved_zero_44bits[32..43]");
+
+  // end Profile Tier
+
+  // Level 6.2 (general_level_idc is 30 * 6.2)
+  WRITE_U(stream, 186, 8, "general_level_idc");
+
+  WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
+  WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
+
+  for (i = 1; i < 8; i++) {
+    WRITE_U(stream, 0, 2, "reserved_zero_2bits");
   }
 
-  encoder_state->global->poc = 0;
+  // end PTL
 }
 
+static void encoder_state_write_bitstream_vid_parameter_set(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  int i;
+#ifdef _DEBUG
+  printf("=========== Video Parameter Set ID: 0 ===========\n");
+#endif
+
+  WRITE_U(stream, 0, 4, "vps_video_parameter_set_id");
+  WRITE_U(stream, 3, 2, "vps_reserved_three_2bits" );
+  WRITE_U(stream, 0, 6, "vps_reserved_zero_6bits" );
+  WRITE_U(stream, 1, 3, "vps_max_sub_layers_minus1");
+  WRITE_U(stream, 0, 1, "vps_temporal_id_nesting_flag");
+  WRITE_U(stream, 0xffff, 16, "vps_reserved_ffff_16bits");
+
+  encoder_state_write_bitstream_PTL(encoder_state);
+
+  WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
+
+  //for each layer
+  for (i = 0; i < 1; i++) {
+  WRITE_UE(stream, 1, "vps_max_dec_pic_buffering");
+  WRITE_UE(stream, 0, "vps_num_reorder_pics");
+  WRITE_UE(stream, 0, "vps_max_latency_increase");
+  }
+
+  WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
+  WRITE_UE(stream, 0, "vps_max_op_sets_minus1");
+  WRITE_U(stream, 0, 1, "vps_timing_info_present_flag");
+
+  //IF timing info
+  //END IF
+
+  WRITE_U(stream, 0, 1, "vps_extension_flag");
+}
+
+static void encoder_state_write_bitstream_scaling_list(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+  uint32_t size_id;
+  for (size_id = 0; size_id < SCALING_LIST_SIZE_NUM; size_id++) {
+    int32_t list_id;
+    for (list_id = 0; list_id < g_scaling_list_num[size_id]; list_id++) {
+      uint8_t scaling_list_pred_mode_flag = 1;
+      int32_t pred_list_idx;
+      int32_t i;
+      uint32_t ref_matrix_id = UINT32_MAX;
+
+      for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) {
+        const int32_t * const pred_list  = (list_id == pred_list_idx) ?
+                                     scalinglist_get_default(size_id, pred_list_idx) :
+                                     encoder->scaling_list.scaling_list_coeff[size_id][pred_list_idx];
+
+        if (!memcmp(encoder->scaling_list.scaling_list_coeff[size_id][list_id], pred_list, sizeof(int32_t) * MIN(8, g_scaling_list_size[size_id])) &&
+            ((size_id < SCALING_LIST_16x16) ||
+             (encoder->scaling_list.scaling_list_dc[size_id][list_id] == encoder->scaling_list.scaling_list_dc[size_id][pred_list_idx]))) {
+          ref_matrix_id = pred_list_idx;
+          scaling_list_pred_mode_flag = 0;
+          break;
+        }
+      }
+      WRITE_U(stream, scaling_list_pred_mode_flag, 1, "scaling_list_pred_mode_flag" );
+
+      if (!scaling_list_pred_mode_flag) {
+        WRITE_UE(stream, list_id - ref_matrix_id, "scaling_list_pred_matrix_id_delta");
+      } else {
+        int32_t delta;
+        const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, g_scaling_list_size[size_id]);
+        const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32;
+        int32_t next_coef = 8;
+        const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeff[size_id][list_id];
+
+        if (size_id >= SCALING_LIST_16x16) {
+          WRITE_SE(stream, encoder->scaling_list.scaling_list_dc[size_id][list_id] - 8, "scaling_list_dc_coef_minus8");
+          next_coef = encoder->scaling_list.scaling_list_dc[size_id][list_id];
+        }
+
+        for (i = 0; i < coef_num; i++) {
+          delta     = coef_list[scan_cg[i]] - next_coef;
+          next_coef = coef_list[scan_cg[i]];
+          if (delta > 127)
+            delta -= 256;
+          if (delta < -128)
+            delta += 256;
+
+          WRITE_SE(stream, delta, "scaling_list_delta_coef");
+        }
+      }
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_seq_parameter_set(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  //FIXME: use encoder_control instead of cur_pic
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+
+#ifdef _DEBUG
+  printf("=========== Sequence Parameter Set ID: 0 ===========\n");
+#endif
+
+  // TODO: profile IDC and level IDC should be defined later on
+  WRITE_U(stream, 0, 4, "sps_video_parameter_set_id");
+  WRITE_U(stream, 1, 3, "sps_max_sub_layers_minus1");
+  WRITE_U(stream, 0, 1, "sps_temporal_id_nesting_flag");
+
+  encoder_state_write_bitstream_PTL(encoder_state);
+
+  WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
+  WRITE_UE(stream, encoder_state->encoder_control->in.video_format,
+           "chroma_format_idc");
+
+  if (encoder_state->encoder_control->in.video_format == 3) {
+    WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
+  }
+
+  WRITE_UE(stream, cur_pic->width, "pic_width_in_luma_samples");
+  WRITE_UE(stream, cur_pic->height, "pic_height_in_luma_samples");
+
+  if (cur_pic->width != encoder_state->encoder_control->in.real_width || cur_pic->height != encoder_state->encoder_control->in.real_height) {
+    // The standard does not seem to allow setting conf_win values such that
+    // the number of luma samples is not a multiple of 2. Options are to either
+    // hide one line or show an extra line of non-video. Neither seems like a
+    // very good option, so let's not even try.
+    assert(!(cur_pic->width % 2));
+    WRITE_U(stream, 1, 1, "conformance_window_flag");
+    WRITE_UE(stream, 0, "conf_win_left_offset");
+    WRITE_UE(stream, (cur_pic->width - encoder_state->encoder_control->in.real_width) >> 1,
+             "conf_win_right_offset");
+    WRITE_UE(stream, 0, "conf_win_top_offset");
+    WRITE_UE(stream, (cur_pic->height - encoder_state->encoder_control->in.real_height) >> 1,
+             "conf_win_bottom_offset");
+  } else {
+    WRITE_U(stream, 0, 1, "conformance_window_flag");
+  }
+
+  //IF window flag
+  //END IF
+
+  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_luma_minus8");
+  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_chroma_minus8");
+  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
+  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
+
+  //for each layer
+  WRITE_UE(stream, 0, "sps_max_dec_pic_buffering");
+  WRITE_UE(stream, 0, "sps_num_reorder_pics");
+  WRITE_UE(stream, 0, "sps_max_latency_increase");
+  //end for
+
+  WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3");
+  WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
+  WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
+  WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
+  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
+  WRITE_UE(stream, TR_DEPTH_INTRA, "max_transform_hierarchy_depth_intra");
+
+  // scaling list
+  WRITE_U(stream, encoder_state->encoder_control->scaling_list.enable, 1, "scaling_list_enable_flag");
+  if (encoder_state->encoder_control->scaling_list.enable) {
+    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
+    encoder_state_write_bitstream_scaling_list(encoder_state);
+  }
+
+  WRITE_U(stream, 0, 1, "amp_enabled_flag");
+  WRITE_U(stream, encoder_state->encoder_control->sao_enable ? 1 : 0, 1,
+          "sample_adaptive_offset_enabled_flag");
+  WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
+  #if ENABLE_PCM == 1
+    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_luma_minus1");
+    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_chroma_minus1");
+    WRITE_UE(stream, 0, "log2_min_pcm_coding_block_size_minus3");
+    WRITE_UE(stream, 2, "log2_diff_max_min_pcm_coding_block_size");
+    WRITE_U(stream, 1, 1, "pcm_loop_filter_disable_flag");
+  #endif
+
+  WRITE_UE(stream, 0, "num_short_term_ref_pic_sets");
+
+  //IF num short term ref pic sets
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "long_term_ref_pics_present_flag");
+
+  //IF long_term_ref_pics_present
+  //ENDIF
+
+  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
+          "sps_temporal_mvp_enable_flag");
+  WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
+  WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
+
+  encode_VUI(encoder_state);
+
+  WRITE_U(stream, 0, 1, "sps_extension_flag");
+}
+
+static void encoder_state_write_bitstream_pic_parameter_set(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+#ifdef _DEBUG
+  printf("=========== Picture Parameter Set ID: 0 ===========\n");
+#endif
+  WRITE_UE(stream, 0, "pic_parameter_set_id");
+  WRITE_UE(stream, 0, "seq_parameter_set_id");
+  WRITE_U(stream, 0, 1, "dependent_slice_segments_enabled_flag");
+  WRITE_U(stream, 0, 1, "output_flag_present_flag");
+  WRITE_U(stream, 0, 3, "num_extra_slice_header_bits");
+  WRITE_U(stream, ENABLE_SIGN_HIDING, 1, "sign_data_hiding_flag");
+  WRITE_U(stream, 0, 1, "cabac_init_present_flag");
+
+  WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
+  WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
+  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
+  WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
+  WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
+  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
+  //if cu_qp_delta_enabled_flag
+  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
+
+  //TODO: add QP offsets
+  WRITE_SE(stream, 0, "pps_cb_qp_offset");
+  WRITE_SE(stream, 0, "pps_cr_qp_offset");
+  WRITE_U(stream, 0, 1, "pps_slice_chroma_qp_offsets_present_flag");
+  WRITE_U(stream, 0, 1, "weighted_pred_flag");
+  WRITE_U(stream, 0, 1, "weighted_bipred_idc");
+
+  //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
+  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
+  WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
+  //wavefronts
+  WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
+
+  if (encoder->tiles_enable) {
+    WRITE_UE(stream, encoder->tiles_num_tile_columns - 1, "num_tile_columns_minus1");
+    WRITE_UE(stream, encoder->tiles_num_tile_rows - 1, "num_tile_rows_minus1");
+    
+    WRITE_U(stream, encoder->tiles_uniform_spacing_flag, 1, "uniform_spacing_flag");
+    
+    if (!encoder->tiles_uniform_spacing_flag) {
+      int i;
+      for (i = 0; i < encoder->tiles_num_tile_columns - 1; ++i) {
+        WRITE_UE(stream, encoder->tiles_col_width[i] - 1, "column_width_minus1[...]");
+      }
+      for (i = 0; i < encoder->tiles_num_tile_rows - 1; ++i) {
+        WRITE_UE(stream, encoder->tiles_row_height[i] - 1, "row_height_minus1[...]");
+      }
+    }
+    WRITE_U(stream, 0, 1, "loop_filter_across_tiles_enabled_flag");
+    
+  }
+  
+  WRITE_U(stream, 0, 1, "loop_filter_across_slice_flag");
+  WRITE_U(stream, 1, 1, "deblocking_filter_control_present_flag");
+
+  //IF deblocking_filter
+    WRITE_U(stream, 0, 1, "deblocking_filter_override_enabled_flag");
+  WRITE_U(stream, encoder_state->encoder_control->deblock_enable ? 0 : 1, 1,
+          "pps_disable_deblocking_filter_flag");
+
+    //IF !disabled
+  if (encoder_state->encoder_control->deblock_enable) {
+     WRITE_SE(stream, encoder_state->encoder_control->beta_offset_div2, "beta_offset_div2");
+     WRITE_SE(stream, encoder_state->encoder_control->tc_offset_div2, "tc_offset_div2");
+    }
+
+    //ENDIF
+  //ENDIF
+  WRITE_U(stream, 0, 1, "pps_scaling_list_data_present_flag");
+  //IF scaling_list
+  //ENDIF
+  WRITE_U(stream, 0, 1, "lists_modification_present_flag");
+  WRITE_UE(stream, 0, "log2_parallel_merge_level_minus2");
+  WRITE_U(stream, 0, 1, "slice_segment_header_extension_present_flag");
+  WRITE_U(stream, 0, 1, "pps_extension_flag");
+}
+
+void encoder_state_write_bitstream_prefix_sei_version(encoder_state * const encoder_state)
+{
+#define STR_BUF_LEN 1000
+  bitstream * const stream = &encoder_state->stream;
+  int i, length;
+  char buf[STR_BUF_LEN] = { 0 };
+  char *s = buf + 16;
+  const config * const cfg = encoder_state->encoder_control->cfg;
+
+  // random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
+  static const uint8_t uuid[16] = {
+    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
+    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1
+  };
+  memcpy(buf, uuid, 16);
+
+  // user_data_payload_byte
+  s += sprintf(s, "Kvazaar HEVC Encoder v. " VERSION_STRING " - "
+                  "Copyleft 2012-2014 - http://ultravideo.cs.tut.fi/ - options:");
+  s += sprintf(s, " %dx%d", cfg->width, cfg->height);
+  s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
+               cfg->deblock_beta, cfg->deblock_tc);
+  s += sprintf(s, " sao=%d", cfg->sao_enable);
+  s += sprintf(s, " intra_period=%d", cfg->intra_period);
+  s += sprintf(s, " qp=%d", cfg->qp);
+  s += sprintf(s, " ref=%d", cfg->ref_frames);
+
+  length = (int)(s - buf + 1);  // length, +1 for \0
+
+  // Assert this so that in the future if the message gets longer, we remember
+  // to increase the buf len. Divide by 2 for margin.
+  assert(length < STR_BUF_LEN / 2);
+
+  // payloadType = 5 -> user_data_unregistered
+  WRITE_U(stream, 5, 8, "last_payload_type_byte");
+
+  // payloadSize
+  for (i = 0; i <= length - 255; i += 255)
+    WRITE_U(stream, 255, 8, "ff_byte");
+  WRITE_U(stream, length - i, 8, "last_payload_size_byte");
+
+  for (i = 0; i < length; i++)
+    WRITE_U(stream, ((uint8_t *)buf)[i], 8, "sei_payload");
+
+#undef STR_BUF_LEN
+}
+
+static void encoder_state_entry_points_explore(const encoder_state * const encoder_state, int * const r_count, int * const r_max_length) {
+  int i;
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    if (encoder_state->children[i].is_leaf) {
+      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
+      ++(*r_count);
+      if (my_length > *r_max_length) {
+        *r_max_length = my_length;
+      }
+    } else {
+      encoder_state_entry_points_explore(&encoder_state->children[i], r_count, r_max_length);
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_entry_points_write(bitstream * const stream, const encoder_state * const encoder_state, const int num_entry_points, const int write_length, int * const r_count) {
+  int i;
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    if (encoder_state->children[i].is_leaf) {
+      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
+      ++(*r_count);
+      //Don't write the last one
+      if (*r_count < num_entry_points) {
+        WRITE_U(stream, my_length - 1, write_length, "entry_point_offset-minus1")
+      }
+    } else {
+      encoder_state_write_bitstream_entry_points_write(stream, &encoder_state->children[i], num_entry_points, write_length, r_count);
+    }
+  }
+}
+
+static int num_bitcount(unsigned int n) {
+  int pos = 0;
+  if (n >= 1<<16) { n >>= 16; pos += 16; }
+  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
+  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
+  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
+  if (n >= 1<< 1) {           pos +=  1; }
+  return ((n == 0) ? (-1) : pos);
+}
+
+void encoder_state_write_bitstream_slice_header(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+
+#ifdef _DEBUG
+  printf("=========== Slice ===========\n");
+#endif
+  WRITE_U(stream, (encoder_state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
+
+  if (encoder_state->global->pictype >= NAL_BLA_W_LP
+      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
+    WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
+  }
+
+  WRITE_UE(stream, 0, "slice_pic_parameter_set_id");
+  if (encoder_state->slice->start_in_rs > 0) {
+    //For now, we don't support dependent slice segments
+    //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
+    WRITE_UE(stream, encoder_state->slice->start_in_rs, "slice_segment_address");
+  }
+
+  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
+
+  // if !entropy_slice_flag
+
+    //if output_flag_present_flag
+      //WRITE_U(stream, 1, 1, "pic_output_flag");
+    //end if
+    //if( IdrPicFlag ) <- nal_unit_type == 5
+  if (encoder_state->global->pictype != NAL_IDR_W_RADL
+      && encoder_state->global->pictype != NAL_IDR_N_LP) {
+      int j;
+      int ref_negative = encoder_state->global->ref->used_size;
+      int ref_positive = 0;
+      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
+      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
+      WRITE_UE(stream, ref_negative, "num_negative_pics");
+      WRITE_UE(stream, ref_positive, "num_positive_pics");
+
+    for (j = 0; j < ref_negative; j++) {
+      int32_t delta_poc_minus1 = 0;
+      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
+      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
+    }
+
+    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
+  }
+
+    //end if
+  //end if
+  if (encoder->sao_enable) {
+    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
+    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
+  }
+
+  if (encoder_state->global->slicetype != SLICE_I) {
+      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
+        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
+      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
+  }
+
+  if (encoder_state->global->slicetype == SLICE_B) {
+      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
+  }
+
+  // Skip flags that are not present
+  // if !entropy_slice_flag
+    WRITE_SE(stream, 0, "slice_qp_delta");
+    //WRITE_U(stream, 1, 1, "alignment");
+   
+  if (encoder->tiles_enable || encoder->wpp) {
+    int num_entry_points = 0;
+    int max_length_seen = 0;
+    
+    encoder_state_entry_points_explore(encoder_state, &num_entry_points, &max_length_seen);
+    
+    WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets");
+    if (num_entry_points > 0) {
+      int entry_points_written = 0;
+      int offset_len = num_bitcount(max_length_seen) + 1;
+      WRITE_UE(stream, offset_len - 1, "offset_len_minus1");
+      encoder_state_write_bitstream_entry_points_write(stream, encoder_state, num_entry_points, offset_len, &entry_points_written); 
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
+  const encoder_control * const encoder = main_state->encoder_control;
+  bitstream * const stream = &main_state->stream;
+
+  int i;
+
+  if (main_state->global->is_radl_frame) {
+    // Access Unit Delimiter (AUD)
+    if (encoder->aud_enable)
+      encoder_state_write_bitstream_aud(main_state);
+
+    // Video Parameter Set (VPS)
+    nal_write(stream, NAL_VPS_NUT, 0, 1);
+    encoder_state_write_bitstream_vid_parameter_set(main_state);
+    bitstream_align(stream);
+
+    // Sequence Parameter Set (SPS)
+    nal_write(stream, NAL_SPS_NUT, 0, 1);
+    encoder_state_write_bitstream_seq_parameter_set(main_state);
+    bitstream_align(stream);
+
+    // Picture Parameter Set (PPS)
+    nal_write(stream, NAL_PPS_NUT, 0, 1);
+    encoder_state_write_bitstream_pic_parameter_set(main_state);
+    bitstream_align(stream);
+
+    if (main_state->global->frame == 0) {
+      // Prefix SEI
+      nal_write(stream, PREFIX_SEI_NUT, 0, 0);
+      encoder_state_write_bitstream_prefix_sei_version(main_state);
+      bitstream_align(stream);
+    }
+  } else {
+    // Access Unit Delimiter (AUD)
+    if (encoder->aud_enable)
+      encoder_state_write_bitstream_aud(main_state);
+  }
+
+  {
+    // Not quite sure if this is correct, but it seems to have worked so far
+    // so I tried to not change it's behavior.
+    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
+
+    nal_write(stream,
+              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
+  }
+  {
+    PERFORMANCE_MEASURE_START();
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+    //FIXME: Move this...
+    bitstream_clear(&main_state->children[i].stream);
+  }
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  {
+    PERFORMANCE_MEASURE_START();
+    // Calculate checksum
+    add_checksum(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  //FIXME: Why is this needed?
+  main_state->tile->cur_pic->poc = main_state->global->poc;
+}
+
+static void encoder_state_worker_write_bitstream_leaf(void * opaque) {
+  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
+}
+
+static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  //Write terminator of the leaf
+  assert(encoder_state->is_leaf);
+  
+  //Last LCU
+  {
+    const lcu_order_element * const lcu = &encoder_state->lcu_order[encoder_state->lcu_order_count - 1];
+    const int lcu_addr_in_ts = lcu->id + encoder_state->tile->lcu_offset_in_ts;
+    const int end_of_slice_segment_flag = lcu_at_slice_end(encoder, lcu_addr_in_ts);
+  
+    cabac_encode_bin_trm(&encoder_state->cabac, end_of_slice_segment_flag);  // end_of_slice_segment_flag
+  
+    if (!end_of_slice_segment_flag) {
+      assert(lcu_at_tile_end(encoder, lcu_addr_in_ts) || lcu->position.x == (encoder_state->tile->cur_pic->width_in_lcu - 1));
+      cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
+      cabac_flush(&encoder_state->cabac);
+    } else {
+      cabac_flush(&encoder_state->cabac);
+      bitstream_align(&encoder_state->stream);
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
+  //If it's not a leaf, a tile is "nothing". We only have to write sub elements
+  int i;
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+  }
+}
+
+static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
+  int i;
+  encoder_state_write_bitstream_slice_header(main_state);
+  bitstream_align(&main_state->stream); 
+  
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+  }
+}
+
+
+static void encoder_state_write_bitstream(encoder_state * const main_state) {
+  int i;
+  if (!main_state->is_leaf) {
+    for (i=0; main_state->children[i].encoder_control; ++i) {
+      encoder_state *sub_state = &(main_state->children[i]);
+      encoder_state_write_bitstream(sub_state);
+    }
+    
+    switch (main_state->type) {
+      case ENCODER_STATE_TYPE_MAIN:
+        encoder_state_write_bitstream_main(main_state);
+        break;
+      case ENCODER_STATE_TYPE_TILE:
+        encoder_state_write_bitstream_tile(main_state);
+        break;
+      case ENCODER_STATE_TYPE_SLICE:
+        encoder_state_write_bitstream_slice(main_state);
+        break;
+      default:
+        fprintf(stderr, "Unsupported node type %c!\n", main_state->type);
+        assert(0);
+    }
+  }
+}
+
+
+
+
+
+
+
+
+
+
+static void encode_VUI(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  const encoder_control * const encoder = encoder_state->encoder_control;
+#ifdef _DEBUG
+  printf("=========== VUI Set ID: 0 ===========\n");
+#endif
+  if (encoder->vui.sar_width > 0 && encoder->vui.sar_height > 0) {
+    int i;
+    static const struct
+    {
+      uint8_t width;
+      uint8_t height;
+      uint8_t idc;
+    } sar[] = {
+      // aspect_ratio_idc = 0 -> unspecified
+      {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+      { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+      { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+      {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
+      // aspect_ratio_idc = [17..254] -> reserved
+      { 0, 0, 255 }
+    };
+
+    for (i = 0; sar[i].idc != 255; i++)
+      if (sar[i].width  == encoder->vui.sar_width &&
+          sar[i].height == encoder->vui.sar_height)
+        break;
+
+    WRITE_U(stream, 1, 1, "aspect_ratio_info_present_flag");
+    WRITE_U(stream, sar[i].idc, 8, "aspect_ratio_idc");
+    if (sar[i].idc == 255) {
+      // EXTENDED_SAR
+      WRITE_U(stream, encoder->vui.sar_width, 16, "sar_width");
+      WRITE_U(stream, encoder->vui.sar_height, 16, "sar_height");
+    }
+  } else
+    WRITE_U(stream, 0, 1, "aspect_ratio_info_present_flag");
+
+  //IF aspect ratio info
+  //ENDIF
+
+  if (encoder->vui.overscan > 0) {
+    WRITE_U(stream, 1, 1, "overscan_info_present_flag");
+    WRITE_U(stream, encoder->vui.overscan - 1, 1, "overscan_appropriate_flag");
+  } else
+    WRITE_U(stream, 0, 1, "overscan_info_present_flag");
+
+  //IF overscan info
+  //ENDIF
+
+  if (encoder->vui.videoformat != 5 || encoder->vui.fullrange ||
+      encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
+      encoder->vui.colormatrix != 2) {
+    WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
+    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
+    WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
+
+    if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
+        encoder->vui.colormatrix != 2) {
+      WRITE_U(stream, 1, 1, "colour_description_present_flag");
+      WRITE_U(stream, encoder->vui.colorprim, 8, "colour_primaries");
+      WRITE_U(stream, encoder->vui.transfer, 8, "transfer_characteristics");
+      WRITE_U(stream, encoder->vui.colormatrix, 8, "matrix_coeffs");
+    } else
+      WRITE_U(stream, 0, 1, "colour_description_present_flag");
+  } else
+    WRITE_U(stream, 0, 1, "video_signal_type_present_flag");
+
+  //IF video type
+  //ENDIF
+
+  if (encoder->vui.chroma_loc > 0) {
+    WRITE_U(stream, 1, 1, "chroma_loc_info_present_flag");
+    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_top_field");
+    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_bottom_field");
+  } else
+    WRITE_U(stream, 0, 1, "chroma_loc_info_present_flag");
+
+  //IF chroma loc info
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "neutral_chroma_indication_flag");
+  WRITE_U(stream, 0, 1, "field_seq_flag");
+  WRITE_U(stream, 0, 1, "frame_field_info_present_flag");
+  WRITE_U(stream, 0, 1, "default_display_window_flag");
+
+  //IF default display window
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "vui_timing_info_present_flag");
+
+  //IF timing info
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "bitstream_restriction_flag");
+
+  //IF bitstream restriction
+  //ENDIF
+}
+
+/******  END Functions writing bitstream parts ******/
+
+
 static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
   const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
   const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
@@ -909,13 +1673,6 @@ static void encoder_state_blit_pixels(const encoder_state * const target_enc, pi
 
 
 
-static void encoder_state_write_aud(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  encode_access_unit_delimiter(encoder_state);
-  nal_write(stream, AUD_NUT, 0, 1);
-  bitstream_align(stream);
-}
 
 static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, const lcu_order_element * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) {
   picture* const cur_pic = encoder_state->tile->cur_pic;
@@ -958,7 +1715,7 @@ static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, c
 }
 
 
-static void worker_encoder_state_encode_lcu(void * opaque) {
+static void encoder_state_worker_encode_lcu(void * opaque) {
   const lcu_order_element * const lcu = opaque;
   encoder_state *encoder_state = lcu->encoder_state;
   const encoder_control * const encoder = encoder_state->encoder_control;
@@ -1003,7 +1760,7 @@ static void worker_encoder_state_encode_lcu(void * opaque) {
   
   //First LCU, and we are in a slice. We need a slice header
   if (encoder_state->type == ENCODER_STATE_TYPE_SLICE && lcu->index == 0) {
-    encode_slice_header(encoder_state);
+    encoder_state_write_bitstream_slice_header(encoder_state);
     bitstream_align(&encoder_state->stream); 
   }
   
@@ -1059,7 +1816,7 @@ static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
     for (i = 0; i < encoder_state->lcu_order_count; ++i) {
       PERFORMANCE_MEASURE_START();
 
-      worker_encoder_state_encode_lcu(&encoder_state->lcu_order[i]);
+      encoder_state_worker_encode_lcu(&encoder_state->lcu_order[i]);
 
 #ifdef _DEBUG
       {
@@ -1083,7 +1840,7 @@ static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
 #else
       char* job_description = NULL;
 #endif
-      encoder_state->tile->wf_jobs[lcu->id] = threadqueue_submit(encoder_state->encoder_control->threadqueue, worker_encoder_state_encode_lcu, (void*)lcu, 1, job_description);
+      encoder_state->tile->wf_jobs[lcu->id] = threadqueue_submit(encoder_state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description);
       if (encoder_state->tile->wf_jobs[lcu->id]) {
         if (lcu->position.x > 0) {
           // Wait for the LCU on the left.
@@ -1106,7 +1863,7 @@ static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
 
 static void encoder_state_encode(encoder_state * const main_state);
 
-static void worker_encoder_state_encode_children(void * opaque) {
+static void encoder_state_worker_encode_children(void * opaque) {
   encoder_state *sub_state = opaque;
   encoder_state_encode(sub_state);
   if (sub_state->is_leaf) {
@@ -1122,7 +1879,7 @@ static void worker_encoder_state_encode_children(void * opaque) {
 #else
       char* job_description = NULL;
 #endif
-      job = threadqueue_submit(sub_state->encoder_control->threadqueue, worker_encoder_state_write_bitstream_leaf, sub_state, 1, job_description);
+      job = threadqueue_submit(sub_state->encoder_control->threadqueue, encoder_state_worker_write_bitstream_leaf, sub_state, 1, job_description);
       threadqueue_job_dep_add(job, sub_state->tile->wf_jobs[sub_state->wfrow->lcu_offset_y * sub_state->tile->cur_pic->width_in_lcu + sub_state->lcu_order_count - 1]);
       threadqueue_job_unwait_job(sub_state->encoder_control->threadqueue, job);
       return;
@@ -1135,7 +1892,7 @@ typedef struct {
   const encoder_state * encoder_state;
 } worker_sao_reconstruct_lcu_data;
 
-static void worker_sao_reconstruct_lcu(void *opaque) {
+static void encoder_state_worker_sao_reconstruct_lcu(void *opaque) {
   worker_sao_reconstruct_lcu_data *data = opaque;
   picture * const cur_pic = data->encoder_state->tile->cur_pic;
   unsigned stride = cur_pic->width_in_lcu;
@@ -1228,11 +1985,11 @@ static void encoder_state_encode(encoder_state * const main_state) {
 #else
           char* job_description = NULL;
 #endif
-          threadqueue_submit(main_state->encoder_control->threadqueue, worker_encoder_state_encode_children, &(main_state->children[i]), 0, job_description);
+          threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->children[i]), 0, job_description);
         } else {
           //Wavefront rows have parallelism at LCU level, so we should not launch multiple threads here!
           //FIXME: add an assert: we can only have wavefront children
-          worker_encoder_state_encode_children(&(main_state->children[i]));
+          encoder_state_worker_encode_children(&(main_state->children[i]));
         }
       }
       
@@ -1254,7 +2011,7 @@ static void encoder_state_encode(encoder_state * const main_state) {
           data->y = y;
           data->encoder_state = main_state;
           
-          job = threadqueue_submit(main_state->encoder_control->threadqueue, worker_sao_reconstruct_lcu, data, 1, job_description);
+          job = threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_sao_reconstruct_lcu, data, 1, job_description);
           
           if (previous_job) {
             threadqueue_job_dep_add(job, previous_job);
@@ -1275,7 +2032,7 @@ static void encoder_state_encode(encoder_state * const main_state) {
       threadqueue_flush(main_state->encoder_control->threadqueue);
     } else {
       for (i=0; main_state->children[i].encoder_control; ++i) {
-        worker_encoder_state_encode_children(&(main_state->children[i]));
+        encoder_state_worker_encode_children(&(main_state->children[i]));
       }
     }
     
@@ -1301,6 +2058,15 @@ static void encoder_state_encode(encoder_state * const main_state) {
   }
 }
 
+static void encoder_state_clear_refs(encoder_state *encoder_state) {
+  //FIXME: Do we need to handle children? At present they all share the same global
+  while (encoder_state->global->ref->used_size) {
+    picture_list_rem(encoder_state->global->ref, encoder_state->global->ref->used_size - 1);
+  }
+
+  encoder_state->global->poc = 0;
+}
+
 static void encoder_state_new_frame(encoder_state * const main_state) {
   int i;
   //FIXME Move this somewhere else!
@@ -1342,147 +2108,7 @@ static void encoder_state_new_frame(encoder_state * const main_state) {
 
 }
 
-static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
-  const encoder_control * const encoder = main_state->encoder_control;
-  bitstream * const stream = &main_state->stream;
 
-  int i;
-
-  if (main_state->global->is_radl_frame) {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      encoder_state_write_aud(main_state);
-
-    // Video Parameter Set (VPS)
-    nal_write(stream, NAL_VPS_NUT, 0, 1);
-    encode_vid_parameter_set(main_state);
-    bitstream_align(stream);
-
-    // Sequence Parameter Set (SPS)
-    nal_write(stream, NAL_SPS_NUT, 0, 1);
-    encode_seq_parameter_set(main_state);
-    bitstream_align(stream);
-
-    // Picture Parameter Set (PPS)
-    nal_write(stream, NAL_PPS_NUT, 0, 1);
-    encode_pic_parameter_set(main_state);
-    bitstream_align(stream);
-
-    if (main_state->global->frame == 0) {
-      // Prefix SEI
-      nal_write(stream, PREFIX_SEI_NUT, 0, 0);
-      encode_prefix_sei_version(main_state);
-      bitstream_align(stream);
-    }
-  } else {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      encoder_state_write_aud(main_state);
-  }
-
-  {
-    // Not quite sure if this is correct, but it seems to have worked so far
-    // so I tried to not change it's behavior.
-    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
-
-    nal_write(stream,
-              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
-  }
-  {
-    PERFORMANCE_MEASURE_START();
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-    //FIXME: Move this...
-    bitstream_clear(&main_state->children[i].stream);
-  }
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
-  }
-  
-  {
-    PERFORMANCE_MEASURE_START();
-    // Calculate checksum
-    add_checksum(main_state);
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
-  }
-  
-  //FIXME: Why is this needed?
-  main_state->tile->cur_pic->poc = main_state->global->poc;
-}
-
-static void worker_encoder_state_write_bitstream_leaf(void * opaque) {
-  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
-}
-
-static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  //Write terminator of the leaf
-  assert(encoder_state->is_leaf);
-  
-  //Last LCU
-  {
-    const lcu_order_element * const lcu = &encoder_state->lcu_order[encoder_state->lcu_order_count - 1];
-    const int lcu_addr_in_ts = lcu->id + encoder_state->tile->lcu_offset_in_ts;
-    const int end_of_slice_segment_flag = lcu_at_slice_end(encoder, lcu_addr_in_ts);
-  
-    cabac_encode_bin_trm(&encoder_state->cabac, end_of_slice_segment_flag);  // end_of_slice_segment_flag
-  
-    if (!end_of_slice_segment_flag) {
-      assert(lcu_at_tile_end(encoder, lcu_addr_in_ts) || lcu->position.x == (encoder_state->tile->cur_pic->width_in_lcu - 1));
-      cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
-      cabac_flush(&encoder_state->cabac);
-    } else {
-      cabac_flush(&encoder_state->cabac);
-      bitstream_align(&encoder_state->stream);
-    }
-  }
-}
-
-static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
-  //If it's not a leaf, a tile is "nothing". We only have to write sub elements
-  int i;
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-  }
-}
-
-static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
-  int i;
-  encode_slice_header(main_state);
-  bitstream_align(&main_state->stream); 
-  
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-  }
-}
-
-
-static void encoder_state_write_bitstream(encoder_state * const main_state) {
-  int i;
-  if (!main_state->is_leaf) {
-    for (i=0; main_state->children[i].encoder_control; ++i) {
-      encoder_state *sub_state = &(main_state->children[i]);
-      encoder_state_write_bitstream(sub_state);
-    }
-    
-    switch (main_state->type) {
-      case ENCODER_STATE_TYPE_MAIN:
-        encoder_state_write_bitstream_main(main_state);
-        break;
-      case ENCODER_STATE_TYPE_TILE:
-        encoder_state_write_bitstream_tile(main_state);
-        break;
-      case ENCODER_STATE_TYPE_SLICE:
-        encoder_state_write_bitstream_slice(main_state);
-        break;
-      default:
-        fprintf(stderr, "Unsupported node type %c!\n", main_state->type);
-        assert(0);
-    }
-  }
-}
 
 void encode_one_frame(encoder_state * const main_state)
 {
@@ -1612,481 +2238,7 @@ static void add_checksum(encoder_state * const encoder_state)
   bitstream_align(stream);
 }
 
-void encode_access_unit_delimiter(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
-                   : encoder_state->global->slicetype == SLICE_P ? 1
-                   :                                             2;
-  WRITE_U(stream, pic_type, 3, "pic_type");
-}
 
-void encode_prefix_sei_version(encoder_state * const encoder_state)
-{
-#define STR_BUF_LEN 1000
-  bitstream * const stream = &encoder_state->stream;
-  int i, length;
-  char buf[STR_BUF_LEN] = { 0 };
-  char *s = buf + 16;
-  const config * const cfg = encoder_state->encoder_control->cfg;
-
-  // random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
-  static const uint8_t uuid[16] = {
-    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
-    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1
-  };
-  memcpy(buf, uuid, 16);
-
-  // user_data_payload_byte
-  s += sprintf(s, "Kvazaar HEVC Encoder v. " VERSION_STRING " - "
-                  "Copyleft 2012-2014 - http://ultravideo.cs.tut.fi/ - options:");
-  s += sprintf(s, " %dx%d", cfg->width, cfg->height);
-  s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
-               cfg->deblock_beta, cfg->deblock_tc);
-  s += sprintf(s, " sao=%d", cfg->sao_enable);
-  s += sprintf(s, " intra_period=%d", cfg->intra_period);
-  s += sprintf(s, " qp=%d", cfg->qp);
-  s += sprintf(s, " ref=%d", cfg->ref_frames);
-
-  length = (int)(s - buf + 1);  // length, +1 for \0
-
-  // Assert this so that in the future if the message gets longer, we remember
-  // to increase the buf len. Divide by 2 for margin.
-  assert(length < STR_BUF_LEN / 2);
-
-  // payloadType = 5 -> user_data_unregistered
-  WRITE_U(stream, 5, 8, "last_payload_type_byte");
-
-  // payloadSize
-  for (i = 0; i <= length - 255; i += 255)
-    WRITE_U(stream, 255, 8, "ff_byte");
-  WRITE_U(stream, length - i, 8, "last_payload_size_byte");
-
-  for (i = 0; i < length; i++)
-    WRITE_U(stream, ((uint8_t *)buf)[i], 8, "sei_payload");
-
-#undef STR_BUF_LEN
-}
-
-void encode_pic_parameter_set(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-#ifdef _DEBUG
-  printf("=========== Picture Parameter Set ID: 0 ===========\n");
-#endif
-  WRITE_UE(stream, 0, "pic_parameter_set_id");
-  WRITE_UE(stream, 0, "seq_parameter_set_id");
-  WRITE_U(stream, 0, 1, "dependent_slice_segments_enabled_flag");
-  WRITE_U(stream, 0, 1, "output_flag_present_flag");
-  WRITE_U(stream, 0, 3, "num_extra_slice_header_bits");
-  WRITE_U(stream, ENABLE_SIGN_HIDING, 1, "sign_data_hiding_flag");
-  WRITE_U(stream, 0, 1, "cabac_init_present_flag");
-
-  WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
-  WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
-  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
-  WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
-  WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
-  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
-  //if cu_qp_delta_enabled_flag
-  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
-
-  //TODO: add QP offsets
-  WRITE_SE(stream, 0, "pps_cb_qp_offset");
-  WRITE_SE(stream, 0, "pps_cr_qp_offset");
-  WRITE_U(stream, 0, 1, "pps_slice_chroma_qp_offsets_present_flag");
-  WRITE_U(stream, 0, 1, "weighted_pred_flag");
-  WRITE_U(stream, 0, 1, "weighted_bipred_idc");
-
-  //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
-  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
-  WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
-  //wavefronts
-  WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
-
-  if (encoder->tiles_enable) {
-    WRITE_UE(stream, encoder->tiles_num_tile_columns - 1, "num_tile_columns_minus1");
-    WRITE_UE(stream, encoder->tiles_num_tile_rows - 1, "num_tile_rows_minus1");
-    
-    WRITE_U(stream, encoder->tiles_uniform_spacing_flag, 1, "uniform_spacing_flag");
-    
-    if (!encoder->tiles_uniform_spacing_flag) {
-      int i;
-      for (i = 0; i < encoder->tiles_num_tile_columns - 1; ++i) {
-        WRITE_UE(stream, encoder->tiles_col_width[i] - 1, "column_width_minus1[...]");
-      }
-      for (i = 0; i < encoder->tiles_num_tile_rows - 1; ++i) {
-        WRITE_UE(stream, encoder->tiles_row_height[i] - 1, "row_height_minus1[...]");
-      }
-    }
-    WRITE_U(stream, 0, 1, "loop_filter_across_tiles_enabled_flag");
-    
-  }
-  
-  WRITE_U(stream, 0, 1, "loop_filter_across_slice_flag");
-  WRITE_U(stream, 1, 1, "deblocking_filter_control_present_flag");
-
-  //IF deblocking_filter
-    WRITE_U(stream, 0, 1, "deblocking_filter_override_enabled_flag");
-  WRITE_U(stream, encoder_state->encoder_control->deblock_enable ? 0 : 1, 1,
-          "pps_disable_deblocking_filter_flag");
-
-    //IF !disabled
-  if (encoder_state->encoder_control->deblock_enable) {
-     WRITE_SE(stream, encoder_state->encoder_control->beta_offset_div2, "beta_offset_div2");
-     WRITE_SE(stream, encoder_state->encoder_control->tc_offset_div2, "tc_offset_div2");
-    }
-
-    //ENDIF
-  //ENDIF
-  WRITE_U(stream, 0, 1, "pps_scaling_list_data_present_flag");
-  //IF scaling_list
-  //ENDIF
-  WRITE_U(stream, 0, 1, "lists_modification_present_flag");
-  WRITE_UE(stream, 0, "log2_parallel_merge_level_minus2");
-  WRITE_U(stream, 0, 1, "slice_segment_header_extension_present_flag");
-  WRITE_U(stream, 0, 1, "pps_extension_flag");
-}
-
-static void encode_PTL(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  int i;
-  // PTL
-  // Profile Tier
-  WRITE_U(stream, 0, 2, "general_profile_space");
-  WRITE_U(stream, 0, 1, "general_tier_flag");
-  // Main Profile == 1
-  WRITE_U(stream, 1, 5, "general_profile_idc");
-  /* Compatibility flags should be set at general_profile_idc
-   *  (so with general_profile_idc = 1, compatibility_flag[1] should be 1)
-   * According to specification, when compatibility_flag[1] is set,
-   *  compatibility_flag[2] should be set too.
-   */
-  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag[]");
-
-  WRITE_U(stream, 1, 1, "general_progressive_source_flag");
-  WRITE_U(stream, 0, 1, "general_interlaced_source_flag");
-  WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
-  WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
-
-  WRITE_U(stream, 0, 32, "XXX_reserved_zero_44bits[0..31]");
-  WRITE_U(stream, 0, 12, "XXX_reserved_zero_44bits[32..43]");
-
-  // end Profile Tier
-
-  // Level 6.2 (general_level_idc is 30 * 6.2)
-  WRITE_U(stream, 186, 8, "general_level_idc");
-
-  WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
-  WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
-
-  for (i = 1; i < 8; i++) {
-    WRITE_U(stream, 0, 2, "reserved_zero_2bits");
-  }
-
-  // end PTL
-}
-
-static void encode_scaling_list(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-  uint32_t size_id;
-  for (size_id = 0; size_id < SCALING_LIST_SIZE_NUM; size_id++) {
-    int32_t list_id;
-    for (list_id = 0; list_id < g_scaling_list_num[size_id]; list_id++) {
-      uint8_t scaling_list_pred_mode_flag = 1;
-      int32_t pred_list_idx;
-      int32_t i;
-      uint32_t ref_matrix_id = UINT32_MAX;
-
-      for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) {
-        const int32_t * const pred_list  = (list_id == pred_list_idx) ?
-                                     scalinglist_get_default(size_id, pred_list_idx) :
-                                     encoder->scaling_list.scaling_list_coeff[size_id][pred_list_idx];
-
-        if (!memcmp(encoder->scaling_list.scaling_list_coeff[size_id][list_id], pred_list, sizeof(int32_t) * MIN(8, g_scaling_list_size[size_id])) &&
-            ((size_id < SCALING_LIST_16x16) ||
-             (encoder->scaling_list.scaling_list_dc[size_id][list_id] == encoder->scaling_list.scaling_list_dc[size_id][pred_list_idx]))) {
-          ref_matrix_id = pred_list_idx;
-          scaling_list_pred_mode_flag = 0;
-          break;
-        }
-      }
-      WRITE_U(stream, scaling_list_pred_mode_flag, 1, "scaling_list_pred_mode_flag" );
-
-      if (!scaling_list_pred_mode_flag) {
-        WRITE_UE(stream, list_id - ref_matrix_id, "scaling_list_pred_matrix_id_delta");
-      } else {
-        int32_t delta;
-        const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, g_scaling_list_size[size_id]);
-        const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32;
-        int32_t next_coef = 8;
-        const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeff[size_id][list_id];
-
-        if (size_id >= SCALING_LIST_16x16) {
-          WRITE_SE(stream, encoder->scaling_list.scaling_list_dc[size_id][list_id] - 8, "scaling_list_dc_coef_minus8");
-          next_coef = encoder->scaling_list.scaling_list_dc[size_id][list_id];
-        }
-
-        for (i = 0; i < coef_num; i++) {
-          delta     = coef_list[scan_cg[i]] - next_coef;
-          next_coef = coef_list[scan_cg[i]];
-          if (delta > 127)
-            delta -= 256;
-          if (delta < -128)
-            delta += 256;
-
-          WRITE_SE(stream, delta, "scaling_list_delta_coef");
-        }
-      }
-    }
-  }
-}
-
-void encode_seq_parameter_set(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  //FIXME: use encoder_control instead of cur_pic
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-
-#ifdef _DEBUG
-  printf("=========== Sequence Parameter Set ID: 0 ===========\n");
-#endif
-
-  // TODO: profile IDC and level IDC should be defined later on
-  WRITE_U(stream, 0, 4, "sps_video_parameter_set_id");
-  WRITE_U(stream, 1, 3, "sps_max_sub_layers_minus1");
-  WRITE_U(stream, 0, 1, "sps_temporal_id_nesting_flag");
-
-  encode_PTL(encoder_state);
-
-  WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
-  WRITE_UE(stream, encoder_state->encoder_control->in.video_format,
-           "chroma_format_idc");
-
-  if (encoder_state->encoder_control->in.video_format == 3) {
-    WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
-  }
-
-  WRITE_UE(stream, cur_pic->width, "pic_width_in_luma_samples");
-  WRITE_UE(stream, cur_pic->height, "pic_height_in_luma_samples");
-
-  if (cur_pic->width != encoder_state->encoder_control->in.real_width || cur_pic->height != encoder_state->encoder_control->in.real_height) {
-    // The standard does not seem to allow setting conf_win values such that
-    // the number of luma samples is not a multiple of 2. Options are to either
-    // hide one line or show an extra line of non-video. Neither seems like a
-    // very good option, so let's not even try.
-    assert(!(cur_pic->width % 2));
-    WRITE_U(stream, 1, 1, "conformance_window_flag");
-    WRITE_UE(stream, 0, "conf_win_left_offset");
-    WRITE_UE(stream, (cur_pic->width - encoder_state->encoder_control->in.real_width) >> 1,
-             "conf_win_right_offset");
-    WRITE_UE(stream, 0, "conf_win_top_offset");
-    WRITE_UE(stream, (cur_pic->height - encoder_state->encoder_control->in.real_height) >> 1,
-             "conf_win_bottom_offset");
-  } else {
-    WRITE_U(stream, 0, 1, "conformance_window_flag");
-  }
-
-  //IF window flag
-  //END IF
-
-  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_luma_minus8");
-  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_chroma_minus8");
-  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
-  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
-
-  //for each layer
-  WRITE_UE(stream, 0, "sps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "sps_num_reorder_pics");
-  WRITE_UE(stream, 0, "sps_max_latency_increase");
-  //end for
-
-  WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3");
-  WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
-  WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
-  WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
-  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
-  WRITE_UE(stream, TR_DEPTH_INTRA, "max_transform_hierarchy_depth_intra");
-
-  // scaling list
-  WRITE_U(stream, encoder_state->encoder_control->scaling_list.enable, 1, "scaling_list_enable_flag");
-  if (encoder_state->encoder_control->scaling_list.enable) {
-    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
-    encode_scaling_list(encoder_state);
-  }
-
-  WRITE_U(stream, 0, 1, "amp_enabled_flag");
-  WRITE_U(stream, encoder_state->encoder_control->sao_enable ? 1 : 0, 1,
-          "sample_adaptive_offset_enabled_flag");
-  WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
-  #if ENABLE_PCM == 1
-    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_luma_minus1");
-    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_chroma_minus1");
-    WRITE_UE(stream, 0, "log2_min_pcm_coding_block_size_minus3");
-    WRITE_UE(stream, 2, "log2_diff_max_min_pcm_coding_block_size");
-    WRITE_U(stream, 1, 1, "pcm_loop_filter_disable_flag");
-  #endif
-
-  WRITE_UE(stream, 0, "num_short_term_ref_pic_sets");
-
-  //IF num short term ref pic sets
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "long_term_ref_pics_present_flag");
-
-  //IF long_term_ref_pics_present
-  //ENDIF
-
-  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
-          "sps_temporal_mvp_enable_flag");
-  WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
-  WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
-
-  encode_VUI(encoder_state);
-
-  WRITE_U(stream, 0, 1, "sps_extension_flag");
-}
-
-void encode_vid_parameter_set(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  int i;
-#ifdef _DEBUG
-  printf("=========== Video Parameter Set ID: 0 ===========\n");
-#endif
-
-  WRITE_U(stream, 0, 4, "vps_video_parameter_set_id");
-  WRITE_U(stream, 3, 2, "vps_reserved_three_2bits" );
-  WRITE_U(stream, 0, 6, "vps_reserved_zero_6bits" );
-  WRITE_U(stream, 1, 3, "vps_max_sub_layers_minus1");
-  WRITE_U(stream, 0, 1, "vps_temporal_id_nesting_flag");
-  WRITE_U(stream, 0xffff, 16, "vps_reserved_ffff_16bits");
-
-  encode_PTL(encoder_state);
-
-  WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
-
-  //for each layer
-  for (i = 0; i < 1; i++) {
-  WRITE_UE(stream, 1, "vps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "vps_num_reorder_pics");
-  WRITE_UE(stream, 0, "vps_max_latency_increase");
-  }
-
-  WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
-  WRITE_UE(stream, 0, "vps_max_op_sets_minus1");
-  WRITE_U(stream, 0, 1, "vps_timing_info_present_flag");
-
-  //IF timing info
-  //END IF
-
-  WRITE_U(stream, 0, 1, "vps_extension_flag");
-}
-
-static void encode_VUI(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  const encoder_control * const encoder = encoder_state->encoder_control;
-#ifdef _DEBUG
-  printf("=========== VUI Set ID: 0 ===========\n");
-#endif
-  if (encoder->vui.sar_width > 0 && encoder->vui.sar_height > 0) {
-    int i;
-    static const struct
-    {
-      uint8_t width;
-      uint8_t height;
-      uint8_t idc;
-    } sar[] = {
-      // aspect_ratio_idc = 0 -> unspecified
-      {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
-      { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
-      { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
-      {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
-      // aspect_ratio_idc = [17..254] -> reserved
-      { 0, 0, 255 }
-    };
-
-    for (i = 0; sar[i].idc != 255; i++)
-      if (sar[i].width  == encoder->vui.sar_width &&
-          sar[i].height == encoder->vui.sar_height)
-        break;
-
-    WRITE_U(stream, 1, 1, "aspect_ratio_info_present_flag");
-    WRITE_U(stream, sar[i].idc, 8, "aspect_ratio_idc");
-    if (sar[i].idc == 255) {
-      // EXTENDED_SAR
-      WRITE_U(stream, encoder->vui.sar_width, 16, "sar_width");
-      WRITE_U(stream, encoder->vui.sar_height, 16, "sar_height");
-    }
-  } else
-    WRITE_U(stream, 0, 1, "aspect_ratio_info_present_flag");
-
-  //IF aspect ratio info
-  //ENDIF
-
-  if (encoder->vui.overscan > 0) {
-    WRITE_U(stream, 1, 1, "overscan_info_present_flag");
-    WRITE_U(stream, encoder->vui.overscan - 1, 1, "overscan_appropriate_flag");
-  } else
-    WRITE_U(stream, 0, 1, "overscan_info_present_flag");
-
-  //IF overscan info
-  //ENDIF
-
-  if (encoder->vui.videoformat != 5 || encoder->vui.fullrange ||
-      encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
-      encoder->vui.colormatrix != 2) {
-    WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
-    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
-    WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
-
-    if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
-        encoder->vui.colormatrix != 2) {
-      WRITE_U(stream, 1, 1, "colour_description_present_flag");
-      WRITE_U(stream, encoder->vui.colorprim, 8, "colour_primaries");
-      WRITE_U(stream, encoder->vui.transfer, 8, "transfer_characteristics");
-      WRITE_U(stream, encoder->vui.colormatrix, 8, "matrix_coeffs");
-    } else
-      WRITE_U(stream, 0, 1, "colour_description_present_flag");
-  } else
-    WRITE_U(stream, 0, 1, "video_signal_type_present_flag");
-
-  //IF video type
-  //ENDIF
-
-  if (encoder->vui.chroma_loc > 0) {
-    WRITE_U(stream, 1, 1, "chroma_loc_info_present_flag");
-    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_top_field");
-    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_bottom_field");
-  } else
-    WRITE_U(stream, 0, 1, "chroma_loc_info_present_flag");
-
-  //IF chroma loc info
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "neutral_chroma_indication_flag");
-  WRITE_U(stream, 0, 1, "field_seq_flag");
-  WRITE_U(stream, 0, 1, "frame_field_info_present_flag");
-  WRITE_U(stream, 0, 1, "default_display_window_flag");
-
-  //IF default display window
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "vui_timing_info_present_flag");
-
-  //IF timing info
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "bitstream_restriction_flag");
-
-  //IF bitstream restriction
-  //ENDIF
-}
 
 void encoder_next_frame(encoder_state *encoder_state) {
   const encoder_control * const encoder = encoder_state->encoder_control;
@@ -2115,133 +2267,9 @@ void encoder_next_frame(encoder_state *encoder_state) {
   encoder_state->global->poc++;
 }
 
-static void encoder_state_entry_points_explore(const encoder_state * const encoder_state, int * const r_count, int * const r_max_length) {
-  int i;
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    if (encoder_state->children[i].is_leaf) {
-      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
-      ++(*r_count);
-      if (my_length > *r_max_length) {
-        *r_max_length = my_length;
-      }
-    } else {
-      encoder_state_entry_points_explore(&encoder_state->children[i], r_count, r_max_length);
-    }
-  }
-}
 
-static void encoder_state_entry_points_write(bitstream * const stream, const encoder_state * const encoder_state, const int num_entry_points, const int write_length, int * const r_count) {
-  int i;
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    if (encoder_state->children[i].is_leaf) {
-      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
-      ++(*r_count);
-      //Don't write the last one
-      if (*r_count < num_entry_points) {
-        WRITE_U(stream, my_length - 1, write_length, "entry_point_offset-minus1")
-      }
-    } else {
-      encoder_state_entry_points_write(stream, &encoder_state->children[i], num_entry_points, write_length, r_count);
-    }
-  }
-}
 
-static int num_bitcount(unsigned int n) {
-  int pos = 0;
-  if (n >= 1<<16) { n >>= 16; pos += 16; }
-  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
-  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
-  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
-  if (n >= 1<< 1) {           pos +=  1; }
-  return ((n == 0) ? (-1) : pos);
-}
 
-void encode_slice_header(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-
-#ifdef _DEBUG
-  printf("=========== Slice ===========\n");
-#endif
-  WRITE_U(stream, (encoder_state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
-
-  if (encoder_state->global->pictype >= NAL_BLA_W_LP
-      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
-    WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
-  }
-
-  WRITE_UE(stream, 0, "slice_pic_parameter_set_id");
-  if (encoder_state->slice->start_in_rs > 0) {
-    //For now, we don't support dependent slice segments
-    //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
-    WRITE_UE(stream, encoder_state->slice->start_in_rs, "slice_segment_address");
-  }
-
-  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
-
-  // if !entropy_slice_flag
-
-    //if output_flag_present_flag
-      //WRITE_U(stream, 1, 1, "pic_output_flag");
-    //end if
-    //if( IdrPicFlag ) <- nal_unit_type == 5
-  if (encoder_state->global->pictype != NAL_IDR_W_RADL
-      && encoder_state->global->pictype != NAL_IDR_N_LP) {
-      int j;
-      int ref_negative = encoder_state->global->ref->used_size;
-      int ref_positive = 0;
-      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
-      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
-      WRITE_UE(stream, ref_negative, "num_negative_pics");
-      WRITE_UE(stream, ref_positive, "num_positive_pics");
-
-    for (j = 0; j < ref_negative; j++) {
-      int32_t delta_poc_minus1 = 0;
-      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
-      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
-    }
-
-    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
-  }
-
-    //end if
-  //end if
-  if (encoder->sao_enable) {
-    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
-    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
-  }
-
-  if (encoder_state->global->slicetype != SLICE_I) {
-      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
-        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
-      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
-  }
-
-  if (encoder_state->global->slicetype == SLICE_B) {
-      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
-  }
-
-  // Skip flags that are not present
-  // if !entropy_slice_flag
-    WRITE_SE(stream, 0, "slice_qp_delta");
-    //WRITE_U(stream, 1, 1, "alignment");
-   
-  if (encoder->tiles_enable || encoder->wpp) {
-    int num_entry_points = 0;
-    int max_length_seen = 0;
-    
-    encoder_state_entry_points_explore(encoder_state, &num_entry_points, &max_length_seen);
-    
-    WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets");
-    if (num_entry_points > 0) {
-      int entry_points_written = 0;
-      int offset_len = num_bitcount(max_length_seen) + 1;
-      WRITE_UE(stream, offset_len - 1, "offset_len_minus1");
-      encoder_state_entry_points_write(stream, encoder_state, num_entry_points, offset_len, &entry_points_written); 
-    }
-  }
-}
 
 
 static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
diff --git a/src/encoderstate.h b/src/encoderstate.h
index 515d23d2..da04ad67 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -157,12 +157,7 @@ int read_one_frame(FILE* file, const encoder_state *encoder);
 
 void encoder_next_frame(encoder_state *encoder_state);
 
-void encode_seq_parameter_set(encoder_state *encoder);
-void encode_pic_parameter_set(encoder_state *encoder);
-void encode_vid_parameter_set(encoder_state *encoder);
-void encode_slice_header(encoder_state * encoder);
-void encode_access_unit_delimiter(encoder_state *encoder);
-void encode_prefix_sei_version(encoder_state *encoder);
+
 void encode_coding_tree(encoder_state *encoder, uint16_t x_ctb,
                         uint16_t y_ctb, uint8_t depth);
 

From 961da0523568f123f62492519890a371197f9a3c Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 14:47:49 +0200
Subject: [PATCH 09/15] Split encoderstate.c in multiple files

---
 src/_encoderstatebitstream.c  |  776 +++++++++++++++++
 src/_encoderstatectorsdtors.c |  672 ++++++++++++++
 src/_encoderstategeometry.c   |  133 +++
 src/encoderstate.c            | 1544 +--------------------------------
 4 files changed, 1587 insertions(+), 1538 deletions(-)
 create mode 100644 src/_encoderstatebitstream.c
 create mode 100644 src/_encoderstatectorsdtors.c
 create mode 100644 src/_encoderstategeometry.c

diff --git a/src/_encoderstatebitstream.c b/src/_encoderstatebitstream.c
new file mode 100644
index 00000000..906afc0f
--- /dev/null
+++ b/src/_encoderstatebitstream.c
@@ -0,0 +1,776 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+//This file MUST NOT BE COMPILED directly. It's included in encoderstate.c
+
+static void encoder_state_write_bitstream_access_unit_delimiter(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
+                   : encoder_state->global->slicetype == SLICE_P ? 1
+                   :                                             2;
+  WRITE_U(stream, pic_type, 3, "pic_type");
+}
+
+static void encoder_state_write_bitstream_aud(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  encoder_state_write_bitstream_access_unit_delimiter(encoder_state);
+  nal_write(stream, AUD_NUT, 0, 1);
+  bitstream_align(stream);
+}
+
+static void encoder_state_write_bitstream_PTL(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  int i;
+  // PTL
+  // Profile Tier
+  WRITE_U(stream, 0, 2, "general_profile_space");
+  WRITE_U(stream, 0, 1, "general_tier_flag");
+  // Main Profile == 1
+  WRITE_U(stream, 1, 5, "general_profile_idc");
+  /* Compatibility flags should be set at general_profile_idc
+   *  (so with general_profile_idc = 1, compatibility_flag[1] should be 1)
+   * According to specification, when compatibility_flag[1] is set,
+   *  compatibility_flag[2] should be set too.
+   */
+  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag[]");
+
+  WRITE_U(stream, 1, 1, "general_progressive_source_flag");
+  WRITE_U(stream, 0, 1, "general_interlaced_source_flag");
+  WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
+  WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
+
+  WRITE_U(stream, 0, 32, "XXX_reserved_zero_44bits[0..31]");
+  WRITE_U(stream, 0, 12, "XXX_reserved_zero_44bits[32..43]");
+
+  // end Profile Tier
+
+  // Level 6.2 (general_level_idc is 30 * 6.2)
+  WRITE_U(stream, 186, 8, "general_level_idc");
+
+  WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
+  WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
+
+  for (i = 1; i < 8; i++) {
+    WRITE_U(stream, 0, 2, "reserved_zero_2bits");
+  }
+
+  // end PTL
+}
+
+static void encoder_state_write_bitstream_vid_parameter_set(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  int i;
+#ifdef _DEBUG
+  printf("=========== Video Parameter Set ID: 0 ===========\n");
+#endif
+
+  WRITE_U(stream, 0, 4, "vps_video_parameter_set_id");
+  WRITE_U(stream, 3, 2, "vps_reserved_three_2bits" );
+  WRITE_U(stream, 0, 6, "vps_reserved_zero_6bits" );
+  WRITE_U(stream, 1, 3, "vps_max_sub_layers_minus1");
+  WRITE_U(stream, 0, 1, "vps_temporal_id_nesting_flag");
+  WRITE_U(stream, 0xffff, 16, "vps_reserved_ffff_16bits");
+
+  encoder_state_write_bitstream_PTL(encoder_state);
+
+  WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
+
+  //for each layer
+  for (i = 0; i < 1; i++) {
+  WRITE_UE(stream, 1, "vps_max_dec_pic_buffering");
+  WRITE_UE(stream, 0, "vps_num_reorder_pics");
+  WRITE_UE(stream, 0, "vps_max_latency_increase");
+  }
+
+  WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
+  WRITE_UE(stream, 0, "vps_max_op_sets_minus1");
+  WRITE_U(stream, 0, 1, "vps_timing_info_present_flag");
+
+  //IF timing info
+  //END IF
+
+  WRITE_U(stream, 0, 1, "vps_extension_flag");
+}
+
+static void encoder_state_write_bitstream_scaling_list(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+  uint32_t size_id;
+  for (size_id = 0; size_id < SCALING_LIST_SIZE_NUM; size_id++) {
+    int32_t list_id;
+    for (list_id = 0; list_id < g_scaling_list_num[size_id]; list_id++) {
+      uint8_t scaling_list_pred_mode_flag = 1;
+      int32_t pred_list_idx;
+      int32_t i;
+      uint32_t ref_matrix_id = UINT32_MAX;
+
+      for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) {
+        const int32_t * const pred_list  = (list_id == pred_list_idx) ?
+                                     scalinglist_get_default(size_id, pred_list_idx) :
+                                     encoder->scaling_list.scaling_list_coeff[size_id][pred_list_idx];
+
+        if (!memcmp(encoder->scaling_list.scaling_list_coeff[size_id][list_id], pred_list, sizeof(int32_t) * MIN(8, g_scaling_list_size[size_id])) &&
+            ((size_id < SCALING_LIST_16x16) ||
+             (encoder->scaling_list.scaling_list_dc[size_id][list_id] == encoder->scaling_list.scaling_list_dc[size_id][pred_list_idx]))) {
+          ref_matrix_id = pred_list_idx;
+          scaling_list_pred_mode_flag = 0;
+          break;
+        }
+      }
+      WRITE_U(stream, scaling_list_pred_mode_flag, 1, "scaling_list_pred_mode_flag" );
+
+      if (!scaling_list_pred_mode_flag) {
+        WRITE_UE(stream, list_id - ref_matrix_id, "scaling_list_pred_matrix_id_delta");
+      } else {
+        int32_t delta;
+        const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, g_scaling_list_size[size_id]);
+        const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32;
+        int32_t next_coef = 8;
+        const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeff[size_id][list_id];
+
+        if (size_id >= SCALING_LIST_16x16) {
+          WRITE_SE(stream, encoder->scaling_list.scaling_list_dc[size_id][list_id] - 8, "scaling_list_dc_coef_minus8");
+          next_coef = encoder->scaling_list.scaling_list_dc[size_id][list_id];
+        }
+
+        for (i = 0; i < coef_num; i++) {
+          delta     = coef_list[scan_cg[i]] - next_coef;
+          next_coef = coef_list[scan_cg[i]];
+          if (delta > 127)
+            delta -= 256;
+          if (delta < -128)
+            delta += 256;
+
+          WRITE_SE(stream, delta, "scaling_list_delta_coef");
+        }
+      }
+    }
+  }
+}
+
+
+static void encoder_state_write_bitstream_VUI(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  const encoder_control * const encoder = encoder_state->encoder_control;
+#ifdef _DEBUG
+  printf("=========== VUI Set ID: 0 ===========\n");
+#endif
+  if (encoder->vui.sar_width > 0 && encoder->vui.sar_height > 0) {
+    int i;
+    static const struct
+    {
+      uint8_t width;
+      uint8_t height;
+      uint8_t idc;
+    } sar[] = {
+      // aspect_ratio_idc = 0 -> unspecified
+      {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
+      { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
+      { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
+      {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
+      // aspect_ratio_idc = [17..254] -> reserved
+      { 0, 0, 255 }
+    };
+
+    for (i = 0; sar[i].idc != 255; i++)
+      if (sar[i].width  == encoder->vui.sar_width &&
+          sar[i].height == encoder->vui.sar_height)
+        break;
+
+    WRITE_U(stream, 1, 1, "aspect_ratio_info_present_flag");
+    WRITE_U(stream, sar[i].idc, 8, "aspect_ratio_idc");
+    if (sar[i].idc == 255) {
+      // EXTENDED_SAR
+      WRITE_U(stream, encoder->vui.sar_width, 16, "sar_width");
+      WRITE_U(stream, encoder->vui.sar_height, 16, "sar_height");
+    }
+  } else
+    WRITE_U(stream, 0, 1, "aspect_ratio_info_present_flag");
+
+  //IF aspect ratio info
+  //ENDIF
+
+  if (encoder->vui.overscan > 0) {
+    WRITE_U(stream, 1, 1, "overscan_info_present_flag");
+    WRITE_U(stream, encoder->vui.overscan - 1, 1, "overscan_appropriate_flag");
+  } else
+    WRITE_U(stream, 0, 1, "overscan_info_present_flag");
+
+  //IF overscan info
+  //ENDIF
+
+  if (encoder->vui.videoformat != 5 || encoder->vui.fullrange ||
+      encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
+      encoder->vui.colormatrix != 2) {
+    WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
+    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
+    WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
+
+    if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
+        encoder->vui.colormatrix != 2) {
+      WRITE_U(stream, 1, 1, "colour_description_present_flag");
+      WRITE_U(stream, encoder->vui.colorprim, 8, "colour_primaries");
+      WRITE_U(stream, encoder->vui.transfer, 8, "transfer_characteristics");
+      WRITE_U(stream, encoder->vui.colormatrix, 8, "matrix_coeffs");
+    } else
+      WRITE_U(stream, 0, 1, "colour_description_present_flag");
+  } else
+    WRITE_U(stream, 0, 1, "video_signal_type_present_flag");
+
+  //IF video type
+  //ENDIF
+
+  if (encoder->vui.chroma_loc > 0) {
+    WRITE_U(stream, 1, 1, "chroma_loc_info_present_flag");
+    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_top_field");
+    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_bottom_field");
+  } else
+    WRITE_U(stream, 0, 1, "chroma_loc_info_present_flag");
+
+  //IF chroma loc info
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "neutral_chroma_indication_flag");
+  WRITE_U(stream, 0, 1, "field_seq_flag");
+  WRITE_U(stream, 0, 1, "frame_field_info_present_flag");
+  WRITE_U(stream, 0, 1, "default_display_window_flag");
+
+  //IF default display window
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "vui_timing_info_present_flag");
+
+  //IF timing info
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "bitstream_restriction_flag");
+
+  //IF bitstream restriction
+  //ENDIF
+}
+
+static void encoder_state_write_bitstream_seq_parameter_set(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  //FIXME: use encoder_control instead of cur_pic
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+
+#ifdef _DEBUG
+  printf("=========== Sequence Parameter Set ID: 0 ===========\n");
+#endif
+
+  // TODO: profile IDC and level IDC should be defined later on
+  WRITE_U(stream, 0, 4, "sps_video_parameter_set_id");
+  WRITE_U(stream, 1, 3, "sps_max_sub_layers_minus1");
+  WRITE_U(stream, 0, 1, "sps_temporal_id_nesting_flag");
+
+  encoder_state_write_bitstream_PTL(encoder_state);
+
+  WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
+  WRITE_UE(stream, encoder_state->encoder_control->in.video_format,
+           "chroma_format_idc");
+
+  if (encoder_state->encoder_control->in.video_format == 3) {
+    WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
+  }
+
+  WRITE_UE(stream, cur_pic->width, "pic_width_in_luma_samples");
+  WRITE_UE(stream, cur_pic->height, "pic_height_in_luma_samples");
+
+  if (cur_pic->width != encoder_state->encoder_control->in.real_width || cur_pic->height != encoder_state->encoder_control->in.real_height) {
+    // The standard does not seem to allow setting conf_win values such that
+    // the number of luma samples is not a multiple of 2. Options are to either
+    // hide one line or show an extra line of non-video. Neither seems like a
+    // very good option, so let's not even try.
+    assert(!(cur_pic->width % 2));
+    WRITE_U(stream, 1, 1, "conformance_window_flag");
+    WRITE_UE(stream, 0, "conf_win_left_offset");
+    WRITE_UE(stream, (cur_pic->width - encoder_state->encoder_control->in.real_width) >> 1,
+             "conf_win_right_offset");
+    WRITE_UE(stream, 0, "conf_win_top_offset");
+    WRITE_UE(stream, (cur_pic->height - encoder_state->encoder_control->in.real_height) >> 1,
+             "conf_win_bottom_offset");
+  } else {
+    WRITE_U(stream, 0, 1, "conformance_window_flag");
+  }
+
+  //IF window flag
+  //END IF
+
+  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_luma_minus8");
+  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_chroma_minus8");
+  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
+  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
+
+  //for each layer
+  WRITE_UE(stream, 0, "sps_max_dec_pic_buffering");
+  WRITE_UE(stream, 0, "sps_num_reorder_pics");
+  WRITE_UE(stream, 0, "sps_max_latency_increase");
+  //end for
+
+  WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3");
+  WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
+  WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
+  WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
+  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
+  WRITE_UE(stream, TR_DEPTH_INTRA, "max_transform_hierarchy_depth_intra");
+
+  // scaling list
+  WRITE_U(stream, encoder_state->encoder_control->scaling_list.enable, 1, "scaling_list_enable_flag");
+  if (encoder_state->encoder_control->scaling_list.enable) {
+    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
+    encoder_state_write_bitstream_scaling_list(encoder_state);
+  }
+
+  WRITE_U(stream, 0, 1, "amp_enabled_flag");
+  WRITE_U(stream, encoder_state->encoder_control->sao_enable ? 1 : 0, 1,
+          "sample_adaptive_offset_enabled_flag");
+  WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
+  #if ENABLE_PCM == 1
+    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_luma_minus1");
+    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_chroma_minus1");
+    WRITE_UE(stream, 0, "log2_min_pcm_coding_block_size_minus3");
+    WRITE_UE(stream, 2, "log2_diff_max_min_pcm_coding_block_size");
+    WRITE_U(stream, 1, 1, "pcm_loop_filter_disable_flag");
+  #endif
+
+  WRITE_UE(stream, 0, "num_short_term_ref_pic_sets");
+
+  //IF num short term ref pic sets
+  //ENDIF
+
+  WRITE_U(stream, 0, 1, "long_term_ref_pics_present_flag");
+
+  //IF long_term_ref_pics_present
+  //ENDIF
+
+  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
+          "sps_temporal_mvp_enable_flag");
+  WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
+  WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
+
+  encoder_state_write_bitstream_VUI(encoder_state);
+
+  WRITE_U(stream, 0, 1, "sps_extension_flag");
+}
+
+static void encoder_state_write_bitstream_pic_parameter_set(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+#ifdef _DEBUG
+  printf("=========== Picture Parameter Set ID: 0 ===========\n");
+#endif
+  WRITE_UE(stream, 0, "pic_parameter_set_id");
+  WRITE_UE(stream, 0, "seq_parameter_set_id");
+  WRITE_U(stream, 0, 1, "dependent_slice_segments_enabled_flag");
+  WRITE_U(stream, 0, 1, "output_flag_present_flag");
+  WRITE_U(stream, 0, 3, "num_extra_slice_header_bits");
+  WRITE_U(stream, ENABLE_SIGN_HIDING, 1, "sign_data_hiding_flag");
+  WRITE_U(stream, 0, 1, "cabac_init_present_flag");
+
+  WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
+  WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
+  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
+  WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
+  WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
+  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
+  //if cu_qp_delta_enabled_flag
+  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
+
+  //TODO: add QP offsets
+  WRITE_SE(stream, 0, "pps_cb_qp_offset");
+  WRITE_SE(stream, 0, "pps_cr_qp_offset");
+  WRITE_U(stream, 0, 1, "pps_slice_chroma_qp_offsets_present_flag");
+  WRITE_U(stream, 0, 1, "weighted_pred_flag");
+  WRITE_U(stream, 0, 1, "weighted_bipred_idc");
+
+  //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
+  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
+  WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
+  //wavefronts
+  WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
+
+  if (encoder->tiles_enable) {
+    WRITE_UE(stream, encoder->tiles_num_tile_columns - 1, "num_tile_columns_minus1");
+    WRITE_UE(stream, encoder->tiles_num_tile_rows - 1, "num_tile_rows_minus1");
+    
+    WRITE_U(stream, encoder->tiles_uniform_spacing_flag, 1, "uniform_spacing_flag");
+    
+    if (!encoder->tiles_uniform_spacing_flag) {
+      int i;
+      for (i = 0; i < encoder->tiles_num_tile_columns - 1; ++i) {
+        WRITE_UE(stream, encoder->tiles_col_width[i] - 1, "column_width_minus1[...]");
+      }
+      for (i = 0; i < encoder->tiles_num_tile_rows - 1; ++i) {
+        WRITE_UE(stream, encoder->tiles_row_height[i] - 1, "row_height_minus1[...]");
+      }
+    }
+    WRITE_U(stream, 0, 1, "loop_filter_across_tiles_enabled_flag");
+    
+  }
+  
+  WRITE_U(stream, 0, 1, "loop_filter_across_slice_flag");
+  WRITE_U(stream, 1, 1, "deblocking_filter_control_present_flag");
+
+  //IF deblocking_filter
+    WRITE_U(stream, 0, 1, "deblocking_filter_override_enabled_flag");
+  WRITE_U(stream, encoder_state->encoder_control->deblock_enable ? 0 : 1, 1,
+          "pps_disable_deblocking_filter_flag");
+
+    //IF !disabled
+  if (encoder_state->encoder_control->deblock_enable) {
+     WRITE_SE(stream, encoder_state->encoder_control->beta_offset_div2, "beta_offset_div2");
+     WRITE_SE(stream, encoder_state->encoder_control->tc_offset_div2, "tc_offset_div2");
+    }
+
+    //ENDIF
+  //ENDIF
+  WRITE_U(stream, 0, 1, "pps_scaling_list_data_present_flag");
+  //IF scaling_list
+  //ENDIF
+  WRITE_U(stream, 0, 1, "lists_modification_present_flag");
+  WRITE_UE(stream, 0, "log2_parallel_merge_level_minus2");
+  WRITE_U(stream, 0, 1, "slice_segment_header_extension_present_flag");
+  WRITE_U(stream, 0, 1, "pps_extension_flag");
+}
+
+static void encoder_state_write_bitstream_prefix_sei_version(encoder_state * const encoder_state)
+{
+#define STR_BUF_LEN 1000
+  bitstream * const stream = &encoder_state->stream;
+  int i, length;
+  char buf[STR_BUF_LEN] = { 0 };
+  char *s = buf + 16;
+  const config * const cfg = encoder_state->encoder_control->cfg;
+
+  // random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
+  static const uint8_t uuid[16] = {
+    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
+    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1
+  };
+  memcpy(buf, uuid, 16);
+
+  // user_data_payload_byte
+  s += sprintf(s, "Kvazaar HEVC Encoder v. " VERSION_STRING " - "
+                  "Copyleft 2012-2014 - http://ultravideo.cs.tut.fi/ - options:");
+  s += sprintf(s, " %dx%d", cfg->width, cfg->height);
+  s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
+               cfg->deblock_beta, cfg->deblock_tc);
+  s += sprintf(s, " sao=%d", cfg->sao_enable);
+  s += sprintf(s, " intra_period=%d", cfg->intra_period);
+  s += sprintf(s, " qp=%d", cfg->qp);
+  s += sprintf(s, " ref=%d", cfg->ref_frames);
+
+  length = (int)(s - buf + 1);  // length, +1 for \0
+
+  // Assert this so that in the future if the message gets longer, we remember
+  // to increase the buf len. Divide by 2 for margin.
+  assert(length < STR_BUF_LEN / 2);
+
+  // payloadType = 5 -> user_data_unregistered
+  WRITE_U(stream, 5, 8, "last_payload_type_byte");
+
+  // payloadSize
+  for (i = 0; i <= length - 255; i += 255)
+    WRITE_U(stream, 255, 8, "ff_byte");
+  WRITE_U(stream, length - i, 8, "last_payload_size_byte");
+
+  for (i = 0; i < length; i++)
+    WRITE_U(stream, ((uint8_t *)buf)[i], 8, "sei_payload");
+
+#undef STR_BUF_LEN
+}
+
+static void encoder_state_entry_points_explore(const encoder_state * const encoder_state, int * const r_count, int * const r_max_length) {
+  int i;
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    if (encoder_state->children[i].is_leaf) {
+      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
+      ++(*r_count);
+      if (my_length > *r_max_length) {
+        *r_max_length = my_length;
+      }
+    } else {
+      encoder_state_entry_points_explore(&encoder_state->children[i], r_count, r_max_length);
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_entry_points_write(bitstream * const stream, const encoder_state * const encoder_state, const int num_entry_points, const int write_length, int * const r_count) {
+  int i;
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    if (encoder_state->children[i].is_leaf) {
+      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
+      ++(*r_count);
+      //Don't write the last one
+      if (*r_count < num_entry_points) {
+        WRITE_U(stream, my_length - 1, write_length, "entry_point_offset-minus1")
+      }
+    } else {
+      encoder_state_write_bitstream_entry_points_write(stream, &encoder_state->children[i], num_entry_points, write_length, r_count);
+    }
+  }
+}
+
+static int num_bitcount(unsigned int n) {
+  int pos = 0;
+  if (n >= 1<<16) { n >>= 16; pos += 16; }
+  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
+  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
+  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
+  if (n >= 1<< 1) {           pos +=  1; }
+  return ((n == 0) ? (-1) : pos);
+}
+
+void encoder_state_write_bitstream_slice_header(encoder_state * const encoder_state)
+{
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  bitstream * const stream = &encoder_state->stream;
+
+#ifdef _DEBUG
+  printf("=========== Slice ===========\n");
+#endif
+  WRITE_U(stream, (encoder_state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
+
+  if (encoder_state->global->pictype >= NAL_BLA_W_LP
+      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
+    WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
+  }
+
+  WRITE_UE(stream, 0, "slice_pic_parameter_set_id");
+  if (encoder_state->slice->start_in_rs > 0) {
+    //For now, we don't support dependent slice segments
+    //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
+    WRITE_UE(stream, encoder_state->slice->start_in_rs, "slice_segment_address");
+  }
+
+  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
+
+  // if !entropy_slice_flag
+
+    //if output_flag_present_flag
+      //WRITE_U(stream, 1, 1, "pic_output_flag");
+    //end if
+    //if( IdrPicFlag ) <- nal_unit_type == 5
+  if (encoder_state->global->pictype != NAL_IDR_W_RADL
+      && encoder_state->global->pictype != NAL_IDR_N_LP) {
+      int j;
+      int ref_negative = encoder_state->global->ref->used_size;
+      int ref_positive = 0;
+      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
+      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
+      WRITE_UE(stream, ref_negative, "num_negative_pics");
+      WRITE_UE(stream, ref_positive, "num_positive_pics");
+
+    for (j = 0; j < ref_negative; j++) {
+      int32_t delta_poc_minus1 = 0;
+      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
+      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
+    }
+
+    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
+  }
+
+    //end if
+  //end if
+  if (encoder->sao_enable) {
+    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
+    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
+  }
+
+  if (encoder_state->global->slicetype != SLICE_I) {
+      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
+        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
+      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
+  }
+
+  if (encoder_state->global->slicetype == SLICE_B) {
+      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
+  }
+
+  // Skip flags that are not present
+  // if !entropy_slice_flag
+    WRITE_SE(stream, 0, "slice_qp_delta");
+    //WRITE_U(stream, 1, 1, "alignment");
+   
+  if (encoder->tiles_enable || encoder->wpp) {
+    int num_entry_points = 0;
+    int max_length_seen = 0;
+    
+    encoder_state_entry_points_explore(encoder_state, &num_entry_points, &max_length_seen);
+    
+    WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets");
+    if (num_entry_points > 0) {
+      int entry_points_written = 0;
+      int offset_len = num_bitcount(max_length_seen) + 1;
+      WRITE_UE(stream, offset_len - 1, "offset_len_minus1");
+      encoder_state_write_bitstream_entry_points_write(stream, encoder_state, num_entry_points, offset_len, &entry_points_written); 
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
+  const encoder_control * const encoder = main_state->encoder_control;
+  bitstream * const stream = &main_state->stream;
+
+  int i;
+
+  if (main_state->global->is_radl_frame) {
+    // Access Unit Delimiter (AUD)
+    if (encoder->aud_enable)
+      encoder_state_write_bitstream_aud(main_state);
+
+    // Video Parameter Set (VPS)
+    nal_write(stream, NAL_VPS_NUT, 0, 1);
+    encoder_state_write_bitstream_vid_parameter_set(main_state);
+    bitstream_align(stream);
+
+    // Sequence Parameter Set (SPS)
+    nal_write(stream, NAL_SPS_NUT, 0, 1);
+    encoder_state_write_bitstream_seq_parameter_set(main_state);
+    bitstream_align(stream);
+
+    // Picture Parameter Set (PPS)
+    nal_write(stream, NAL_PPS_NUT, 0, 1);
+    encoder_state_write_bitstream_pic_parameter_set(main_state);
+    bitstream_align(stream);
+
+    if (main_state->global->frame == 0) {
+      // Prefix SEI
+      nal_write(stream, PREFIX_SEI_NUT, 0, 0);
+      encoder_state_write_bitstream_prefix_sei_version(main_state);
+      bitstream_align(stream);
+    }
+  } else {
+    // Access Unit Delimiter (AUD)
+    if (encoder->aud_enable)
+      encoder_state_write_bitstream_aud(main_state);
+  }
+
+  {
+    // Not quite sure if this is correct, but it seems to have worked so far
+    // so I tried to not change it's behavior.
+    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
+
+    nal_write(stream,
+              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
+  }
+  {
+    PERFORMANCE_MEASURE_START();
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+    //FIXME: Move this...
+    bitstream_clear(&main_state->children[i].stream);
+  }
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  {
+    PERFORMANCE_MEASURE_START();
+    // Calculate checksum
+    add_checksum(main_state);
+    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
+  }
+  
+  //FIXME: Why is this needed?
+  main_state->tile->cur_pic->poc = main_state->global->poc;
+}
+
+static void encoder_state_worker_write_bitstream_leaf(void * opaque) {
+  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
+}
+
+static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  //Write terminator of the leaf
+  assert(encoder_state->is_leaf);
+  
+  //Last LCU
+  {
+    const lcu_order_element * const lcu = &encoder_state->lcu_order[encoder_state->lcu_order_count - 1];
+    const int lcu_addr_in_ts = lcu->id + encoder_state->tile->lcu_offset_in_ts;
+    const int end_of_slice_segment_flag = lcu_at_slice_end(encoder, lcu_addr_in_ts);
+  
+    cabac_encode_bin_trm(&encoder_state->cabac, end_of_slice_segment_flag);  // end_of_slice_segment_flag
+  
+    if (!end_of_slice_segment_flag) {
+      assert(lcu_at_tile_end(encoder, lcu_addr_in_ts) || lcu->position.x == (encoder_state->tile->cur_pic->width_in_lcu - 1));
+      cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
+      cabac_flush(&encoder_state->cabac);
+    } else {
+      cabac_flush(&encoder_state->cabac);
+      bitstream_align(&encoder_state->stream);
+    }
+  }
+}
+
+static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
+  //If it's not a leaf, a tile is "nothing". We only have to write sub elements
+  int i;
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+  }
+}
+
+static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
+  int i;
+  encoder_state_write_bitstream_slice_header(main_state);
+  bitstream_align(&main_state->stream); 
+  
+  for (i = 0; main_state->children[i].encoder_control; ++i) {
+    //Append bitstream to main stream
+    bitstream_append(&main_state->stream, &main_state->children[i].stream);
+  }
+}
+
+
+static void encoder_state_write_bitstream(encoder_state * const main_state) {
+  int i;
+  if (!main_state->is_leaf) {
+    for (i=0; main_state->children[i].encoder_control; ++i) {
+      encoder_state *sub_state = &(main_state->children[i]);
+      encoder_state_write_bitstream(sub_state);
+    }
+    
+    switch (main_state->type) {
+      case ENCODER_STATE_TYPE_MAIN:
+        encoder_state_write_bitstream_main(main_state);
+        break;
+      case ENCODER_STATE_TYPE_TILE:
+        encoder_state_write_bitstream_tile(main_state);
+        break;
+      case ENCODER_STATE_TYPE_SLICE:
+        encoder_state_write_bitstream_slice(main_state);
+        break;
+      default:
+        fprintf(stderr, "Unsupported node type %c!\n", main_state->type);
+        assert(0);
+    }
+  }
+}
+
diff --git a/src/_encoderstatectorsdtors.c b/src/_encoderstatectorsdtors.c
new file mode 100644
index 00000000..44e4c0fd
--- /dev/null
+++ b/src/_encoderstatectorsdtors.c
@@ -0,0 +1,672 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+//This file MUST NOT BE COMPILED directly. It's included in encoderstate.c
+
+static int encoder_state_config_global_init(encoder_state * const encoder_state) {
+  encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
+  if(!encoder_state->global->ref) {
+    fprintf(stderr, "Failed to allocate the picture list!\n");
+    return 0;
+  }
+  encoder_state->global->ref_list = REF_PIC_LIST_0;
+  encoder_state->global->frame = 0;
+  encoder_state->global->poc = 0;
+  return 1;
+}
+
+static void encoder_state_config_global_finalize(encoder_state * const encoder_state) {
+  picture_list_destroy(encoder_state->global->ref);
+}
+
+static int encoder_state_config_tile_init(encoder_state * const encoder_state, 
+                                          const int lcu_offset_x, const int lcu_offset_y,
+                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
+  
+  const encoder_control * const encoder = encoder_state->encoder_control;
+  encoder_state->tile->cur_pic = picture_alloc(width, height, width_in_lcu, height_in_lcu);
+
+  if (!encoder_state->tile->cur_pic) {
+    printf("Error allocating picture!\r\n");
+    return 0;
+  }
+  
+  // Init coeff data table
+  //FIXME: move them
+  encoder_state->tile->cur_pic->coeff_y = MALLOC(coefficient, width * height);
+  encoder_state->tile->cur_pic->coeff_u = MALLOC(coefficient, (width * height) >> 2);
+  encoder_state->tile->cur_pic->coeff_v = MALLOC(coefficient, (width * height) >> 2);
+  
+  encoder_state->tile->lcu_offset_x = lcu_offset_x;
+  encoder_state->tile->lcu_offset_y = lcu_offset_y;
+  
+  encoder_state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
+  
+  //Allocate buffers
+  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
+  encoder_state->tile->hor_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
+  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
+  encoder_state->tile->ver_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->height_in_lcu * encoder_state->tile->cur_pic->width_in_lcu);
+  
+  if (encoder->sao_enable) {
+    encoder_state->tile->hor_buf_before_sao = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
+  } else {
+    encoder_state->tile->hor_buf_before_sao = NULL;
+  }
+  
+  if (encoder->wpp) {
+    encoder_state->tile->wf_jobs = MALLOC(threadqueue_job*, encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
+    if (!encoder_state->tile->wf_jobs) {
+      printf("Error allocating wf_jobs array!\n");
+      return 0;
+    }
+  } else {
+    encoder_state->tile->wf_jobs = NULL;
+  }
+  
+  encoder_state->tile->id = encoder->tiles_tile_id[encoder_state->tile->lcu_offset_in_ts];
+  return 1;
+}
+
+static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) {
+  if (encoder_state->tile->hor_buf_before_sao) yuv_t_free(encoder_state->tile->hor_buf_before_sao);
+  
+  yuv_t_free(encoder_state->tile->hor_buf_search);
+  yuv_t_free(encoder_state->tile->ver_buf_search);
+  
+  picture_free(encoder_state->tile->cur_pic);
+  encoder_state->tile->cur_pic = NULL;
+  
+  FREE_POINTER(encoder_state->tile->wf_jobs);
+}
+
+static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
+                                          const int start_address_in_ts, const int end_address_in_ts) {
+  int i = 0, slice_found=0;
+  for (i = 0; i < encoder_state->encoder_control->slice_count; ++i) {
+    if (encoder_state->encoder_control->slice_addresses_in_ts[i] == start_address_in_ts) {
+      encoder_state->slice->id = i;
+      slice_found = 1;
+      break;
+    }
+  }
+  assert(slice_found);
+  encoder_state->slice->start_in_ts = start_address_in_ts;
+  encoder_state->slice->end_in_ts = end_address_in_ts;
+  
+  encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
+  encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
+  return 1;
+}
+
+static void encoder_state_config_slice_finalize(encoder_state * const encoder_state) {
+  //Nothing to do (yet?)
+}
+
+static int encoder_state_config_wfrow_init(encoder_state * const encoder_state, 
+                                          const int lcu_offset_y) {
+  
+  encoder_state->wfrow->lcu_offset_y = lcu_offset_y;
+  return 1;
+}
+
+static void encoder_state_config_wfrow_finalize(encoder_state * const encoder_state) {
+  //Nothing to do (yet?)
+}
+
+#ifdef _DEBUG
+static void encoder_state_dump_graphviz(const encoder_state * const encoder_state) {
+  int i;
+  
+  if (!encoder_state->parent) {
+    const encoder_control * const encoder = encoder_state->encoder_control;
+    int y,x;
+    //Empty lines (easier to copy-paste)
+    printf("\n\n\n\n\n");
+    //Some styling...
+    printf("digraph EncoderStates {\n");
+    printf(" fontname = \"Bitstream Vera Sans\"\n");
+    printf(" fontsize = 8\n\n");
+    printf(" node [\n");
+    printf("  fontname = \"Bitstream Vera Sans\"\n");
+    printf("  fontsize = 8\n");
+    printf("  shape = \"record\"\n");
+    printf(" ]\n\n");
+    printf(" edge [\n");
+    printf("  arrowtail = \"empty\"\n");
+    printf(" ]\n\n");
+    
+    printf(" \"Map\" [\n");
+    printf("  shape=plaintext\n");
+    printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        
+        printf("<td>%d</td>", lcu_id_rs);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>TS Map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        
+        printf("<td>%d</td>", lcu_id_ts);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Tile map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        
+        printf("<td>%d</td>", encoder->tiles_tile_id[lcu_id_ts]);
+      }
+      printf("</tr>");
+    }
+    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Slice map</b></td></tr>", encoder->in.width_in_lcu);
+    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
+      printf("<tr>");
+      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
+        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
+        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
+        int slice_id = 0;
+        
+        //Not efficient, but who cares
+        for (i=0; i < encoder->slice_count; ++i) {
+          if (encoder->slice_addresses_in_ts[i] <= lcu_id_ts) {
+            slice_id = i;
+          }
+        }
+        
+        printf("<td>%d</td>", slice_id);
+      }
+      printf("</tr>");
+    }
+    printf("</table>>\n ]\n");
+  }
+  
+  printf(" \"%p\" [\n", encoder_state);
+  printf("  label = \"{encoder_state|");
+  printf("+ type=%c\\l", encoder_state->type);
+  if (!encoder_state->parent || encoder_state->global != encoder_state->parent->global) {
+    printf("|+ global\\l");
+  }
+  if (!encoder_state->parent || encoder_state->tile != encoder_state->parent->tile) {
+    printf("|+ tile\\l");
+    printf(" - id = %d\\l", encoder_state->tile->id);
+    printf(" - lcu_offset_x = %d\\l", encoder_state->tile->lcu_offset_x);
+    printf(" - lcu_offset_y = %d\\l", encoder_state->tile->lcu_offset_y);
+    printf(" - lcu_offset_in_ts = %d\\l", encoder_state->tile->lcu_offset_in_ts);
+  }
+  if (!encoder_state->parent || encoder_state->slice != encoder_state->parent->slice) {
+    printf("|+ slice\\l");
+    printf(" - id = %d\\l", encoder_state->slice->id);
+    printf(" - start_in_ts = %d\\l", encoder_state->slice->start_in_ts);
+    printf(" - end_in_ts = %d\\l", encoder_state->slice->end_in_ts);
+    printf(" - start_in_rs = %d\\l", encoder_state->slice->start_in_rs);
+    printf(" - end_in_rs = %d\\l", encoder_state->slice->end_in_rs);
+  }
+  if (!encoder_state->parent || encoder_state->wfrow != encoder_state->parent->wfrow) {
+    printf("|+ wfrow\\l");
+    printf(" - lcu_offset_y = %d\\l", encoder_state->wfrow->lcu_offset_y);
+  }
+  printf("}\"\n");
+  printf(" ]\n");
+  
+  if (encoder_state->parent) {
+    printf(" \"%p\" -> \"%p\"\n", encoder_state->parent, encoder_state);
+  }
+  
+  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+    encoder_state_dump_graphviz(&encoder_state->children[i]);
+  }
+  
+  if (!encoder_state->parent) {
+    printf("}\n");
+    //Empty lines (easier to copy-paste)
+    printf("\n\n\n\n\n");
+  }
+}
+#endif //_DEBUG
+
+int encoder_state_init(encoder_state * const child_state, encoder_state * const parent_state) {
+  //We require that, if parent_state is NULL:
+  //child_state->encoder_control is set
+  //
+  //If parent_state is not NULL, the following variable should either be set to NULL,
+  //in order to inherit from parent, or should point to a valid structure:
+  //child_state->global
+  //child_state->tile
+  //child_state->slice
+  //child_state->wfrow
+  
+  child_state->parent = parent_state;
+  child_state->children = MALLOC(encoder_state, 1);
+  child_state->children[0].encoder_control = NULL;
+  
+  if (!parent_state) {
+    const encoder_control * const encoder = child_state->encoder_control;
+    child_state->type = ENCODER_STATE_TYPE_MAIN;
+    assert(child_state->encoder_control);
+    child_state->global = MALLOC(encoder_state_config_global, 1);
+    if (!child_state->global || !encoder_state_config_global_init(child_state)) {
+      fprintf(stderr, "Could not initialize encoder_state->global!\n");
+      return 0;
+    }
+    child_state->tile = MALLOC(encoder_state_config_tile, 1);
+    if (!child_state->tile || !encoder_state_config_tile_init(child_state, 0, 0, encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu)) {
+      fprintf(stderr, "Could not initialize encoder_state->tile!\n");
+      return 0;
+    }
+    child_state->slice = MALLOC(encoder_state_config_slice, 1);
+    if (!child_state->slice || !encoder_state_config_slice_init(child_state, 0, encoder->in.width_in_lcu * encoder->in.height_in_lcu - 1)) {
+      fprintf(stderr, "Could not initialize encoder_state->slice!\n");
+      return 0;
+    }
+    child_state->wfrow = MALLOC(encoder_state_config_wfrow, 1);
+    if (!child_state->wfrow || !encoder_state_config_wfrow_init(child_state, 0)) {
+      fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
+      return 0;
+    }
+  } else {
+    child_state->encoder_control = parent_state->encoder_control;
+    if (!child_state->global) child_state->global = parent_state->global;
+    if (!child_state->tile) child_state->tile = parent_state->tile;
+    if (!child_state->slice) child_state->slice = parent_state->slice;
+    if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow;
+  }
+  
+  //Allocate bitstream
+  if (child_state->type == ENCODER_STATE_TYPE_MAIN) {
+    //Main encoder outputs to file
+    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_FILE)) {
+      fprintf(stderr, "Could not initialize stream!\n");
+      return 0;
+    }
+    child_state->stream.file.output = child_state->encoder_control->out.file;
+  } else {
+    //Other encoders use a memory bitstream
+    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_MEMORY)) {
+      fprintf(stderr, "Could not initialize stream!\n");
+      return 0;
+    }
+  }
+  
+  // Set CABAC output bitstream
+  child_state->cabac.stream = &child_state->stream;
+  
+  //Create sub-encoders
+  {
+    const encoder_control * const encoder = child_state->encoder_control;
+    int child_count = 0;
+    //We first check the type of this element.
+    //If it's a MAIN, it can allow both slices or tiles as child
+    //If it's a TILE, it can allow slices as child, if its parent is not a slice, or wavefront rows if there is no other children
+    //If it's a SLICE, it can allow tiles as child, if its parent is not a tile, or wavefront rows if there is no other children
+    //If it's a WAVEFRONT_ROW, it doesn't allow any children
+    int children_allow_wavefront_row = 0;
+    int children_allow_slice = 0;
+    int children_allow_tile = 0;
+    int range_start;
+    
+    int start_in_ts, end_in_ts;
+    
+    switch(child_state->type) {
+      case ENCODER_STATE_TYPE_MAIN:
+        children_allow_slice = 1;
+        children_allow_tile = 1;
+        start_in_ts = 0;
+        end_in_ts = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
+        break;
+      case ENCODER_STATE_TYPE_SLICE:
+        assert(child_state->parent);
+        if (child_state->parent->type != ENCODER_STATE_TYPE_TILE) children_allow_tile = 1;
+        children_allow_wavefront_row = encoder->wpp;
+        start_in_ts = child_state->slice->start_in_ts;
+        end_in_ts = child_state->slice->end_in_ts;
+        break;
+      case ENCODER_STATE_TYPE_TILE:
+        assert(child_state->parent);
+        if (child_state->parent->type != ENCODER_STATE_TYPE_SLICE) children_allow_slice = 1;
+        children_allow_wavefront_row = encoder->wpp;
+        start_in_ts = child_state->tile->lcu_offset_in_ts;
+        end_in_ts = child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
+        break;
+      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
+        //GCC tries to be too clever...
+        start_in_ts = -1;
+        end_in_ts = -1;
+        break;
+      default:
+        fprintf(stderr, "Invalid encoder_state->type %d!\n", child_state->type);
+        assert(0);
+        return 0;
+    }
+    
+    range_start = start_in_ts;
+    //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
+    while (range_start < end_in_ts && (children_allow_slice || children_allow_tile)) {
+      encoder_state *new_child = NULL;
+      int range_end_slice = range_start; //Will be incremented to get the range of the "thing"
+      int range_end_tile = range_start; //Will be incremented to get the range of the "thing"
+      
+      int tile_allowed = lcu_at_tile_start(encoder, range_start) && children_allow_tile;
+      int slice_allowed = lcu_at_slice_start(encoder, range_start) && children_allow_slice;
+      
+      //Find the smallest structure following the cursor
+      if (slice_allowed) {
+        while(!lcu_at_slice_end(encoder, range_end_slice)) {
+          ++range_end_slice;
+        }
+      }
+      
+      if (tile_allowed) {
+        while(!lcu_at_tile_end(encoder, range_end_tile)) {
+          ++range_end_tile;
+        }
+      }
+      
+      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d end_in_ts=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed,end_in_ts);
+      
+      if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
+        //Create a slice
+        new_child = &child_state->children[child_count];
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_SLICE;
+        new_child->global = child_state->global;
+        new_child->tile = child_state->tile;
+        new_child->wfrow = child_state->wfrow;
+        new_child->slice = MALLOC(encoder_state_config_slice, 1);
+        if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) {
+          fprintf(stderr, "Could not initialize encoder_state->slice!\n");
+          return 0;
+        }
+      }
+      
+      if ((!slice_allowed || (range_end_slice < range_end_tile)) && !new_child && tile_allowed) {
+        //Create a tile
+        int tile_id = encoder->tiles_tile_id[range_start];
+        int tile_x = tile_id % encoder->tiles_num_tile_columns;
+        int tile_y = tile_id / encoder->tiles_num_tile_columns;
+        
+        int lcu_offset_x = encoder->tiles_col_bd[tile_x];
+        int lcu_offset_y = encoder->tiles_row_bd[tile_y];
+        int width_in_lcu = encoder->tiles_col_bd[tile_x+1]-encoder->tiles_col_bd[tile_x];
+        int height_in_lcu = encoder->tiles_row_bd[tile_y+1]-encoder->tiles_row_bd[tile_y];
+        int width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - lcu_offset_x * LCU_WIDTH);
+        int height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - lcu_offset_y * LCU_WIDTH);
+        
+        new_child = &child_state->children[child_count];
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_TILE;
+        new_child->global = child_state->global;
+        new_child->tile = MALLOC(encoder_state_config_tile, 1);
+        new_child->slice = child_state->slice;
+        new_child->wfrow = child_state->wfrow;
+        
+        if (!new_child->tile || !encoder_state_config_tile_init(new_child, lcu_offset_x, lcu_offset_y, width, height, width_in_lcu, height_in_lcu)) {
+          fprintf(stderr, "Could not initialize encoder_state->tile!\n");
+          return 0;
+        }
+      }
+      
+      if (new_child) {
+        child_state->children = realloc(child_state->children, sizeof(encoder_state) * (2+child_count));
+        child_state->children[1+child_count].encoder_control = NULL;
+        if (!child_state->children) {
+          fprintf(stderr, "Failed to allocate memory for children...\n");
+          return 0;
+        }
+
+        //Fix children parent (since we changed the address), except for the last one which is not ready yet
+        {
+          int i, j;
+          for (i = 0; child_state->children[i].encoder_control && i < child_count; ++i) {
+            for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
+              child_state->children[i].children[j].parent = &child_state->children[i];
+            }
+            for (j = 0; j < child_state->children[i].lcu_order_count; ++j) {
+              child_state->children[i].lcu_order[j].encoder_state = &child_state->children[i];
+            }
+            child_state->children[i].cabac.stream = &child_state->children[i].stream;
+          }
+        }
+        
+        if (!encoder_state_init(&child_state->children[child_count], child_state)) {
+          fprintf(stderr, "Unable to init child...\n");
+          return 0;
+        }
+        child_count += 1;
+      }
+      
+      range_start = MAX(range_end_slice, range_end_tile) + 1;
+    }
+    
+    //We create wavefronts only if we have no children
+    if (children_allow_wavefront_row && child_count == 0) {
+      int first_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
+      int last_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
+      int num_rows;
+      int i;
+      
+      assert(!(children_allow_slice || children_allow_tile));
+      assert(child_count == 0);
+      
+      for (i=start_in_ts; i<end_in_ts; ++i) {
+        const int row = encoder->tiles_ctb_addr_ts_to_rs[i] / encoder->in.width_in_lcu;
+        if (row < first_row) first_row = row;
+        if (row > last_row) last_row = row;
+      }
+      
+      num_rows = last_row - first_row + 1;
+      
+      //When entropy_coding_sync_enabled_flag is equal to 1 and the first coding tree block in a slice is not the first coding
+      //tree block of a row of coding tree blocks in a tile, it is a requirement of bitstream conformance that the last coding tree
+      //block in the slice shall belong to the same row of coding tree blocks as the first coding tree block in the slice.
+      
+      if (encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] % encoder->in.width_in_lcu != child_state->tile->lcu_offset_x) {
+        if (num_rows > 1) {
+          fprintf(stderr, "Invalid: first CTB in slice %d is not at the tile %d edge, and the slice spans on more than one row.\n", child_state->slice->id, child_state->tile->id);
+          return 0;
+        }
+      }
+      
+      //FIXME Do the same kind of check if we implement slice segments
+    
+      child_count = num_rows;
+      child_state->children = realloc(child_state->children, sizeof(encoder_state) * (num_rows + 1));
+      child_state->children[num_rows].encoder_control = NULL;
+      
+      for (i=0; i < num_rows; ++i) {
+        encoder_state *new_child = &child_state->children[i];
+        
+        new_child->encoder_control = encoder;
+        new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
+        new_child->global = child_state->global;
+        new_child->tile = child_state->tile;
+        new_child->slice = child_state->slice;
+        new_child->wfrow = MALLOC(encoder_state_config_wfrow, 1);
+        
+        if (!new_child->wfrow || !encoder_state_config_wfrow_init(new_child, i)) {
+          fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
+          return 0;
+        }
+        
+        if (!encoder_state_init(new_child, child_state)) {
+          fprintf(stderr, "Unable to init child...\n");
+          return 0;
+        }
+      }
+    }
+    
+    child_state->is_leaf = (child_count == 0);
+    //This node is a leaf, compute LCU-order
+    if (child_state->is_leaf) {
+      //All LCU computations are relative to the tile
+      //Remark: this could be optimized, but since it's run only once, it's better to do it in a understandable way.
+      
+      //By default, the full tile
+      int i;
+      int lcu_id;
+      int lcu_start = 0;
+      //End is the element AFTER the end (iterate < lcu_end)
+      int lcu_end = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
+      
+      //Restrict to the current slice if needed
+      lcu_start = MAX(lcu_start, child_state->slice->start_in_ts - child_state->tile->lcu_offset_in_ts);
+      lcu_end = MIN(lcu_end, child_state->slice->end_in_ts - child_state->tile->lcu_offset_in_ts + 1);
+      
+      //Restrict to the current wavefront row if needed
+      if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+        lcu_start = MAX(lcu_start, (child_state->wfrow->lcu_offset_y) * child_state->tile->cur_pic->width_in_lcu);
+        lcu_end = MIN(lcu_end, (child_state->wfrow->lcu_offset_y + 1) * child_state->tile->cur_pic->width_in_lcu);
+      }
+      
+      child_state->lcu_order_count = lcu_end - lcu_start;
+      child_state->lcu_order = MALLOC(lcu_order_element, child_state->lcu_order_count);
+      assert(child_state->lcu_order);
+      
+      for (i = 0; i < child_state->lcu_order_count; ++i) {
+        lcu_id = lcu_start + i;
+        child_state->lcu_order[i].encoder_state = child_state;
+        child_state->lcu_order[i].id = lcu_id;
+        child_state->lcu_order[i].index = i;
+        child_state->lcu_order[i].position.x = lcu_id % child_state->tile->cur_pic->width_in_lcu;
+        child_state->lcu_order[i].position.y = lcu_id / child_state->tile->cur_pic->width_in_lcu;
+        child_state->lcu_order[i].position_px.x = child_state->lcu_order[i].position.x * LCU_WIDTH;
+        child_state->lcu_order[i].position_px.y = child_state->lcu_order[i].position.y * LCU_WIDTH;
+        child_state->lcu_order[i].size.x = MIN(LCU_WIDTH, encoder->in.width - (child_state->tile->lcu_offset_x * LCU_WIDTH + child_state->lcu_order[i].position_px.x));
+        child_state->lcu_order[i].size.y = MIN(LCU_WIDTH, encoder->in.height - (child_state->tile->lcu_offset_y * LCU_WIDTH + child_state->lcu_order[i].position_px.y));
+        child_state->lcu_order[i].first_row = lcu_in_first_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        child_state->lcu_order[i].last_row = lcu_in_last_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        child_state->lcu_order[i].first_column = lcu_in_first_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        child_state->lcu_order[i].last_column = lcu_in_last_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
+        
+        child_state->lcu_order[i].above = NULL;
+        child_state->lcu_order[i].below = NULL;
+        child_state->lcu_order[i].left = NULL;
+        child_state->lcu_order[i].right = NULL;
+        
+        if (!child_state->lcu_order[i].first_row) {
+          //Find LCU above
+          if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
+            int j;
+            //For all previous wavefront rows
+            for (j=0; &child_state->parent->children[j] != child_state && child_state->parent->children[j].encoder_control; ++j) {
+              if (child_state->parent->children[j].wfrow->lcu_offset_y == child_state->wfrow->lcu_offset_y - 1) {
+                int k;
+                for (k=0; k < child_state->parent->children[j].lcu_order_count; ++k) {
+                  if (child_state->parent->children[j].lcu_order[k].position.x == child_state->lcu_order[i].position.x) {
+                    assert(child_state->parent->children[j].lcu_order[k].position.y == child_state->lcu_order[i].position.y - 1);
+                    child_state->lcu_order[i].above = &child_state->parent->children[j].lcu_order[k];
+                  }
+                }
+              }
+            }
+          } else {
+            child_state->lcu_order[i].above = &child_state->lcu_order[i-child_state->tile->cur_pic->width_in_lcu];
+          }
+          assert(child_state->lcu_order[i].above);
+          child_state->lcu_order[i].above->below = &child_state->lcu_order[i];
+        }
+        if (!child_state->lcu_order[i].first_column) {
+          child_state->lcu_order[i].left = &child_state->lcu_order[i-1];
+          assert(child_state->lcu_order[i].left->position.x == child_state->lcu_order[i].position.x - 1);
+          child_state->lcu_order[i].left->right = &child_state->lcu_order[i];
+        }
+      }
+    } else {
+      child_state->lcu_order_count = 0;
+      child_state->lcu_order = NULL;
+    }
+  }
+  
+  //Validate the structure
+  if (child_state->type == ENCODER_STATE_TYPE_TILE) {
+    if (child_state->tile->lcu_offset_in_ts < child_state->slice->start_in_ts) {
+      fprintf(stderr, "Tile %d starts before slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
+      return 0;
+    }
+    if (child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1 > child_state->slice->end_in_ts) {
+      fprintf(stderr, "Tile %d ends after slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
+      return 0;
+    }
+  }
+  
+  if (child_state->type == ENCODER_STATE_TYPE_SLICE) {
+    if (child_state->slice->start_in_ts < child_state->tile->lcu_offset_in_ts) {
+      fprintf(stderr, "Slice %d starts before tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
+      return 0;
+    }
+    if (child_state->slice->end_in_ts > child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1) {
+      fprintf(stderr, "Slice %d ends after tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
+      return 0;
+    }
+  }
+  
+  
+#ifdef _DEBUG
+  if (!parent_state) encoder_state_dump_graphviz(child_state);
+#endif //_DEBUG
+  return 1;
+}
+
+void encoder_state_finalize(encoder_state * const encoder_state) {
+  if (encoder_state->children) {
+    int i=0;
+    for (i = 0; encoder_state->children[i].encoder_control; ++i) {
+      encoder_state_finalize(&encoder_state->children[i]);
+    }
+    
+    FREE_POINTER(encoder_state->children);
+  }
+  
+  FREE_POINTER(encoder_state->lcu_order);
+  encoder_state->lcu_order_count = 0;
+  
+  if (!encoder_state->parent || (encoder_state->parent->wfrow != encoder_state->wfrow)) {
+    encoder_state_config_wfrow_finalize(encoder_state);
+    FREE_POINTER(encoder_state->wfrow);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->slice != encoder_state->slice)) {
+    encoder_state_config_slice_finalize(encoder_state);
+    FREE_POINTER(encoder_state->slice);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->tile != encoder_state->tile)) {
+    encoder_state_config_tile_finalize(encoder_state);
+    FREE_POINTER(encoder_state->tile);
+  }
+  
+  if (!encoder_state->parent || (encoder_state->parent->global != encoder_state->global)) {
+    encoder_state_config_global_finalize(encoder_state);
+    FREE_POINTER(encoder_state->global);
+  }
+  
+  bitstream_finalize(&encoder_state->stream);
+}
\ No newline at end of file
diff --git a/src/_encoderstategeometry.c b/src/_encoderstategeometry.c
new file mode 100644
index 00000000..faffbc2c
--- /dev/null
+++ b/src/_encoderstategeometry.c
@@ -0,0 +1,133 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+//This file MUST NOT BE COMPILED directly. It's included in encoderstate.c
+
+static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  int i;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  int i;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
+  if (encoder->tiles_tile_id[lcu_addr_in_ts - 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
+    return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  if (encoder->tiles_tile_id[lcu_addr_in_ts + 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
+    return 1;
+  }
+  return 0;
+}
+
+//Return 1 if the LCU is at the first row of a structure (tile or slice)
+static int lcu_in_first_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y) {
+    return 1;
+  }
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->start_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
+    return 1;
+  }
+  
+  //One row above is before the start of the slice => it's also a boundary
+  if (lcu_addr_in_rs - encoder_state->encoder_control->in.width_in_lcu < encoder_state->slice->start_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+//Return 1 if the LCU is at the first row of a structure (tile or slice)
+static int lcu_in_last_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y + encoder_state->tile->cur_pic->height_in_lcu - 1) {
+    return 1;
+  }
+  
+  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->end_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
+    return 1;
+  }
+  
+  //One row below is before the end of the slice => it's also a boundary
+  if (lcu_addr_in_rs + encoder_state->encoder_control->in.width_in_lcu > encoder_state->slice->end_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+
+//Return 1 if the LCU is at the first column of a structure (tile or slice)
+static int lcu_in_first_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  //First column of tile?
+  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x) {
+    return 1;
+  }
+  
+  //Slice start may not be aligned with the tile, so we need to allow this
+  if (lcu_addr_in_rs == encoder_state->slice->start_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
+
+//Return 1 if the LCU is at the last column of a structure (tile or slice)
+static int lcu_in_last_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
+  
+  //First column of tile?
+  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x + encoder_state->tile->cur_pic->width_in_lcu - 1) {
+    return 1;
+  }
+  
+  //Slice start may not be aligned with the tile, so we need to allow this
+  if (lcu_addr_in_rs == encoder_state->slice->end_in_rs) {
+    return 1;
+  }
+  
+  return 0;
+}
\ No newline at end of file
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 7640aa3d..f6f42227 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -45,7 +45,6 @@
 
 /* Local functions. */
 static void add_checksum(encoder_state *encoder);
-static void encode_VUI(encoder_state *encoder);
 static void encode_sao(encoder_state *encoder,
                        unsigned x_lcu, uint16_t y_lcu,
                        sao_info *sao_luma, sao_info *sao_chroma);
@@ -82,1545 +81,14 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
   encoder_state->global->cur_lambda_cost = lambda;
 }
 
-/******  BEGIN Functions to obtain geometry information from LCU  ******/
-static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  int i;
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == 0) return 1;
-  for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
-  }
-  return 0;
-}
+//Functions to obtain geometry information from LCU
+#include "_encoderstategeometry.c"
 
-static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  int i;
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
-  for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
-  }
-  return 0;
-}
-
-static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == 0) return 1;
-  if (encoder->tiles_tile_id[lcu_addr_in_ts - 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
-    return 1;
-  }
-  return 0;
-}
-
-static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
-  if (encoder->tiles_tile_id[lcu_addr_in_ts + 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
-    return 1;
-  }
-  return 0;
-}
-
-//Return 1 if the LCU is at the first row of a structure (tile or slice)
-static int lcu_in_first_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y) {
-    return 1;
-  }
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->start_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
-    return 1;
-  }
-  
-  //One row above is before the start of the slice => it's also a boundary
-  if (lcu_addr_in_rs - encoder_state->encoder_control->in.width_in_lcu < encoder_state->slice->start_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-//Return 1 if the LCU is at the first row of a structure (tile or slice)
-static int lcu_in_last_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y + encoder_state->tile->cur_pic->height_in_lcu - 1) {
-    return 1;
-  }
-  
-  if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->end_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
-    return 1;
-  }
-  
-  //One row below is before the end of the slice => it's also a boundary
-  if (lcu_addr_in_rs + encoder_state->encoder_control->in.width_in_lcu > encoder_state->slice->end_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-
-//Return 1 if the LCU is at the first column of a structure (tile or slice)
-static int lcu_in_first_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  //First column of tile?
-  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x) {
-    return 1;
-  }
-  
-  //Slice start may not be aligned with the tile, so we need to allow this
-  if (lcu_addr_in_rs == encoder_state->slice->start_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-//Return 1 if the LCU is at the last column of a structure (tile or slice)
-static int lcu_in_last_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
-  const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
-  //First column of tile?
-  if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x + encoder_state->tile->cur_pic->width_in_lcu - 1) {
-    return 1;
-  }
-  
-  //Slice start may not be aligned with the tile, so we need to allow this
-  if (lcu_addr_in_rs == encoder_state->slice->end_in_rs) {
-    return 1;
-  }
-  
-  return 0;
-}
-
-/******  END Functions to obtain geometry information from LCU  ******/
-
-
-/******  BEGIN Constructors/destructors  ******/
-static int encoder_state_config_global_init(encoder_state * const encoder_state) {
-  encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
-  if(!encoder_state->global->ref) {
-    fprintf(stderr, "Failed to allocate the picture list!\n");
-    return 0;
-  }
-  encoder_state->global->ref_list = REF_PIC_LIST_0;
-  encoder_state->global->frame = 0;
-  encoder_state->global->poc = 0;
-  return 1;
-}
-
-static void encoder_state_config_global_finalize(encoder_state * const encoder_state) {
-  picture_list_destroy(encoder_state->global->ref);
-}
-
-static int encoder_state_config_tile_init(encoder_state * const encoder_state, 
-                                          const int lcu_offset_x, const int lcu_offset_y,
-                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
-  
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  encoder_state->tile->cur_pic = picture_alloc(width, height, width_in_lcu, height_in_lcu);
-
-  if (!encoder_state->tile->cur_pic) {
-    printf("Error allocating picture!\r\n");
-    return 0;
-  }
-  
-  // Init coeff data table
-  //FIXME: move them
-  encoder_state->tile->cur_pic->coeff_y = MALLOC(coefficient, width * height);
-  encoder_state->tile->cur_pic->coeff_u = MALLOC(coefficient, (width * height) >> 2);
-  encoder_state->tile->cur_pic->coeff_v = MALLOC(coefficient, (width * height) >> 2);
-  
-  encoder_state->tile->lcu_offset_x = lcu_offset_x;
-  encoder_state->tile->lcu_offset_y = lcu_offset_y;
-  
-  encoder_state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
-  
-  //Allocate buffers
-  //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels
-  encoder_state->tile->hor_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
-  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
-  encoder_state->tile->ver_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->height_in_lcu * encoder_state->tile->cur_pic->width_in_lcu);
-  
-  if (encoder->sao_enable) {
-    encoder_state->tile->hor_buf_before_sao = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
-  } else {
-    encoder_state->tile->hor_buf_before_sao = NULL;
-  }
-  
-  if (encoder->wpp) {
-    encoder_state->tile->wf_jobs = MALLOC(threadqueue_job*, encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
-    if (!encoder_state->tile->wf_jobs) {
-      printf("Error allocating wf_jobs array!\n");
-      return 0;
-    }
-  } else {
-    encoder_state->tile->wf_jobs = NULL;
-  }
-  
-  encoder_state->tile->id = encoder->tiles_tile_id[encoder_state->tile->lcu_offset_in_ts];
-  return 1;
-}
-
-static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) {
-  if (encoder_state->tile->hor_buf_before_sao) yuv_t_free(encoder_state->tile->hor_buf_before_sao);
-  
-  yuv_t_free(encoder_state->tile->hor_buf_search);
-  yuv_t_free(encoder_state->tile->ver_buf_search);
-  
-  picture_free(encoder_state->tile->cur_pic);
-  encoder_state->tile->cur_pic = NULL;
-  
-  FREE_POINTER(encoder_state->tile->wf_jobs);
-}
-
-static int encoder_state_config_slice_init(encoder_state * const encoder_state, 
-                                          const int start_address_in_ts, const int end_address_in_ts) {
-  int i = 0, slice_found=0;
-  for (i = 0; i < encoder_state->encoder_control->slice_count; ++i) {
-    if (encoder_state->encoder_control->slice_addresses_in_ts[i] == start_address_in_ts) {
-      encoder_state->slice->id = i;
-      slice_found = 1;
-      break;
-    }
-  }
-  assert(slice_found);
-  encoder_state->slice->start_in_ts = start_address_in_ts;
-  encoder_state->slice->end_in_ts = end_address_in_ts;
-  
-  encoder_state->slice->start_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[start_address_in_ts];
-  encoder_state->slice->end_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[end_address_in_ts];
-  return 1;
-}
-
-static void encoder_state_config_slice_finalize(encoder_state * const encoder_state) {
-  //Nothing to do (yet?)
-}
-
-static int encoder_state_config_wfrow_init(encoder_state * const encoder_state, 
-                                          const int lcu_offset_y) {
-  
-  encoder_state->wfrow->lcu_offset_y = lcu_offset_y;
-  return 1;
-}
-
-static void encoder_state_config_wfrow_finalize(encoder_state * const encoder_state) {
-  //Nothing to do (yet?)
-}
-
-#ifdef _DEBUG
-static void encoder_state_dump_graphviz(const encoder_state * const encoder_state) {
-  int i;
-  
-  if (!encoder_state->parent) {
-    const encoder_control * const encoder = encoder_state->encoder_control;
-    int y,x;
-    //Empty lines (easier to copy-paste)
-    printf("\n\n\n\n\n");
-    //Some styling...
-    printf("digraph EncoderStates {\n");
-    printf(" fontname = \"Bitstream Vera Sans\"\n");
-    printf(" fontsize = 8\n\n");
-    printf(" node [\n");
-    printf("  fontname = \"Bitstream Vera Sans\"\n");
-    printf("  fontsize = 8\n");
-    printf("  shape = \"record\"\n");
-    printf(" ]\n\n");
-    printf(" edge [\n");
-    printf("  arrowtail = \"empty\"\n");
-    printf(" ]\n\n");
-    
-    printf(" \"Map\" [\n");
-    printf("  shape=plaintext\n");
-    printf("  label = <<table cellborder=\"1\" cellspacing=\"0\" border=\"0\">");
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>RS Map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        
-        printf("<td>%d</td>", lcu_id_rs);
-      }
-      printf("</tr>");
-    }
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>TS Map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
-        
-        printf("<td>%d</td>", lcu_id_ts);
-      }
-      printf("</tr>");
-    }
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Tile map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
-        
-        printf("<td>%d</td>", encoder->tiles_tile_id[lcu_id_ts]);
-      }
-      printf("</tr>");
-    }
-    printf("<tr><td colspan=\"%d\" height=\"20\" valign=\"bottom\"><b>Slice map</b></td></tr>", encoder->in.width_in_lcu);
-    for (y = 0; y < encoder->in.height_in_lcu; ++y) {
-      printf("<tr>");
-      for (x = 0; x < encoder->in.width_in_lcu; ++x) {
-        const int lcu_id_rs = y * encoder->in.width_in_lcu + x;
-        const int lcu_id_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_id_rs];
-        int slice_id = 0;
-        
-        //Not efficient, but who cares
-        for (i=0; i < encoder->slice_count; ++i) {
-          if (encoder->slice_addresses_in_ts[i] <= lcu_id_ts) {
-            slice_id = i;
-          }
-        }
-        
-        printf("<td>%d</td>", slice_id);
-      }
-      printf("</tr>");
-    }
-    printf("</table>>\n ]\n");
-  }
-  
-  printf(" \"%p\" [\n", encoder_state);
-  printf("  label = \"{encoder_state|");
-  printf("+ type=%c\\l", encoder_state->type);
-  if (!encoder_state->parent || encoder_state->global != encoder_state->parent->global) {
-    printf("|+ global\\l");
-  }
-  if (!encoder_state->parent || encoder_state->tile != encoder_state->parent->tile) {
-    printf("|+ tile\\l");
-    printf(" - id = %d\\l", encoder_state->tile->id);
-    printf(" - lcu_offset_x = %d\\l", encoder_state->tile->lcu_offset_x);
-    printf(" - lcu_offset_y = %d\\l", encoder_state->tile->lcu_offset_y);
-    printf(" - lcu_offset_in_ts = %d\\l", encoder_state->tile->lcu_offset_in_ts);
-  }
-  if (!encoder_state->parent || encoder_state->slice != encoder_state->parent->slice) {
-    printf("|+ slice\\l");
-    printf(" - id = %d\\l", encoder_state->slice->id);
-    printf(" - start_in_ts = %d\\l", encoder_state->slice->start_in_ts);
-    printf(" - end_in_ts = %d\\l", encoder_state->slice->end_in_ts);
-    printf(" - start_in_rs = %d\\l", encoder_state->slice->start_in_rs);
-    printf(" - end_in_rs = %d\\l", encoder_state->slice->end_in_rs);
-  }
-  if (!encoder_state->parent || encoder_state->wfrow != encoder_state->parent->wfrow) {
-    printf("|+ wfrow\\l");
-    printf(" - lcu_offset_y = %d\\l", encoder_state->wfrow->lcu_offset_y);
-  }
-  printf("}\"\n");
-  printf(" ]\n");
-  
-  if (encoder_state->parent) {
-    printf(" \"%p\" -> \"%p\"\n", encoder_state->parent, encoder_state);
-  }
-  
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    encoder_state_dump_graphviz(&encoder_state->children[i]);
-  }
-  
-  if (!encoder_state->parent) {
-    printf("}\n");
-    //Empty lines (easier to copy-paste)
-    printf("\n\n\n\n\n");
-  }
-}
-#endif //_DEBUG
-
-int encoder_state_init(encoder_state * const child_state, encoder_state * const parent_state) {
-  //We require that, if parent_state is NULL:
-  //child_state->encoder_control is set
-  //
-  //If parent_state is not NULL, the following variable should either be set to NULL,
-  //in order to inherit from parent, or should point to a valid structure:
-  //child_state->global
-  //child_state->tile
-  //child_state->slice
-  //child_state->wfrow
-  
-  child_state->parent = parent_state;
-  child_state->children = MALLOC(encoder_state, 1);
-  child_state->children[0].encoder_control = NULL;
-  
-  if (!parent_state) {
-    const encoder_control * const encoder = child_state->encoder_control;
-    child_state->type = ENCODER_STATE_TYPE_MAIN;
-    assert(child_state->encoder_control);
-    child_state->global = MALLOC(encoder_state_config_global, 1);
-    if (!child_state->global || !encoder_state_config_global_init(child_state)) {
-      fprintf(stderr, "Could not initialize encoder_state->global!\n");
-      return 0;
-    }
-    child_state->tile = MALLOC(encoder_state_config_tile, 1);
-    if (!child_state->tile || !encoder_state_config_tile_init(child_state, 0, 0, encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu)) {
-      fprintf(stderr, "Could not initialize encoder_state->tile!\n");
-      return 0;
-    }
-    child_state->slice = MALLOC(encoder_state_config_slice, 1);
-    if (!child_state->slice || !encoder_state_config_slice_init(child_state, 0, encoder->in.width_in_lcu * encoder->in.height_in_lcu - 1)) {
-      fprintf(stderr, "Could not initialize encoder_state->slice!\n");
-      return 0;
-    }
-    child_state->wfrow = MALLOC(encoder_state_config_wfrow, 1);
-    if (!child_state->wfrow || !encoder_state_config_wfrow_init(child_state, 0)) {
-      fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
-      return 0;
-    }
-  } else {
-    child_state->encoder_control = parent_state->encoder_control;
-    if (!child_state->global) child_state->global = parent_state->global;
-    if (!child_state->tile) child_state->tile = parent_state->tile;
-    if (!child_state->slice) child_state->slice = parent_state->slice;
-    if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow;
-  }
-  
-  //Allocate bitstream
-  if (child_state->type == ENCODER_STATE_TYPE_MAIN) {
-    //Main encoder outputs to file
-    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_FILE)) {
-      fprintf(stderr, "Could not initialize stream!\n");
-      return 0;
-    }
-    child_state->stream.file.output = child_state->encoder_control->out.file;
-  } else {
-    //Other encoders use a memory bitstream
-    if (!bitstream_init(&child_state->stream, BITSTREAM_TYPE_MEMORY)) {
-      fprintf(stderr, "Could not initialize stream!\n");
-      return 0;
-    }
-  }
-  
-  // Set CABAC output bitstream
-  child_state->cabac.stream = &child_state->stream;
-  
-  //Create sub-encoders
-  {
-    const encoder_control * const encoder = child_state->encoder_control;
-    int child_count = 0;
-    //We first check the type of this element.
-    //If it's a MAIN, it can allow both slices or tiles as child
-    //If it's a TILE, it can allow slices as child, if its parent is not a slice, or wavefront rows if there is no other children
-    //If it's a SLICE, it can allow tiles as child, if its parent is not a tile, or wavefront rows if there is no other children
-    //If it's a WAVEFRONT_ROW, it doesn't allow any children
-    int children_allow_wavefront_row = 0;
-    int children_allow_slice = 0;
-    int children_allow_tile = 0;
-    int range_start;
-    
-    int start_in_ts, end_in_ts;
-    
-    switch(child_state->type) {
-      case ENCODER_STATE_TYPE_MAIN:
-        children_allow_slice = 1;
-        children_allow_tile = 1;
-        start_in_ts = 0;
-        end_in_ts = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
-        break;
-      case ENCODER_STATE_TYPE_SLICE:
-        assert(child_state->parent);
-        if (child_state->parent->type != ENCODER_STATE_TYPE_TILE) children_allow_tile = 1;
-        children_allow_wavefront_row = encoder->wpp;
-        start_in_ts = child_state->slice->start_in_ts;
-        end_in_ts = child_state->slice->end_in_ts;
-        break;
-      case ENCODER_STATE_TYPE_TILE:
-        assert(child_state->parent);
-        if (child_state->parent->type != ENCODER_STATE_TYPE_SLICE) children_allow_slice = 1;
-        children_allow_wavefront_row = encoder->wpp;
-        start_in_ts = child_state->tile->lcu_offset_in_ts;
-        end_in_ts = child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
-        break;
-      case ENCODER_STATE_TYPE_WAVEFRONT_ROW:
-        //GCC tries to be too clever...
-        start_in_ts = -1;
-        end_in_ts = -1;
-        break;
-      default:
-        fprintf(stderr, "Invalid encoder_state->type %d!\n", child_state->type);
-        assert(0);
-        return 0;
-    }
-    
-    range_start = start_in_ts;
-    //printf("%c-%p: start_in_ts=%d, end_in_ts=%d\n",child_state->type, child_state, start_in_ts, end_in_ts);
-    while (range_start < end_in_ts && (children_allow_slice || children_allow_tile)) {
-      encoder_state *new_child = NULL;
-      int range_end_slice = range_start; //Will be incremented to get the range of the "thing"
-      int range_end_tile = range_start; //Will be incremented to get the range of the "thing"
-      
-      int tile_allowed = lcu_at_tile_start(encoder, range_start) && children_allow_tile;
-      int slice_allowed = lcu_at_slice_start(encoder, range_start) && children_allow_slice;
-      
-      //Find the smallest structure following the cursor
-      if (slice_allowed) {
-        while(!lcu_at_slice_end(encoder, range_end_slice)) {
-          ++range_end_slice;
-        }
-      }
-      
-      if (tile_allowed) {
-        while(!lcu_at_tile_end(encoder, range_end_tile)) {
-          ++range_end_tile;
-        }
-      }
-      
-      //printf("range_start=%d, range_end_slice=%d, range_end_tile=%d, tile_allowed=%d, slice_allowed=%d end_in_ts=%d\n",range_start,range_end_slice,range_end_tile,tile_allowed,slice_allowed,end_in_ts);
-      
-      if ((!tile_allowed || (range_end_slice >= range_end_tile)) && !new_child && slice_allowed) {
-        //Create a slice
-        new_child = &child_state->children[child_count];
-        new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_SLICE;
-        new_child->global = child_state->global;
-        new_child->tile = child_state->tile;
-        new_child->wfrow = child_state->wfrow;
-        new_child->slice = MALLOC(encoder_state_config_slice, 1);
-        if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) {
-          fprintf(stderr, "Could not initialize encoder_state->slice!\n");
-          return 0;
-        }
-      }
-      
-      if ((!slice_allowed || (range_end_slice < range_end_tile)) && !new_child && tile_allowed) {
-        //Create a tile
-        int tile_id = encoder->tiles_tile_id[range_start];
-        int tile_x = tile_id % encoder->tiles_num_tile_columns;
-        int tile_y = tile_id / encoder->tiles_num_tile_columns;
-        
-        int lcu_offset_x = encoder->tiles_col_bd[tile_x];
-        int lcu_offset_y = encoder->tiles_row_bd[tile_y];
-        int width_in_lcu = encoder->tiles_col_bd[tile_x+1]-encoder->tiles_col_bd[tile_x];
-        int height_in_lcu = encoder->tiles_row_bd[tile_y+1]-encoder->tiles_row_bd[tile_y];
-        int width = MIN(width_in_lcu * LCU_WIDTH, encoder->in.width - lcu_offset_x * LCU_WIDTH);
-        int height = MIN(height_in_lcu * LCU_WIDTH, encoder->in.height - lcu_offset_y * LCU_WIDTH);
-        
-        new_child = &child_state->children[child_count];
-        new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_TILE;
-        new_child->global = child_state->global;
-        new_child->tile = MALLOC(encoder_state_config_tile, 1);
-        new_child->slice = child_state->slice;
-        new_child->wfrow = child_state->wfrow;
-        
-        if (!new_child->tile || !encoder_state_config_tile_init(new_child, lcu_offset_x, lcu_offset_y, width, height, width_in_lcu, height_in_lcu)) {
-          fprintf(stderr, "Could not initialize encoder_state->tile!\n");
-          return 0;
-        }
-      }
-      
-      if (new_child) {
-        child_state->children = realloc(child_state->children, sizeof(encoder_state) * (2+child_count));
-        child_state->children[1+child_count].encoder_control = NULL;
-        if (!child_state->children) {
-          fprintf(stderr, "Failed to allocate memory for children...\n");
-          return 0;
-        }
-
-        //Fix children parent (since we changed the address), except for the last one which is not ready yet
-        {
-          int i, j;
-          for (i = 0; child_state->children[i].encoder_control && i < child_count; ++i) {
-            for (j = 0; child_state->children[i].children[j].encoder_control; ++j) {
-              child_state->children[i].children[j].parent = &child_state->children[i];
-            }
-            for (j = 0; j < child_state->children[i].lcu_order_count; ++j) {
-              child_state->children[i].lcu_order[j].encoder_state = &child_state->children[i];
-            }
-            child_state->children[i].cabac.stream = &child_state->children[i].stream;
-          }
-        }
-        
-        if (!encoder_state_init(&child_state->children[child_count], child_state)) {
-          fprintf(stderr, "Unable to init child...\n");
-          return 0;
-        }
-        child_count += 1;
-      }
-      
-      range_start = MAX(range_end_slice, range_end_tile) + 1;
-    }
-    
-    //We create wavefronts only if we have no children
-    if (children_allow_wavefront_row && child_count == 0) {
-      int first_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
-      int last_row = encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] / encoder->in.width_in_lcu;
-      int num_rows;
-      int i;
-      
-      assert(!(children_allow_slice || children_allow_tile));
-      assert(child_count == 0);
-      
-      for (i=start_in_ts; i<end_in_ts; ++i) {
-        const int row = encoder->tiles_ctb_addr_ts_to_rs[i] / encoder->in.width_in_lcu;
-        if (row < first_row) first_row = row;
-        if (row > last_row) last_row = row;
-      }
-      
-      num_rows = last_row - first_row + 1;
-      
-      //When entropy_coding_sync_enabled_flag is equal to 1 and the first coding tree block in a slice is not the first coding
-      //tree block of a row of coding tree blocks in a tile, it is a requirement of bitstream conformance that the last coding tree
-      //block in the slice shall belong to the same row of coding tree blocks as the first coding tree block in the slice.
-      
-      if (encoder->tiles_ctb_addr_ts_to_rs[start_in_ts] % encoder->in.width_in_lcu != child_state->tile->lcu_offset_x) {
-        if (num_rows > 1) {
-          fprintf(stderr, "Invalid: first CTB in slice %d is not at the tile %d edge, and the slice spans on more than one row.\n", child_state->slice->id, child_state->tile->id);
-          return 0;
-        }
-      }
-      
-      //FIXME Do the same kind of check if we implement slice segments
-    
-      child_count = num_rows;
-      child_state->children = realloc(child_state->children, sizeof(encoder_state) * (num_rows + 1));
-      child_state->children[num_rows].encoder_control = NULL;
-      
-      for (i=0; i < num_rows; ++i) {
-        encoder_state *new_child = &child_state->children[i];
-        
-        new_child->encoder_control = encoder;
-        new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW;
-        new_child->global = child_state->global;
-        new_child->tile = child_state->tile;
-        new_child->slice = child_state->slice;
-        new_child->wfrow = MALLOC(encoder_state_config_wfrow, 1);
-        
-        if (!new_child->wfrow || !encoder_state_config_wfrow_init(new_child, i)) {
-          fprintf(stderr, "Could not initialize encoder_state->wfrow!\n");
-          return 0;
-        }
-        
-        if (!encoder_state_init(new_child, child_state)) {
-          fprintf(stderr, "Unable to init child...\n");
-          return 0;
-        }
-      }
-    }
-    
-    child_state->is_leaf = (child_count == 0);
-    //This node is a leaf, compute LCU-order
-    if (child_state->is_leaf) {
-      //All LCU computations are relative to the tile
-      //Remark: this could be optimized, but since it's run only once, it's better to do it in a understandable way.
-      
-      //By default, the full tile
-      int i;
-      int lcu_id;
-      int lcu_start = 0;
-      //End is the element AFTER the end (iterate < lcu_end)
-      int lcu_end = child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu;
-      
-      //Restrict to the current slice if needed
-      lcu_start = MAX(lcu_start, child_state->slice->start_in_ts - child_state->tile->lcu_offset_in_ts);
-      lcu_end = MIN(lcu_end, child_state->slice->end_in_ts - child_state->tile->lcu_offset_in_ts + 1);
-      
-      //Restrict to the current wavefront row if needed
-      if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-        lcu_start = MAX(lcu_start, (child_state->wfrow->lcu_offset_y) * child_state->tile->cur_pic->width_in_lcu);
-        lcu_end = MIN(lcu_end, (child_state->wfrow->lcu_offset_y + 1) * child_state->tile->cur_pic->width_in_lcu);
-      }
-      
-      child_state->lcu_order_count = lcu_end - lcu_start;
-      child_state->lcu_order = MALLOC(lcu_order_element, child_state->lcu_order_count);
-      assert(child_state->lcu_order);
-      
-      for (i = 0; i < child_state->lcu_order_count; ++i) {
-        lcu_id = lcu_start + i;
-        child_state->lcu_order[i].encoder_state = child_state;
-        child_state->lcu_order[i].id = lcu_id;
-        child_state->lcu_order[i].index = i;
-        child_state->lcu_order[i].position.x = lcu_id % child_state->tile->cur_pic->width_in_lcu;
-        child_state->lcu_order[i].position.y = lcu_id / child_state->tile->cur_pic->width_in_lcu;
-        child_state->lcu_order[i].position_px.x = child_state->lcu_order[i].position.x * LCU_WIDTH;
-        child_state->lcu_order[i].position_px.y = child_state->lcu_order[i].position.y * LCU_WIDTH;
-        child_state->lcu_order[i].size.x = MIN(LCU_WIDTH, encoder->in.width - (child_state->tile->lcu_offset_x * LCU_WIDTH + child_state->lcu_order[i].position_px.x));
-        child_state->lcu_order[i].size.y = MIN(LCU_WIDTH, encoder->in.height - (child_state->tile->lcu_offset_y * LCU_WIDTH + child_state->lcu_order[i].position_px.y));
-        child_state->lcu_order[i].first_row = lcu_in_first_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        child_state->lcu_order[i].last_row = lcu_in_last_row(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        child_state->lcu_order[i].first_column = lcu_in_first_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        child_state->lcu_order[i].last_column = lcu_in_last_column(child_state, child_state->tile->lcu_offset_in_ts + lcu_id);
-        
-        child_state->lcu_order[i].above = NULL;
-        child_state->lcu_order[i].below = NULL;
-        child_state->lcu_order[i].left = NULL;
-        child_state->lcu_order[i].right = NULL;
-        
-        if (!child_state->lcu_order[i].first_row) {
-          //Find LCU above
-          if (child_state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-            int j;
-            //For all previous wavefront rows
-            for (j=0; &child_state->parent->children[j] != child_state && child_state->parent->children[j].encoder_control; ++j) {
-              if (child_state->parent->children[j].wfrow->lcu_offset_y == child_state->wfrow->lcu_offset_y - 1) {
-                int k;
-                for (k=0; k < child_state->parent->children[j].lcu_order_count; ++k) {
-                  if (child_state->parent->children[j].lcu_order[k].position.x == child_state->lcu_order[i].position.x) {
-                    assert(child_state->parent->children[j].lcu_order[k].position.y == child_state->lcu_order[i].position.y - 1);
-                    child_state->lcu_order[i].above = &child_state->parent->children[j].lcu_order[k];
-                  }
-                }
-              }
-            }
-          } else {
-            child_state->lcu_order[i].above = &child_state->lcu_order[i-child_state->tile->cur_pic->width_in_lcu];
-          }
-          assert(child_state->lcu_order[i].above);
-          child_state->lcu_order[i].above->below = &child_state->lcu_order[i];
-        }
-        if (!child_state->lcu_order[i].first_column) {
-          child_state->lcu_order[i].left = &child_state->lcu_order[i-1];
-          assert(child_state->lcu_order[i].left->position.x == child_state->lcu_order[i].position.x - 1);
-          child_state->lcu_order[i].left->right = &child_state->lcu_order[i];
-        }
-      }
-    } else {
-      child_state->lcu_order_count = 0;
-      child_state->lcu_order = NULL;
-    }
-  }
-  
-  //Validate the structure
-  if (child_state->type == ENCODER_STATE_TYPE_TILE) {
-    if (child_state->tile->lcu_offset_in_ts < child_state->slice->start_in_ts) {
-      fprintf(stderr, "Tile %d starts before slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
-      return 0;
-    }
-    if (child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1 > child_state->slice->end_in_ts) {
-      fprintf(stderr, "Tile %d ends after slice %d, in which it should be included!\n", child_state->tile->id, child_state->slice->id);
-      return 0;
-    }
-  }
-  
-  if (child_state->type == ENCODER_STATE_TYPE_SLICE) {
-    if (child_state->slice->start_in_ts < child_state->tile->lcu_offset_in_ts) {
-      fprintf(stderr, "Slice %d starts before tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
-      return 0;
-    }
-    if (child_state->slice->end_in_ts > child_state->tile->lcu_offset_in_ts + child_state->tile->cur_pic->width_in_lcu * child_state->tile->cur_pic->height_in_lcu - 1) {
-      fprintf(stderr, "Slice %d ends after tile %d, in which it should be included!\n", child_state->slice->id, child_state->tile->id);
-      return 0;
-    }
-  }
-  
-  
-#ifdef _DEBUG
-  if (!parent_state) encoder_state_dump_graphviz(child_state);
-#endif //_DEBUG
-  return 1;
-}
-
-void encoder_state_finalize(encoder_state * const encoder_state) {
-  if (encoder_state->children) {
-    int i=0;
-    for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-      encoder_state_finalize(&encoder_state->children[i]);
-    }
-    
-    FREE_POINTER(encoder_state->children);
-  }
-  
-  FREE_POINTER(encoder_state->lcu_order);
-  encoder_state->lcu_order_count = 0;
-  
-  if (!encoder_state->parent || (encoder_state->parent->wfrow != encoder_state->wfrow)) {
-    encoder_state_config_wfrow_finalize(encoder_state);
-    FREE_POINTER(encoder_state->wfrow);
-  }
-  
-  if (!encoder_state->parent || (encoder_state->parent->slice != encoder_state->slice)) {
-    encoder_state_config_slice_finalize(encoder_state);
-    FREE_POINTER(encoder_state->slice);
-  }
-  
-  if (!encoder_state->parent || (encoder_state->parent->tile != encoder_state->tile)) {
-    encoder_state_config_tile_finalize(encoder_state);
-    FREE_POINTER(encoder_state->tile);
-  }
-  
-  if (!encoder_state->parent || (encoder_state->parent->global != encoder_state->global)) {
-    encoder_state_config_global_finalize(encoder_state);
-    FREE_POINTER(encoder_state->global);
-  }
-  
-  bitstream_finalize(&encoder_state->stream);
-}
-
-/******  END Constructors/destructors  ******/
-
-/******  BEGIN Functions writing bitstream parts ******/
-
-static void encoder_state_write_bitstream_access_unit_delimiter(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  uint8_t pic_type = encoder_state->global->slicetype == SLICE_I ? 0
-                   : encoder_state->global->slicetype == SLICE_P ? 1
-                   :                                             2;
-  WRITE_U(stream, pic_type, 3, "pic_type");
-}
-
-static void encoder_state_write_bitstream_aud(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  encoder_state_write_bitstream_access_unit_delimiter(encoder_state);
-  nal_write(stream, AUD_NUT, 0, 1);
-  bitstream_align(stream);
-}
-
-static void encoder_state_write_bitstream_PTL(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  int i;
-  // PTL
-  // Profile Tier
-  WRITE_U(stream, 0, 2, "general_profile_space");
-  WRITE_U(stream, 0, 1, "general_tier_flag");
-  // Main Profile == 1
-  WRITE_U(stream, 1, 5, "general_profile_idc");
-  /* Compatibility flags should be set at general_profile_idc
-   *  (so with general_profile_idc = 1, compatibility_flag[1] should be 1)
-   * According to specification, when compatibility_flag[1] is set,
-   *  compatibility_flag[2] should be set too.
-   */
-  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag[]");
-
-  WRITE_U(stream, 1, 1, "general_progressive_source_flag");
-  WRITE_U(stream, 0, 1, "general_interlaced_source_flag");
-  WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
-  WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
-
-  WRITE_U(stream, 0, 32, "XXX_reserved_zero_44bits[0..31]");
-  WRITE_U(stream, 0, 12, "XXX_reserved_zero_44bits[32..43]");
-
-  // end Profile Tier
-
-  // Level 6.2 (general_level_idc is 30 * 6.2)
-  WRITE_U(stream, 186, 8, "general_level_idc");
-
-  WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag");
-  WRITE_U(stream, 0, 1, "sub_layer_level_present_flag");
-
-  for (i = 1; i < 8; i++) {
-    WRITE_U(stream, 0, 2, "reserved_zero_2bits");
-  }
-
-  // end PTL
-}
-
-static void encoder_state_write_bitstream_vid_parameter_set(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  int i;
-#ifdef _DEBUG
-  printf("=========== Video Parameter Set ID: 0 ===========\n");
-#endif
-
-  WRITE_U(stream, 0, 4, "vps_video_parameter_set_id");
-  WRITE_U(stream, 3, 2, "vps_reserved_three_2bits" );
-  WRITE_U(stream, 0, 6, "vps_reserved_zero_6bits" );
-  WRITE_U(stream, 1, 3, "vps_max_sub_layers_minus1");
-  WRITE_U(stream, 0, 1, "vps_temporal_id_nesting_flag");
-  WRITE_U(stream, 0xffff, 16, "vps_reserved_ffff_16bits");
-
-  encoder_state_write_bitstream_PTL(encoder_state);
-
-  WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
-
-  //for each layer
-  for (i = 0; i < 1; i++) {
-  WRITE_UE(stream, 1, "vps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "vps_num_reorder_pics");
-  WRITE_UE(stream, 0, "vps_max_latency_increase");
-  }
-
-  WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
-  WRITE_UE(stream, 0, "vps_max_op_sets_minus1");
-  WRITE_U(stream, 0, 1, "vps_timing_info_present_flag");
-
-  //IF timing info
-  //END IF
-
-  WRITE_U(stream, 0, 1, "vps_extension_flag");
-}
-
-static void encoder_state_write_bitstream_scaling_list(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-  uint32_t size_id;
-  for (size_id = 0; size_id < SCALING_LIST_SIZE_NUM; size_id++) {
-    int32_t list_id;
-    for (list_id = 0; list_id < g_scaling_list_num[size_id]; list_id++) {
-      uint8_t scaling_list_pred_mode_flag = 1;
-      int32_t pred_list_idx;
-      int32_t i;
-      uint32_t ref_matrix_id = UINT32_MAX;
-
-      for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) {
-        const int32_t * const pred_list  = (list_id == pred_list_idx) ?
-                                     scalinglist_get_default(size_id, pred_list_idx) :
-                                     encoder->scaling_list.scaling_list_coeff[size_id][pred_list_idx];
-
-        if (!memcmp(encoder->scaling_list.scaling_list_coeff[size_id][list_id], pred_list, sizeof(int32_t) * MIN(8, g_scaling_list_size[size_id])) &&
-            ((size_id < SCALING_LIST_16x16) ||
-             (encoder->scaling_list.scaling_list_dc[size_id][list_id] == encoder->scaling_list.scaling_list_dc[size_id][pred_list_idx]))) {
-          ref_matrix_id = pred_list_idx;
-          scaling_list_pred_mode_flag = 0;
-          break;
-        }
-      }
-      WRITE_U(stream, scaling_list_pred_mode_flag, 1, "scaling_list_pred_mode_flag" );
-
-      if (!scaling_list_pred_mode_flag) {
-        WRITE_UE(stream, list_id - ref_matrix_id, "scaling_list_pred_matrix_id_delta");
-      } else {
-        int32_t delta;
-        const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, g_scaling_list_size[size_id]);
-        const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32;
-        int32_t next_coef = 8;
-        const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeff[size_id][list_id];
-
-        if (size_id >= SCALING_LIST_16x16) {
-          WRITE_SE(stream, encoder->scaling_list.scaling_list_dc[size_id][list_id] - 8, "scaling_list_dc_coef_minus8");
-          next_coef = encoder->scaling_list.scaling_list_dc[size_id][list_id];
-        }
-
-        for (i = 0; i < coef_num; i++) {
-          delta     = coef_list[scan_cg[i]] - next_coef;
-          next_coef = coef_list[scan_cg[i]];
-          if (delta > 127)
-            delta -= 256;
-          if (delta < -128)
-            delta += 256;
-
-          WRITE_SE(stream, delta, "scaling_list_delta_coef");
-        }
-      }
-    }
-  }
-}
-
-static void encoder_state_write_bitstream_seq_parameter_set(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  //FIXME: use encoder_control instead of cur_pic
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-
-#ifdef _DEBUG
-  printf("=========== Sequence Parameter Set ID: 0 ===========\n");
-#endif
-
-  // TODO: profile IDC and level IDC should be defined later on
-  WRITE_U(stream, 0, 4, "sps_video_parameter_set_id");
-  WRITE_U(stream, 1, 3, "sps_max_sub_layers_minus1");
-  WRITE_U(stream, 0, 1, "sps_temporal_id_nesting_flag");
-
-  encoder_state_write_bitstream_PTL(encoder_state);
-
-  WRITE_UE(stream, 0, "sps_seq_parameter_set_id");
-  WRITE_UE(stream, encoder_state->encoder_control->in.video_format,
-           "chroma_format_idc");
-
-  if (encoder_state->encoder_control->in.video_format == 3) {
-    WRITE_U(stream, 0, 1, "separate_colour_plane_flag");
-  }
-
-  WRITE_UE(stream, cur_pic->width, "pic_width_in_luma_samples");
-  WRITE_UE(stream, cur_pic->height, "pic_height_in_luma_samples");
-
-  if (cur_pic->width != encoder_state->encoder_control->in.real_width || cur_pic->height != encoder_state->encoder_control->in.real_height) {
-    // The standard does not seem to allow setting conf_win values such that
-    // the number of luma samples is not a multiple of 2. Options are to either
-    // hide one line or show an extra line of non-video. Neither seems like a
-    // very good option, so let's not even try.
-    assert(!(cur_pic->width % 2));
-    WRITE_U(stream, 1, 1, "conformance_window_flag");
-    WRITE_UE(stream, 0, "conf_win_left_offset");
-    WRITE_UE(stream, (cur_pic->width - encoder_state->encoder_control->in.real_width) >> 1,
-             "conf_win_right_offset");
-    WRITE_UE(stream, 0, "conf_win_top_offset");
-    WRITE_UE(stream, (cur_pic->height - encoder_state->encoder_control->in.real_height) >> 1,
-             "conf_win_bottom_offset");
-  } else {
-    WRITE_U(stream, 0, 1, "conformance_window_flag");
-  }
-
-  //IF window flag
-  //END IF
-
-  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_luma_minus8");
-  WRITE_UE(stream, encoder_state->encoder_control->bitdepth-8, "bit_depth_chroma_minus8");
-  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
-  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
-
-  //for each layer
-  WRITE_UE(stream, 0, "sps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "sps_num_reorder_pics");
-  WRITE_UE(stream, 0, "sps_max_latency_increase");
-  //end for
-
-  WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3");
-  WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
-  WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
-  WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
-  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
-  WRITE_UE(stream, TR_DEPTH_INTRA, "max_transform_hierarchy_depth_intra");
-
-  // scaling list
-  WRITE_U(stream, encoder_state->encoder_control->scaling_list.enable, 1, "scaling_list_enable_flag");
-  if (encoder_state->encoder_control->scaling_list.enable) {
-    WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag");
-    encoder_state_write_bitstream_scaling_list(encoder_state);
-  }
-
-  WRITE_U(stream, 0, 1, "amp_enabled_flag");
-  WRITE_U(stream, encoder_state->encoder_control->sao_enable ? 1 : 0, 1,
-          "sample_adaptive_offset_enabled_flag");
-  WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
-  #if ENABLE_PCM == 1
-    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_luma_minus1");
-    WRITE_U(stream, 7, 4, "pcm_sample_bit_depth_chroma_minus1");
-    WRITE_UE(stream, 0, "log2_min_pcm_coding_block_size_minus3");
-    WRITE_UE(stream, 2, "log2_diff_max_min_pcm_coding_block_size");
-    WRITE_U(stream, 1, 1, "pcm_loop_filter_disable_flag");
-  #endif
-
-  WRITE_UE(stream, 0, "num_short_term_ref_pic_sets");
-
-  //IF num short term ref pic sets
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "long_term_ref_pics_present_flag");
-
-  //IF long_term_ref_pics_present
-  //ENDIF
-
-  WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1,
-          "sps_temporal_mvp_enable_flag");
-  WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag");
-  WRITE_U(stream, 1, 1, "vui_parameters_present_flag");
-
-  encode_VUI(encoder_state);
-
-  WRITE_U(stream, 0, 1, "sps_extension_flag");
-}
-
-static void encoder_state_write_bitstream_pic_parameter_set(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-#ifdef _DEBUG
-  printf("=========== Picture Parameter Set ID: 0 ===========\n");
-#endif
-  WRITE_UE(stream, 0, "pic_parameter_set_id");
-  WRITE_UE(stream, 0, "seq_parameter_set_id");
-  WRITE_U(stream, 0, 1, "dependent_slice_segments_enabled_flag");
-  WRITE_U(stream, 0, 1, "output_flag_present_flag");
-  WRITE_U(stream, 0, 3, "num_extra_slice_header_bits");
-  WRITE_U(stream, ENABLE_SIGN_HIDING, 1, "sign_data_hiding_flag");
-  WRITE_U(stream, 0, 1, "cabac_init_present_flag");
-
-  WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1");
-  WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1");
-  WRITE_SE(stream, ((int8_t)encoder_state->global->QP)-26, "pic_init_qp_minus26");
-  WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
-  WRITE_U(stream, encoder_state->encoder_control->trskip_enable, 1, "transform_skip_enabled_flag");
-  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
-  //if cu_qp_delta_enabled_flag
-  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
-
-  //TODO: add QP offsets
-  WRITE_SE(stream, 0, "pps_cb_qp_offset");
-  WRITE_SE(stream, 0, "pps_cr_qp_offset");
-  WRITE_U(stream, 0, 1, "pps_slice_chroma_qp_offsets_present_flag");
-  WRITE_U(stream, 0, 1, "weighted_pred_flag");
-  WRITE_U(stream, 0, 1, "weighted_bipred_idc");
-
-  //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag");
-  WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag");
-  WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag");
-  //wavefronts
-  WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag");
-
-  if (encoder->tiles_enable) {
-    WRITE_UE(stream, encoder->tiles_num_tile_columns - 1, "num_tile_columns_minus1");
-    WRITE_UE(stream, encoder->tiles_num_tile_rows - 1, "num_tile_rows_minus1");
-    
-    WRITE_U(stream, encoder->tiles_uniform_spacing_flag, 1, "uniform_spacing_flag");
-    
-    if (!encoder->tiles_uniform_spacing_flag) {
-      int i;
-      for (i = 0; i < encoder->tiles_num_tile_columns - 1; ++i) {
-        WRITE_UE(stream, encoder->tiles_col_width[i] - 1, "column_width_minus1[...]");
-      }
-      for (i = 0; i < encoder->tiles_num_tile_rows - 1; ++i) {
-        WRITE_UE(stream, encoder->tiles_row_height[i] - 1, "row_height_minus1[...]");
-      }
-    }
-    WRITE_U(stream, 0, 1, "loop_filter_across_tiles_enabled_flag");
-    
-  }
-  
-  WRITE_U(stream, 0, 1, "loop_filter_across_slice_flag");
-  WRITE_U(stream, 1, 1, "deblocking_filter_control_present_flag");
-
-  //IF deblocking_filter
-    WRITE_U(stream, 0, 1, "deblocking_filter_override_enabled_flag");
-  WRITE_U(stream, encoder_state->encoder_control->deblock_enable ? 0 : 1, 1,
-          "pps_disable_deblocking_filter_flag");
-
-    //IF !disabled
-  if (encoder_state->encoder_control->deblock_enable) {
-     WRITE_SE(stream, encoder_state->encoder_control->beta_offset_div2, "beta_offset_div2");
-     WRITE_SE(stream, encoder_state->encoder_control->tc_offset_div2, "tc_offset_div2");
-    }
-
-    //ENDIF
-  //ENDIF
-  WRITE_U(stream, 0, 1, "pps_scaling_list_data_present_flag");
-  //IF scaling_list
-  //ENDIF
-  WRITE_U(stream, 0, 1, "lists_modification_present_flag");
-  WRITE_UE(stream, 0, "log2_parallel_merge_level_minus2");
-  WRITE_U(stream, 0, 1, "slice_segment_header_extension_present_flag");
-  WRITE_U(stream, 0, 1, "pps_extension_flag");
-}
-
-void encoder_state_write_bitstream_prefix_sei_version(encoder_state * const encoder_state)
-{
-#define STR_BUF_LEN 1000
-  bitstream * const stream = &encoder_state->stream;
-  int i, length;
-  char buf[STR_BUF_LEN] = { 0 };
-  char *s = buf + 16;
-  const config * const cfg = encoder_state->encoder_control->cfg;
-
-  // random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
-  static const uint8_t uuid[16] = {
-    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
-    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1
-  };
-  memcpy(buf, uuid, 16);
-
-  // user_data_payload_byte
-  s += sprintf(s, "Kvazaar HEVC Encoder v. " VERSION_STRING " - "
-                  "Copyleft 2012-2014 - http://ultravideo.cs.tut.fi/ - options:");
-  s += sprintf(s, " %dx%d", cfg->width, cfg->height);
-  s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
-               cfg->deblock_beta, cfg->deblock_tc);
-  s += sprintf(s, " sao=%d", cfg->sao_enable);
-  s += sprintf(s, " intra_period=%d", cfg->intra_period);
-  s += sprintf(s, " qp=%d", cfg->qp);
-  s += sprintf(s, " ref=%d", cfg->ref_frames);
-
-  length = (int)(s - buf + 1);  // length, +1 for \0
-
-  // Assert this so that in the future if the message gets longer, we remember
-  // to increase the buf len. Divide by 2 for margin.
-  assert(length < STR_BUF_LEN / 2);
-
-  // payloadType = 5 -> user_data_unregistered
-  WRITE_U(stream, 5, 8, "last_payload_type_byte");
-
-  // payloadSize
-  for (i = 0; i <= length - 255; i += 255)
-    WRITE_U(stream, 255, 8, "ff_byte");
-  WRITE_U(stream, length - i, 8, "last_payload_size_byte");
-
-  for (i = 0; i < length; i++)
-    WRITE_U(stream, ((uint8_t *)buf)[i], 8, "sei_payload");
-
-#undef STR_BUF_LEN
-}
-
-static void encoder_state_entry_points_explore(const encoder_state * const encoder_state, int * const r_count, int * const r_max_length) {
-  int i;
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    if (encoder_state->children[i].is_leaf) {
-      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
-      ++(*r_count);
-      if (my_length > *r_max_length) {
-        *r_max_length = my_length;
-      }
-    } else {
-      encoder_state_entry_points_explore(&encoder_state->children[i], r_count, r_max_length);
-    }
-  }
-}
-
-static void encoder_state_write_bitstream_entry_points_write(bitstream * const stream, const encoder_state * const encoder_state, const int num_entry_points, const int write_length, int * const r_count) {
-  int i;
-  for (i = 0; encoder_state->children[i].encoder_control; ++i) {
-    if (encoder_state->children[i].is_leaf) {
-      const int my_length = bitstream_tell(&encoder_state->children[i].stream)/8;
-      ++(*r_count);
-      //Don't write the last one
-      if (*r_count < num_entry_points) {
-        WRITE_U(stream, my_length - 1, write_length, "entry_point_offset-minus1")
-      }
-    } else {
-      encoder_state_write_bitstream_entry_points_write(stream, &encoder_state->children[i], num_entry_points, write_length, r_count);
-    }
-  }
-}
-
-static int num_bitcount(unsigned int n) {
-  int pos = 0;
-  if (n >= 1<<16) { n >>= 16; pos += 16; }
-  if (n >= 1<< 8) { n >>=  8; pos +=  8; }
-  if (n >= 1<< 4) { n >>=  4; pos +=  4; }
-  if (n >= 1<< 2) { n >>=  2; pos +=  2; }
-  if (n >= 1<< 1) {           pos +=  1; }
-  return ((n == 0) ? (-1) : pos);
-}
-
-void encoder_state_write_bitstream_slice_header(encoder_state * const encoder_state)
-{
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  bitstream * const stream = &encoder_state->stream;
-
-#ifdef _DEBUG
-  printf("=========== Slice ===========\n");
-#endif
-  WRITE_U(stream, (encoder_state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag");
-
-  if (encoder_state->global->pictype >= NAL_BLA_W_LP
-      && encoder_state->global->pictype <= NAL_RSV_IRAP_VCL23) {
-    WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag");
-  }
-
-  WRITE_UE(stream, 0, "slice_pic_parameter_set_id");
-  if (encoder_state->slice->start_in_rs > 0) {
-    //For now, we don't support dependent slice segments
-    //WRITE_U(stream, 0, 1, "dependent_slice_segment_flag");
-    WRITE_UE(stream, encoder_state->slice->start_in_rs, "slice_segment_address");
-  }
-
-  WRITE_UE(stream, encoder_state->global->slicetype, "slice_type");
-
-  // if !entropy_slice_flag
-
-    //if output_flag_present_flag
-      //WRITE_U(stream, 1, 1, "pic_output_flag");
-    //end if
-    //if( IdrPicFlag ) <- nal_unit_type == 5
-  if (encoder_state->global->pictype != NAL_IDR_W_RADL
-      && encoder_state->global->pictype != NAL_IDR_N_LP) {
-      int j;
-      int ref_negative = encoder_state->global->ref->used_size;
-      int ref_positive = 0;
-      WRITE_U(stream, encoder_state->global->poc&0xf, 4, "pic_order_cnt_lsb");
-      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
-      WRITE_UE(stream, ref_negative, "num_negative_pics");
-      WRITE_UE(stream, ref_positive, "num_positive_pics");
-
-    for (j = 0; j < ref_negative; j++) {
-      int32_t delta_poc_minus1 = 0;
-      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
-      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
-    }
-
-    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
-  }
-
-    //end if
-  //end if
-  if (encoder->sao_enable) {
-    WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
-    WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
-  }
-
-  if (encoder_state->global->slicetype != SLICE_I) {
-      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
-        WRITE_UE(stream, encoder_state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
-      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
-  }
-
-  if (encoder_state->global->slicetype == SLICE_B) {
-      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
-  }
-
-  // Skip flags that are not present
-  // if !entropy_slice_flag
-    WRITE_SE(stream, 0, "slice_qp_delta");
-    //WRITE_U(stream, 1, 1, "alignment");
-   
-  if (encoder->tiles_enable || encoder->wpp) {
-    int num_entry_points = 0;
-    int max_length_seen = 0;
-    
-    encoder_state_entry_points_explore(encoder_state, &num_entry_points, &max_length_seen);
-    
-    WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets");
-    if (num_entry_points > 0) {
-      int entry_points_written = 0;
-      int offset_len = num_bitcount(max_length_seen) + 1;
-      WRITE_UE(stream, offset_len - 1, "offset_len_minus1");
-      encoder_state_write_bitstream_entry_points_write(stream, encoder_state, num_entry_points, offset_len, &entry_points_written); 
-    }
-  }
-}
-
-static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
-  const encoder_control * const encoder = main_state->encoder_control;
-  bitstream * const stream = &main_state->stream;
-
-  int i;
-
-  if (main_state->global->is_radl_frame) {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      encoder_state_write_bitstream_aud(main_state);
-
-    // Video Parameter Set (VPS)
-    nal_write(stream, NAL_VPS_NUT, 0, 1);
-    encoder_state_write_bitstream_vid_parameter_set(main_state);
-    bitstream_align(stream);
-
-    // Sequence Parameter Set (SPS)
-    nal_write(stream, NAL_SPS_NUT, 0, 1);
-    encoder_state_write_bitstream_seq_parameter_set(main_state);
-    bitstream_align(stream);
-
-    // Picture Parameter Set (PPS)
-    nal_write(stream, NAL_PPS_NUT, 0, 1);
-    encoder_state_write_bitstream_pic_parameter_set(main_state);
-    bitstream_align(stream);
-
-    if (main_state->global->frame == 0) {
-      // Prefix SEI
-      nal_write(stream, PREFIX_SEI_NUT, 0, 0);
-      encoder_state_write_bitstream_prefix_sei_version(main_state);
-      bitstream_align(stream);
-    }
-  } else {
-    // Access Unit Delimiter (AUD)
-    if (encoder->aud_enable)
-      encoder_state_write_bitstream_aud(main_state);
-  }
-
-  {
-    // Not quite sure if this is correct, but it seems to have worked so far
-    // so I tried to not change it's behavior.
-    int long_start_code = main_state->global->is_radl_frame || encoder->aud_enable ? 0 : 1;
-
-    nal_write(stream,
-              main_state->global->is_radl_frame ? NAL_IDR_W_RADL : NAL_TRAIL_R, 0, long_start_code);
-  }
-  {
-    PERFORMANCE_MEASURE_START();
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-    //FIXME: Move this...
-    bitstream_clear(&main_state->children[i].stream);
-  }
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_append,frame=%d,type=%c", main_state->global->frame, main_state->type);
-  }
-  
-  {
-    PERFORMANCE_MEASURE_START();
-    // Calculate checksum
-    add_checksum(main_state);
-    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
-  }
-  
-  //FIXME: Why is this needed?
-  main_state->tile->cur_pic->poc = main_state->global->poc;
-}
-
-static void encoder_state_worker_write_bitstream_leaf(void * opaque) {
-  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
-}
-
-static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
-  const encoder_control * const encoder = encoder_state->encoder_control;
-  //Write terminator of the leaf
-  assert(encoder_state->is_leaf);
-  
-  //Last LCU
-  {
-    const lcu_order_element * const lcu = &encoder_state->lcu_order[encoder_state->lcu_order_count - 1];
-    const int lcu_addr_in_ts = lcu->id + encoder_state->tile->lcu_offset_in_ts;
-    const int end_of_slice_segment_flag = lcu_at_slice_end(encoder, lcu_addr_in_ts);
-  
-    cabac_encode_bin_trm(&encoder_state->cabac, end_of_slice_segment_flag);  // end_of_slice_segment_flag
-  
-    if (!end_of_slice_segment_flag) {
-      assert(lcu_at_tile_end(encoder, lcu_addr_in_ts) || lcu->position.x == (encoder_state->tile->cur_pic->width_in_lcu - 1));
-      cabac_encode_bin_trm(&encoder_state->cabac, 1); // end_of_sub_stream_one_bit == 1
-      cabac_flush(&encoder_state->cabac);
-    } else {
-      cabac_flush(&encoder_state->cabac);
-      bitstream_align(&encoder_state->stream);
-    }
-  }
-}
-
-static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
-  //If it's not a leaf, a tile is "nothing". We only have to write sub elements
-  int i;
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-  }
-}
-
-static void encoder_state_write_bitstream_slice(encoder_state * const main_state) {
-  int i;
-  encoder_state_write_bitstream_slice_header(main_state);
-  bitstream_align(&main_state->stream); 
-  
-  for (i = 0; main_state->children[i].encoder_control; ++i) {
-    //Append bitstream to main stream
-    bitstream_append(&main_state->stream, &main_state->children[i].stream);
-  }
-}
-
-
-static void encoder_state_write_bitstream(encoder_state * const main_state) {
-  int i;
-  if (!main_state->is_leaf) {
-    for (i=0; main_state->children[i].encoder_control; ++i) {
-      encoder_state *sub_state = &(main_state->children[i]);
-      encoder_state_write_bitstream(sub_state);
-    }
-    
-    switch (main_state->type) {
-      case ENCODER_STATE_TYPE_MAIN:
-        encoder_state_write_bitstream_main(main_state);
-        break;
-      case ENCODER_STATE_TYPE_TILE:
-        encoder_state_write_bitstream_tile(main_state);
-        break;
-      case ENCODER_STATE_TYPE_SLICE:
-        encoder_state_write_bitstream_slice(main_state);
-        break;
-      default:
-        fprintf(stderr, "Unsupported node type %c!\n", main_state->type);
-        assert(0);
-    }
-  }
-}
-
-
-
-
-
-
-
-
-
-
-static void encode_VUI(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  const encoder_control * const encoder = encoder_state->encoder_control;
-#ifdef _DEBUG
-  printf("=========== VUI Set ID: 0 ===========\n");
-#endif
-  if (encoder->vui.sar_width > 0 && encoder->vui.sar_height > 0) {
-    int i;
-    static const struct
-    {
-      uint8_t width;
-      uint8_t height;
-      uint8_t idc;
-    } sar[] = {
-      // aspect_ratio_idc = 0 -> unspecified
-      {  1,  1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 },
-      { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 },
-      { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12},
-      {160, 99, 13}, {  4,  3, 14}, {  3,  2, 15}, {  2,  1, 16},
-      // aspect_ratio_idc = [17..254] -> reserved
-      { 0, 0, 255 }
-    };
-
-    for (i = 0; sar[i].idc != 255; i++)
-      if (sar[i].width  == encoder->vui.sar_width &&
-          sar[i].height == encoder->vui.sar_height)
-        break;
-
-    WRITE_U(stream, 1, 1, "aspect_ratio_info_present_flag");
-    WRITE_U(stream, sar[i].idc, 8, "aspect_ratio_idc");
-    if (sar[i].idc == 255) {
-      // EXTENDED_SAR
-      WRITE_U(stream, encoder->vui.sar_width, 16, "sar_width");
-      WRITE_U(stream, encoder->vui.sar_height, 16, "sar_height");
-    }
-  } else
-    WRITE_U(stream, 0, 1, "aspect_ratio_info_present_flag");
-
-  //IF aspect ratio info
-  //ENDIF
-
-  if (encoder->vui.overscan > 0) {
-    WRITE_U(stream, 1, 1, "overscan_info_present_flag");
-    WRITE_U(stream, encoder->vui.overscan - 1, 1, "overscan_appropriate_flag");
-  } else
-    WRITE_U(stream, 0, 1, "overscan_info_present_flag");
-
-  //IF overscan info
-  //ENDIF
-
-  if (encoder->vui.videoformat != 5 || encoder->vui.fullrange ||
-      encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
-      encoder->vui.colormatrix != 2) {
-    WRITE_U(stream, 1, 1, "video_signal_type_present_flag");
-    WRITE_U(stream, encoder->vui.videoformat, 3, "video_format");
-    WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag");
-
-    if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 ||
-        encoder->vui.colormatrix != 2) {
-      WRITE_U(stream, 1, 1, "colour_description_present_flag");
-      WRITE_U(stream, encoder->vui.colorprim, 8, "colour_primaries");
-      WRITE_U(stream, encoder->vui.transfer, 8, "transfer_characteristics");
-      WRITE_U(stream, encoder->vui.colormatrix, 8, "matrix_coeffs");
-    } else
-      WRITE_U(stream, 0, 1, "colour_description_present_flag");
-  } else
-    WRITE_U(stream, 0, 1, "video_signal_type_present_flag");
-
-  //IF video type
-  //ENDIF
-
-  if (encoder->vui.chroma_loc > 0) {
-    WRITE_U(stream, 1, 1, "chroma_loc_info_present_flag");
-    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_top_field");
-    WRITE_UE(stream, encoder->vui.chroma_loc, "chroma_sample_loc_type_bottom_field");
-  } else
-    WRITE_U(stream, 0, 1, "chroma_loc_info_present_flag");
-
-  //IF chroma loc info
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "neutral_chroma_indication_flag");
-  WRITE_U(stream, 0, 1, "field_seq_flag");
-  WRITE_U(stream, 0, 1, "frame_field_info_present_flag");
-  WRITE_U(stream, 0, 1, "default_display_window_flag");
-
-  //IF default display window
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "vui_timing_info_present_flag");
-
-  //IF timing info
-  //ENDIF
-
-  WRITE_U(stream, 0, 1, "bitstream_restriction_flag");
-
-  //IF bitstream restriction
-  //ENDIF
-}
-
-/******  END Functions writing bitstream parts ******/
+//Constructors/destructors
+#include "_encoderstatectorsdtors.c"
 
+//Functions writing bitstream parts
+#include "_encoderstatebitstream.c"
 
 static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
   const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;

From b8acdc784a20fb7db94b85dd06b0b930cf8296fe Mon Sep 17 00:00:00 2001
From: Laurent Fasnacht <laurent.fasnacht@ces.ch>
Date: Tue, 3 Jun 2014 15:02:14 +0200
Subject: [PATCH 10/15] Fix compilation of encoder.c with -D_DEBUG

---
 src/encoder.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/encoder.c b/src/encoder.c
index 96451f0e..dc55b0a8 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -44,6 +44,26 @@
 #include "rdo.h"
 
 /* Local functions. */
+static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  int i;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == 0) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
+  }
+  return 0;
+}
+
+static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+  int i;
+  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
+  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
+  for (i = 0; i < encoder->slice_count; ++i) {
+    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
+  }
+  return 0;
+}
+
 
 int encoder_control_init(encoder_control * const encoder, const config * const cfg) {
   if (!cfg) {

From bb47534b8853983331e03127a621bda75f4a0668 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 4 Jun 2014 17:45:46 +0300
Subject: [PATCH 11/15] Make encoder_state .c files their own compilation
 units.

- It's good that this module has been chopped to smaller pieces, but lets
  avoid including .c files unless we really have to. These make pretty good
  submodules on their own so just make them their own compilation units.

- Move some stuff around to avoid having to forward declare them
  in encoderstate.c.
---
 src/encoder.h                                 |   2 +-
 ...ebitstream.c => encoder_state-bitstream.c} |  52 +++-
 src/encoder_state-bitstream.h                 |  40 ++++
 ...orsdtors.c => encoder_state-ctors_dtors.c} |   7 +-
 src/encoder_state-ctors_dtors.h               |  39 +++
 ...ategeometry.c => encoder_state-geometry.c} |  52 ++--
 src/encoder_state-geometry.h                  |  45 ++++
 src/encoderstate.c                            | 223 +++++++-----------
 src/encoderstate.h                            |  14 +-
 src/global.h                                  |   1 +
 10 files changed, 301 insertions(+), 174 deletions(-)
 rename src/{_encoderstatebitstream.c => encoder_state-bitstream.c} (95%)
 create mode 100644 src/encoder_state-bitstream.h
 rename src/{_encoderstatectorsdtors.c => encoder_state-ctors_dtors.c} (99%)
 create mode 100644 src/encoder_state-ctors_dtors.h
 rename src/{_encoderstategeometry.c => encoder_state-geometry.c} (84%)
 create mode 100644 src/encoder_state-geometry.h

diff --git a/src/encoder.h b/src/encoder.h
index e8e3b761..731c557f 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -46,7 +46,7 @@ typedef struct
 enum { FORMAT_400 = 0, FORMAT_420, FORMAT_422, FORMAT_444 };
 
 /* Encoder control options, the main struct */
-typedef struct
+typedef struct encoder_control
 {
   /* Configuration */
   const config *cfg;
diff --git a/src/_encoderstatebitstream.c b/src/encoder_state-bitstream.c
similarity index 95%
rename from src/_encoderstatebitstream.c
rename to src/encoder_state-bitstream.c
index 906afc0f..0bd72b94 100644
--- a/src/_encoderstatebitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -17,7 +17,11 @@
  * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-//This file MUST NOT BE COMPILED directly. It's included in encoderstate.c
+#include "encoder_state-bitstream.h"
+
+#include "encoderstate.h"
+#include "nal.h"
+
 
 static void encoder_state_write_bitstream_access_unit_delimiter(encoder_state * const encoder_state)
 {
@@ -632,6 +636,39 @@ void encoder_state_write_bitstream_slice_header(encoder_state * const encoder_st
   }
 }
 
+
+/**
+ * \brief Add a checksum SEI message to the bitstream.
+ * \param encoder The encoder.
+ * \returns Void
+ */
+static void add_checksum(encoder_state * const encoder_state)
+{
+  bitstream * const stream = &encoder_state->stream;
+  const picture * const cur_pic = encoder_state->tile->cur_pic;
+  unsigned char checksum[3][SEI_HASH_MAX_LENGTH];
+  uint32_t checksum_val;
+  unsigned int i;
+
+  nal_write(stream, NAL_SUFFIT_SEI_NUT, 0, 0);
+
+  picture_checksum(cur_pic, checksum);
+
+  WRITE_U(stream, 132, 8, "sei_type");
+  WRITE_U(stream, 13, 8, "size");
+  WRITE_U(stream, 2, 8, "hash_type"); // 2 = checksum
+
+  for (i = 0; i < 3; ++i) {
+    // Pack bits into a single 32 bit uint instead of pushing them one byte
+    // at a time.
+    checksum_val = (checksum[i][0] << 24) + (checksum[i][1] << 16) +
+                   (checksum[i][2] << 8) + (checksum[i][3]);
+    WRITE_U(stream, checksum_val, 32, "picture_checksum");
+  }
+
+  bitstream_align(stream);
+}
+
 static void encoder_state_write_bitstream_main(encoder_state * const main_state) {
   const encoder_control * const encoder = main_state->encoder_control;
   bitstream * const stream = &main_state->stream;
@@ -700,11 +737,7 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
   main_state->tile->cur_pic->poc = main_state->global->poc;
 }
 
-static void encoder_state_worker_write_bitstream_leaf(void * opaque) {
-  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
-}
-
-static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
+void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
   const encoder_control * const encoder = encoder_state->encoder_control;
   //Write terminator of the leaf
   assert(encoder_state->is_leaf);
@@ -728,6 +761,11 @@ static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_sta
   }
 }
 
+
+void encoder_state_worker_write_bitstream_leaf(void * opaque) {
+  encoder_state_write_bitstream_leaf((encoder_state *) opaque);
+}
+
 static void encoder_state_write_bitstream_tile(encoder_state * const main_state) {
   //If it's not a leaf, a tile is "nothing". We only have to write sub elements
   int i;
@@ -749,7 +787,7 @@ static void encoder_state_write_bitstream_slice(encoder_state * const main_state
 }
 
 
-static void encoder_state_write_bitstream(encoder_state * const main_state) {
+void encoder_state_write_bitstream(encoder_state * const main_state) {
   int i;
   if (!main_state->is_leaf) {
     for (i=0; main_state->children[i].encoder_control; ++i) {
diff --git a/src/encoder_state-bitstream.h b/src/encoder_state-bitstream.h
new file mode 100644
index 00000000..2eeeb315
--- /dev/null
+++ b/src/encoder_state-bitstream.h
@@ -0,0 +1,40 @@
+#ifndef ENCODER_STATE_BITSTREAM_H_
+#define ENCODER_STATE_BITSTREAM_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ */
+
+#include "global.h"
+
+
+// Forward declare because including the header would lead  to a cyclic
+// dependency.
+struct encoder_state;
+
+
+void encoder_state_write_bitstream_slice_header(struct encoder_state * const encoder_state);
+void encoder_state_write_bitstream(struct encoder_state * const main_state);
+void encoder_state_write_bitstream_leaf(struct encoder_state * const encoder_state);
+void encoder_state_worker_write_bitstream_leaf(void * opaque);
+
+
+#endif // ENCODER_STATE_BITSTREAM_H_
diff --git a/src/_encoderstatectorsdtors.c b/src/encoder_state-ctors_dtors.c
similarity index 99%
rename from src/_encoderstatectorsdtors.c
rename to src/encoder_state-ctors_dtors.c
index 44e4c0fd..50f062fb 100644
--- a/src/_encoderstatectorsdtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -17,7 +17,12 @@
  * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-//This file MUST NOT BE COMPILED directly. It's included in encoderstate.c
+#include "encoder_state-ctors_dtors.h"
+
+#include <stdlib.h>
+
+#include "encoderstate.h"
+
 
 static int encoder_state_config_global_init(encoder_state * const encoder_state) {
   encoder_state->global->ref = picture_list_init(MAX_REF_PIC_COUNT);
diff --git a/src/encoder_state-ctors_dtors.h b/src/encoder_state-ctors_dtors.h
new file mode 100644
index 00000000..b62de6e5
--- /dev/null
+++ b/src/encoder_state-ctors_dtors.h
@@ -0,0 +1,39 @@
+#ifndef ENCODER_STATE_CTORS_DTORS_H_
+#define ENCODER_STATE_CTORS_DTORS_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ */
+
+#include "global.h"
+
+
+// Forward declare because including the header would lead  to a cyclic
+// dependency.
+struct encoder_state;
+
+
+int encoder_state_init(struct encoder_state * child_state, struct encoder_state * parent_state);
+void encoder_state_finalize(struct encoder_state *encoder_state);
+void encoder_state_init_lambda(struct encoder_state *encoder_state);
+
+
+#endif // ENCODER_STATE_CTORS_DTORS_H_
diff --git a/src/_encoderstategeometry.c b/src/encoder_state-geometry.c
similarity index 84%
rename from src/_encoderstategeometry.c
rename to src/encoder_state-geometry.c
index faffbc2c..dacd75ae 100644
--- a/src/_encoderstategeometry.c
+++ b/src/encoder_state-geometry.c
@@ -17,9 +17,13 @@
  * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
-//This file MUST NOT BE COMPILED directly. It's included in encoderstate.c
+#include "encoder_state-geometry.h"
 
-static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+#include "encoderstate.h"
+
+
+
+int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
   int i;
   assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
   if (lcu_addr_in_ts == 0) return 1;
@@ -29,7 +33,7 @@ static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_add
   return 0;
 }
 
-static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
   int i;
   assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
   if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
@@ -39,7 +43,7 @@ static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_
   return 0;
 }
 
-static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
+int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
   assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
   if (lcu_addr_in_ts == 0) return 1;
   if (encoder->tiles_tile_id[lcu_addr_in_ts - 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
@@ -48,7 +52,7 @@ static int lcu_at_tile_start(const encoder_control * const encoder, int lcu_addr
   return 0;
 }
 
-static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
+int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
   assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
   if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
   if (encoder->tiles_tile_id[lcu_addr_in_ts + 1] != encoder->tiles_tile_id[lcu_addr_in_ts]) {
@@ -58,76 +62,76 @@ static int lcu_at_tile_end(const encoder_control * const encoder, int lcu_addr_i
 }
 
 //Return 1 if the LCU is at the first row of a structure (tile or slice)
-static int lcu_in_first_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+int lcu_in_first_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
   const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
+
   if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y) {
     return 1;
   }
-  
+
   if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->start_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
     return 1;
   }
-  
+
   //One row above is before the start of the slice => it's also a boundary
   if (lcu_addr_in_rs - encoder_state->encoder_control->in.width_in_lcu < encoder_state->slice->start_in_rs) {
     return 1;
   }
-  
+
   return 0;
 }
 
 //Return 1 if the LCU is at the first row of a structure (tile or slice)
-static int lcu_in_last_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+int lcu_in_last_row(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
   const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
+
   if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_y + encoder_state->tile->cur_pic->height_in_lcu - 1) {
     return 1;
   }
-  
+
   if (lcu_addr_in_rs / encoder_state->encoder_control->in.width_in_lcu == encoder_state->slice->end_in_rs / encoder_state->encoder_control->in.width_in_lcu) {
     return 1;
   }
-  
+
   //One row below is before the end of the slice => it's also a boundary
   if (lcu_addr_in_rs + encoder_state->encoder_control->in.width_in_lcu > encoder_state->slice->end_in_rs) {
     return 1;
   }
-  
+
   return 0;
 }
 
 
 //Return 1 if the LCU is at the first column of a structure (tile or slice)
-static int lcu_in_first_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+int lcu_in_first_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
   const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
+
   //First column of tile?
   if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x) {
     return 1;
   }
-  
+
   //Slice start may not be aligned with the tile, so we need to allow this
   if (lcu_addr_in_rs == encoder_state->slice->start_in_rs) {
     return 1;
   }
-  
+
   return 0;
 }
 
 //Return 1 if the LCU is at the last column of a structure (tile or slice)
-static int lcu_in_last_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
+int lcu_in_last_column(const encoder_state * const encoder_state, int lcu_addr_in_ts) {
   const int lcu_addr_in_rs = encoder_state->encoder_control->tiles_ctb_addr_ts_to_rs[lcu_addr_in_ts];
-  
+
   //First column of tile?
   if (lcu_addr_in_rs % encoder_state->encoder_control->in.width_in_lcu == encoder_state->tile->lcu_offset_x + encoder_state->tile->cur_pic->width_in_lcu - 1) {
     return 1;
   }
-  
+
   //Slice start may not be aligned with the tile, so we need to allow this
   if (lcu_addr_in_rs == encoder_state->slice->end_in_rs) {
     return 1;
   }
-  
+
   return 0;
-}
\ No newline at end of file
+}
diff --git a/src/encoder_state-geometry.h b/src/encoder_state-geometry.h
new file mode 100644
index 00000000..8c1729d0
--- /dev/null
+++ b/src/encoder_state-geometry.h
@@ -0,0 +1,45 @@
+#ifndef ENCODER_STATE_GEOMETRY_H_
+#define ENCODER_STATE_GEOMETRY_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2014 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/*
+ * \file
+ */
+
+#include "global.h"
+
+
+// Forward declare because including the header would lead  to a cyclic
+// dependency.
+struct encoder_control;
+struct encoder_state;
+
+
+int lcu_at_slice_start(const struct  encoder_control * const encoder, int lcu_addr_in_ts);
+int lcu_at_slice_end(const struct  encoder_control * const encoder, int lcu_addr_in_ts);
+int lcu_at_tile_start(const struct  encoder_control * const encoder, int lcu_addr_in_ts);
+int lcu_at_tile_end(const struct  encoder_control * const encoder, int lcu_addr_in_ts);
+int lcu_in_first_row(const struct encoder_state * const encoder_state, int lcu_addr_in_ts);
+int lcu_in_last_row(const struct encoder_state * const encoder_state, int lcu_addr_in_ts);
+int lcu_in_first_column(const struct encoder_state * const encoder_state, int lcu_addr_in_ts);
+int lcu_in_last_column(const struct encoder_state * const encoder_state, int lcu_addr_in_ts);
+
+
+#endif // ENCODER_STATE_GEOMETRY_H_
diff --git a/src/encoderstate.c b/src/encoderstate.c
index f6f42227..3025b885 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -43,14 +43,6 @@
 #include "sao.h"
 #include "rdo.h"
 
-/* Local functions. */
-static void add_checksum(encoder_state *encoder);
-static void encode_sao(encoder_state *encoder,
-                       unsigned x_lcu, uint16_t y_lcu,
-                       sao_info *sao_luma, sao_info *sao_chroma);
-
-static void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state);
-static void encoder_state_worker_write_bitstream_leaf(void * opaque);
 
 /*!
   \brief Initializes lambda-value for current QP
@@ -81,15 +73,6 @@ void encoder_state_init_lambda(encoder_state * const encoder_state)
   encoder_state->global->cur_lambda_cost = lambda;
 }
 
-//Functions to obtain geometry information from LCU
-#include "_encoderstategeometry.c"
-
-//Constructors/destructors
-#include "_encoderstatectorsdtors.c"
-
-//Functions writing bitstream parts
-#include "_encoderstatebitstream.c"
-
 static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
   const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
   const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
@@ -183,6 +166,91 @@ static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, c
 }
 
 
+static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
+                             color_index color_i)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  sao_eo_cat i;
+
+  // Skip colors with no SAO.
+  //FIXME: for now, we always have SAO for all channels
+  if (color_i == COLOR_Y && 0) return;
+  if (color_i != COLOR_Y && 0) return;
+
+  /// sao_type_idx_luma:   TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
+  /// sao_type_idx_chroma: TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
+  // Encode sao_type_idx for Y and U+V.
+  if (color_i != COLOR_V) {
+    cabac->ctx = &(cabac->ctx_sao_type_idx_model);;
+    CABAC_BIN(cabac, sao->type != SAO_TYPE_NONE, "sao_type_idx");
+    if (sao->type == SAO_TYPE_BAND) {
+      CABAC_BIN_EP(cabac, 0, "sao_type_idx_ep");
+    } else if (sao->type == SAO_TYPE_EDGE) {
+      CABAC_BIN_EP(cabac, 1, "sao_type_idx_ep");
+    }
+  }
+
+  if (sao->type == SAO_TYPE_NONE) return;
+
+  /// sao_offset_abs[][][][]: TR, cMax = (1 << (Min(bitDepth, 10) - 5)) - 1,
+  ///                         cRiceParam = 0, bins = {bypass x N}
+  for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
+    cabac_write_unary_max_symbol_ep(cabac, abs(sao->offsets[i]), SAO_ABS_OFFSET_MAX);
+  }
+
+  /// sao_offset_sign[][][][]: FL, cMax = 1, bins = {bypass}
+  /// sao_band_position[][][]: FL, cMax = 31, bins = {bypass x N}
+  /// sao_eo_class_luma:       FL, cMax = 3, bins = {bypass x 3}
+  /// sao_eo_class_chroma:     FL, cMax = 3, bins = {bypass x 3}
+  if (sao->type == SAO_TYPE_BAND) {
+    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
+      // Positive sign is coded as 0.
+      if(sao->offsets[i] != 0) {
+        CABAC_BIN_EP(cabac, sao->offsets[i] < 0 ? 1 : 0, "sao_offset_sign");
+      }
+    }
+    // TODO: sao_band_position
+    // FL cMax=31 (5 bits)
+    CABAC_BINS_EP(cabac, sao->band_position, 5, "sao_band_position");
+  } else if (color_i != COLOR_V) {
+    CABAC_BINS_EP(cabac, sao->eo_class, 2, "sao_eo_class");
+  }
+}
+
+static void encode_sao_merge_flags(encoder_state * const encoder_state, sao_info *sao, unsigned x_ctb, unsigned y_ctb)
+{
+  cabac_data * const cabac = &encoder_state->cabac;
+  // SAO merge flags are not present for the first row and column.
+  if (x_ctb > 0) {
+    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
+    CABAC_BIN(cabac, sao->merge_left_flag, "sao_merge_left_flag");
+  }
+  if (y_ctb > 0 && !sao->merge_left_flag) {
+    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
+    CABAC_BIN(cabac, sao->merge_up_flag, "sao_merge_up_flag");
+  }
+}
+
+
+/**
+ * \brief Encode SAO information.
+ */
+static void encode_sao(encoder_state * const encoder_state,
+                       unsigned x_lcu, uint16_t y_lcu,
+                       sao_info *sao_luma, sao_info *sao_chroma)
+{
+  // TODO: transmit merge flags outside sao_info
+  encode_sao_merge_flags(encoder_state, sao_luma, x_lcu, y_lcu);
+
+  // If SAO is merged, nothing else needs to be coded.
+  if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) {
+    encode_sao_color(encoder_state, sao_luma, COLOR_Y);
+    encode_sao_color(encoder_state, sao_chroma, COLOR_U);
+    encode_sao_color(encoder_state, sao_chroma, COLOR_V);
+  }
+}
+
+
 static void encoder_state_worker_encode_lcu(void * opaque) {
   const lcu_order_element * const lcu = opaque;
   encoder_state *encoder_state = lcu->encoder_state;
@@ -674,39 +742,6 @@ int read_one_frame(FILE* file, const encoder_state * const encoder_state)
   return 1;
 }
 
-/**
- * \brief Add a checksum SEI message to the bitstream.
- * \param encoder The encoder.
- * \returns Void
- */
-static void add_checksum(encoder_state * const encoder_state)
-{
-  bitstream * const stream = &encoder_state->stream;
-  const picture * const cur_pic = encoder_state->tile->cur_pic;
-  unsigned char checksum[3][SEI_HASH_MAX_LENGTH];
-  uint32_t checksum_val;
-  unsigned int i;
-
-  nal_write(stream, NAL_SUFFIT_SEI_NUT, 0, 0);
-
-  picture_checksum(cur_pic, checksum);
-
-  WRITE_U(stream, 132, 8, "sei_type");
-  WRITE_U(stream, 13, 8, "size");
-  WRITE_U(stream, 2, 8, "hash_type"); // 2 = checksum
-
-  for (i = 0; i < 3; ++i) {
-    // Pack bits into a single 32 bit uint instead of pushing them one byte
-    // at a time.
-    checksum_val = (checksum[i][0] << 24) + (checksum[i][1] << 16) +
-                   (checksum[i][2] << 8) + (checksum[i][3]);
-    WRITE_U(stream, checksum_val, 32, "picture_checksum");
-  }
-
-  bitstream_align(stream);
-}
-
-
 
 void encoder_next_frame(encoder_state *encoder_state) {
   const encoder_control * const encoder = encoder_state->encoder_control;
@@ -736,94 +771,6 @@ void encoder_next_frame(encoder_state *encoder_state) {
 }
 
 
-
-
-
-
-static void encode_sao_color(encoder_state * const encoder_state, sao_info *sao,
-                             color_index color_i)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  sao_eo_cat i;
-
-  // Skip colors with no SAO.
-  //FIXME: for now, we always have SAO for all channels
-  if (color_i == COLOR_Y && 0) return;
-  if (color_i != COLOR_Y && 0) return;
-
-  /// sao_type_idx_luma:   TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
-  /// sao_type_idx_chroma: TR, cMax = 2, cRiceParam = 0, bins = {0, bypass}
-  // Encode sao_type_idx for Y and U+V.
-  if (color_i != COLOR_V) {
-    cabac->ctx = &(cabac->ctx_sao_type_idx_model);;
-    CABAC_BIN(cabac, sao->type != SAO_TYPE_NONE, "sao_type_idx");
-    if (sao->type == SAO_TYPE_BAND) {
-      CABAC_BIN_EP(cabac, 0, "sao_type_idx_ep");
-    } else if (sao->type == SAO_TYPE_EDGE) {
-      CABAC_BIN_EP(cabac, 1, "sao_type_idx_ep");
-    }
-  }
-
-  if (sao->type == SAO_TYPE_NONE) return;
-
-  /// sao_offset_abs[][][][]: TR, cMax = (1 << (Min(bitDepth, 10) - 5)) - 1,
-  ///                         cRiceParam = 0, bins = {bypass x N}
-  for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
-    cabac_write_unary_max_symbol_ep(cabac, abs(sao->offsets[i]), SAO_ABS_OFFSET_MAX);
-  }
-
-  /// sao_offset_sign[][][][]: FL, cMax = 1, bins = {bypass}
-  /// sao_band_position[][][]: FL, cMax = 31, bins = {bypass x N}
-  /// sao_eo_class_luma:       FL, cMax = 3, bins = {bypass x 3}
-  /// sao_eo_class_chroma:     FL, cMax = 3, bins = {bypass x 3}
-  if (sao->type == SAO_TYPE_BAND) {
-    for (i = SAO_EO_CAT1; i <= SAO_EO_CAT4; ++i) {
-      // Positive sign is coded as 0.
-      if(sao->offsets[i] != 0) {
-        CABAC_BIN_EP(cabac, sao->offsets[i] < 0 ? 1 : 0, "sao_offset_sign");
-      }
-    }
-    // TODO: sao_band_position
-    // FL cMax=31 (5 bits)
-    CABAC_BINS_EP(cabac, sao->band_position, 5, "sao_band_position");
-  } else if (color_i != COLOR_V) {
-    CABAC_BINS_EP(cabac, sao->eo_class, 2, "sao_eo_class");
-  }
-}
-
-static void encode_sao_merge_flags(encoder_state * const encoder_state, sao_info *sao, unsigned x_ctb, unsigned y_ctb)
-{
-  cabac_data * const cabac = &encoder_state->cabac;
-  // SAO merge flags are not present for the first row and column.
-  if (x_ctb > 0) {
-    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
-    CABAC_BIN(cabac, sao->merge_left_flag, "sao_merge_left_flag");
-  }
-  if (y_ctb > 0 && !sao->merge_left_flag) {
-    cabac->ctx = &(cabac->ctx_sao_merge_flag_model);
-    CABAC_BIN(cabac, sao->merge_up_flag, "sao_merge_up_flag");
-  }
-}
-
-/**
- * \brief Encode SAO information.
- */
-static void encode_sao(encoder_state * const encoder_state,
-                       unsigned x_lcu, uint16_t y_lcu,
-                       sao_info *sao_luma, sao_info *sao_chroma)
-{
-  // TODO: transmit merge flags outside sao_info
-  encode_sao_merge_flags(encoder_state, sao_luma, x_lcu, y_lcu);
-
-  // If SAO is merged, nothing else needs to be coded.
-  if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) {
-    encode_sao_color(encoder_state, sao_luma, COLOR_Y);
-    encode_sao_color(encoder_state, sao_chroma, COLOR_U);
-    encode_sao_color(encoder_state, sao_chroma, COLOR_V);
-  }
-}
-
-
 void encode_coding_tree(encoder_state * const encoder_state,
                         uint16_t x_ctb, uint16_t y_ctb, uint8_t depth)
 {
diff --git a/src/encoderstate.h b/src/encoderstate.h
index da04ad67..5f57bf6c 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -35,6 +35,16 @@
 #include "scalinglist.h"
 #include "threadqueue.h"
 
+
+// Submodules
+// Functions to obtain geometry information from LCU
+#include "encoder_state-geometry.h"
+// Constructors/destructors
+#include "encoder_state-ctors_dtors.h"
+// Functions writing bitstream parts
+#include "encoder_state-bitstream.h"
+
+
 typedef enum {
   ENCODER_STATE_TYPE_INVALID = 'i',
   ENCODER_STATE_TYPE_MAIN = 'M',
@@ -148,9 +158,7 @@ typedef struct encoder_state {
   cabac_data cabac;
 } encoder_state;
 
-int encoder_state_init(encoder_state * child_state, encoder_state * parent_state);
-void encoder_state_finalize(encoder_state *encoder_state);
-void encoder_state_init_lambda(encoder_state *encoder_state);
+
 
 void encode_one_frame(encoder_state *encoder_state);
 int read_one_frame(FILE* file, const encoder_state *encoder);
diff --git a/src/global.h b/src/global.h
index 5d51911f..961e27a1 100644
--- a/src/global.h
+++ b/src/global.h
@@ -28,6 +28,7 @@
  * so that any file can refer to integer types with exact widths.
  */
 
+#include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <limits.h>

From 575652fe5064943119cc1f97dfc029d1737d4016 Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 4 Jun 2014 17:46:26 +0300
Subject: [PATCH 12/15] Add missing files to VS project.

---
 build/kvazaar_lib/kvazaar_lib.vcxproj         | 19 ++++++++
 build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 48 +++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj
index 92789485..10e332fe 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj
@@ -79,6 +79,10 @@
       <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads\x64</AdditionalLibraryDirectories>
       <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies>
     </Lib>
+    <ClCompile>
+      <DisableSpecificWarnings>
+      </DisableSpecificWarnings>
+    </ClCompile>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
     <YASM>
@@ -114,6 +118,10 @@
     <ClCompile Include="..\..\src\context.c" />
     <ClCompile Include="..\..\src\encmain.c" />
     <ClCompile Include="..\..\src\encoder.c" />
+    <ClCompile Include="..\..\src\encoderstate.c" />
+    <ClCompile Include="..\..\src\encoder_state-bitstream.c" />
+    <ClCompile Include="..\..\src\encoder_state-ctors_dtors.c" />
+    <ClCompile Include="..\..\src\encoder_state-geometry.c" />
     <ClCompile Include="..\..\src\extras\getopt.c" />
     <ClCompile Include="..\..\src\filter.c" />
     <ClCompile Include="..\..\src\inter.c" />
@@ -124,6 +132,15 @@
     <ClCompile Include="..\..\src\sao.c" />
     <ClCompile Include="..\..\src\scalinglist.c" />
     <ClCompile Include="..\..\src\search.c" />
+    <ClInclude Include="..\..\src\encoder_state-bitstream.h" />
+    <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h" />
+    <ClInclude Include="..\..\src\encoder_state-geometry.h" />
+    <ClInclude Include="..\..\src\strategies\nal-generic.c" />
+    <ClInclude Include="..\..\src\strategies\nal.c" />
+    <ClInclude Include="..\..\src\strategies\picture-generic.c" />
+    <ClInclude Include="..\..\src\strategies\picture-sse2.c" />
+    <ClInclude Include="..\..\src\strategies\picture-sse41.c" />
+    <ClInclude Include="..\..\src\strategies\picture.c" />
     <ClCompile Include="..\..\src\strategyselector.c" />
     <ClCompile Include="..\..\src\tables.c" />
     <ClCompile Include="..\..\src\threadqueue.c" />
@@ -135,6 +152,7 @@
     <ClInclude Include="..\..\src\config.h" />
     <ClInclude Include="..\..\src\context.h" />
     <ClInclude Include="..\..\src\encoder.h" />
+    <ClInclude Include="..\..\src\encoderstate.h" />
     <ClInclude Include="..\..\src\extras\getopt.h" />
     <ClInclude Include="..\..\src\filter.h" />
     <ClInclude Include="..\..\src\global.h" />
@@ -146,6 +164,7 @@
     <ClInclude Include="..\..\src\sao.h" />
     <ClInclude Include="..\..\src\scalinglist.h" />
     <ClInclude Include="..\..\src\search.h" />
+    <ClInclude Include="..\..\src\strategies\picture.h" />
     <ClInclude Include="..\..\src\tables.h" />
     <ClInclude Include="..\..\src\threadqueue.h" />
     <ClInclude Include="..\..\src\threads.h" />
diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
index ebab9e19..0ded22ba 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
@@ -19,6 +19,9 @@
     <Filter Include="Header Files\x86">
       <UniqueIdentifier>{d1533c70-cb9a-419d-9a38-5161894ec359}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Header Files\strategies">
+      <UniqueIdentifier>{84b28f88-c0de-4ee8-8566-c2c56d2b0f6e}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\src\encmain.c">
@@ -81,6 +84,18 @@
     <ClCompile Include="..\..\src\threadqueue.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\encoder_state-bitstream.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\encoder_state-ctors_dtors.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\encoder_state-geometry.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\encoderstate.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\global.h">
@@ -146,6 +161,39 @@
     <ClInclude Include="..\..\src\threadqueue.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\strategies\picture.h">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\encoderstate.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\picture-sse41.c">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\picture-sse2.c">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\picture-generic.c">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\nal.c">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\nal-generic.c">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\strategies\picture.c">
+      <Filter>Header Files\strategies</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\encoder_state-bitstream.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\encoder_state-geometry.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\x86\cpu.asm">

From cb7d7f9e1588d0874477dca4529a13a4903e29db Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 4 Jun 2014 17:52:28 +0300
Subject: [PATCH 13/15] Update Makefile.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index b70d4ab5..e4131eca 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -53,7 +53,7 @@ CC = gcc
 CCFLAGS = $(DFLAGS) -I. -Wall -Wtype-limits
 LDFLAGS += -lm
 LD = gcc -pthread -lrt
-OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o encoder.o encoderstate.o filter.o inter.o intra.o nal.o picture.o rdo.o sao.o scalinglist.o search.o strategyselector.o tables.o threadqueue.o transform.o
+OBJS = interface_main.o encmain.o bitstream.o cabac.o config.o context.o encoder.o encoderstate.o filter.o inter.o intra.o nal.o picture.o rdo.o sao.o scalinglist.o search.o strategyselector.o tables.o threadqueue.o transform.o encoder_state-bitstream.o encoder_state-ctors_dtors.o encoder_state-geometry.o
 PROG  = ./kvazaar
 PROGS = $(PROG)
 

From 4f5c87fc5e9351bc3007d586ca587dbea9cfe19f Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 4 Jun 2014 17:56:05 +0300
Subject: [PATCH 14/15] Remove duplicate function definition.

---
 src/encoder.c | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index dc55b0a8..b061ba7c 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -43,27 +43,6 @@
 #include "sao.h"
 #include "rdo.h"
 
-/* Local functions. */
-static int lcu_at_slice_start(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  int i;
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == 0) return 1;
-  for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts) return 1;
-  }
-  return 0;
-}
-
-static int lcu_at_slice_end(const encoder_control * const encoder, int lcu_addr_in_ts) {
-  int i;
-  assert(lcu_addr_in_ts >= 0 && lcu_addr_in_ts < encoder->in.height_in_lcu * encoder->in.width_in_lcu);
-  if (lcu_addr_in_ts == encoder->in.height_in_lcu * encoder->in.width_in_lcu - 1) return 1;
-  for (i = 0; i < encoder->slice_count; ++i) {
-    if (encoder->slice_addresses_in_ts[i] == lcu_addr_in_ts + 1) return 1;
-  }
-  return 0;
-}
-
 
 int encoder_control_init(encoder_control * const encoder, const config * const cfg) {
   if (!cfg) {

From 31b1bbc21548ae5ea80ec8fc22a391ddd9589ecc Mon Sep 17 00:00:00 2001
From: Ari Koivula <ari@koivu.la>
Date: Wed, 4 Jun 2014 18:00:50 +0300
Subject: [PATCH 15/15] Address implicit declaration of warnings.

---
 src/encoder_state-bitstream.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 0bd72b94..91316c48 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -19,6 +19,8 @@
 
 #include "encoder_state-bitstream.h"
 
+#include <string.h>
+
 #include "encoderstate.h"
 #include "nal.h"