From 96a0f03298ea6d58e5c738544e80fe46dc4a0d26 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 11 Oct 2013 11:40:37 +0300
Subject: [PATCH 01/19] Refactoring encoder.c in preparation for adding
 merge-mode

---
 src/encoder.c | 187 +++++++++++++++++++++++++-------------------------
 1 file changed, 93 insertions(+), 94 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 07e4a785..142de191 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -959,14 +959,14 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
               // parseRefFrmIdx
               int32_t ref_frame = cur_cu->inter.mv_ref;
                   
-                  cabac.ctx = &g_cu_ref_pic_model[0];
+              cabac.ctx = &g_cu_ref_pic_model[0];
               CABAC_BIN(&cabac, (ref_frame == 0) ? 0 : 1, "ref_frame_flag");
     
               if (ref_frame > 0) {
                 uint32_t i;
                 uint32_t ref_num = encoder->ref_idx_num[ref_list_idx] - 2;
 
-                    cabac.ctx = &g_cu_ref_pic_model[1];
+                cabac.ctx = &g_cu_ref_pic_model[1];
                 ref_frame--;
 
                 for (i = 0; i < ref_num; ++i) {
@@ -976,14 +976,11 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
                     CABAC_BIN(&cabac, symbol, "ref_frame_flag2");
                   } else {
                     CABAC_BIN_EP(&cabac, symbol, "ref_frame_flag2");
-                      }
-
-                  if (symbol == 0) {
-                        break;
-                      }
-                    }
                   }
+                  if (symbol == 0) break;
                 }
+              }
+            }
 
             // Get MV candidates
             inter_get_mv_cand(encoder, x_ctb, y_ctb, depth, mv_cand);
@@ -992,145 +989,147 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
             cur_cu->inter.mv_ref = 0; // Default to candidate 0
 
             // Only check when candidates are different
-                if (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1]) {
+            if (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1]) {
               uint16_t cand_1_diff = abs(cur_cu->inter.mv[0] - mv_cand[0][0]) + abs(
                                        cur_cu->inter.mv[1] - mv_cand[0][1]);
               uint16_t cand_2_diff = abs(cur_cu->inter.mv[0] - mv_cand[1][0]) + abs(
                                        cur_cu->inter.mv[1] - mv_cand[1][1]);
 
               // Select candidate 1 if it's closer
-                  if (cand_2_diff < cand_1_diff) {
+              if (cand_2_diff < cand_1_diff) {
                 cur_cu->inter.mv_ref = 1;
-                  }
-                }
+              }
+            }
 
             if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ encoder->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
               const int32_t mvd_hor = cur_cu->inter.mv[0] - mv_cand[cur_cu->inter.mv_ref][0];
               const int32_t mvd_ver = cur_cu->inter.mv[1] - mv_cand[cur_cu->inter.mv_ref][1];
               const int8_t hor_abs_gr0 = mvd_hor != 0;
               const int8_t ver_abs_gr0 = mvd_ver != 0;
-                    const uint32_t mvd_hor_abs = abs(mvd_hor);
-                    const uint32_t mvd_ver_abs = abs(mvd_ver);
+              const uint32_t mvd_hor_abs = abs(mvd_hor);
+              const uint32_t mvd_ver_abs = abs(mvd_ver);
 
-                    cabac.ctx = &g_cu_mvd_model[0];
-                    CABAC_BIN(&cabac, (mvd_hor!=0)?1:0, "abs_mvd_greater0_flag_hor");
-                    CABAC_BIN(&cabac, (mvd_ver!=0)?1:0, "abs_mvd_greater0_flag_ver");
+              cabac.ctx = &g_cu_mvd_model[0];
+              CABAC_BIN(&cabac, (mvd_hor!=0)?1:0, "abs_mvd_greater0_flag_hor");
+              CABAC_BIN(&cabac, (mvd_ver!=0)?1:0, "abs_mvd_greater0_flag_ver");
 
-                    cabac.ctx = &g_cu_mvd_model[1];
+              cabac.ctx = &g_cu_mvd_model[1];
 
               if (hor_abs_gr0) {
-                      CABAC_BIN(&cabac, (mvd_hor_abs>1)?1:0, "abs_mvd_greater1_flag_hor");
-                    }
+                CABAC_BIN(&cabac, (mvd_hor_abs>1)?1:0, "abs_mvd_greater1_flag_hor");
+              }
 
               if (ver_abs_gr0) {
-                      CABAC_BIN(&cabac, (mvd_ver_abs>1)?1:0, "abs_mvd_greater1_flag_ver");
-                    }
+                CABAC_BIN(&cabac, (mvd_ver_abs>1)?1:0, "abs_mvd_greater1_flag_ver");
+              }
 
               if (hor_abs_gr0) {
                 if (mvd_hor_abs > 1) {
-                        cabac_write_ep_ex_golomb(&cabac,mvd_hor_abs-2, 1);
-                      }
+                  cabac_write_ep_ex_golomb(&cabac,mvd_hor_abs-2, 1);
+                }
 
-                      CABAC_BIN_EP(&cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor");
-                    }
+                CABAC_BIN_EP(&cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor");
+              }
 
               if (ver_abs_gr0) {
                 if (mvd_ver_abs > 1) {
-                        cabac_write_ep_ex_golomb(&cabac,mvd_ver_abs-2, 1);
-                      }
-
-                      CABAC_BIN_EP(&cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver");
-                    }
-
-              // Inter reconstruction
-              inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
-                          y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
-                          encoder->in.cur_pic);
-
-              // Mark this block as "coded" (can be used for predictions..)
-              picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
+                  cabac_write_ep_ex_golomb(&cabac,mvd_ver_abs-2, 1);
                 }
 
+                CABAC_BIN_EP(&cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver");
+              }
+            }
+
             // Signal which candidate MV to use
             cabac_write_unary_max_symbol(&cabac, g_mvp_idx_model, cur_cu->inter.mv_ref, 1,
-                                         AMVP_MAX_NUM_CANDS - 1);
-              }
-            }
+                                        AMVP_MAX_NUM_CANDS - 1);
           }
+          }
+        } // for ref_list
+    } // if !merge
+
+
+    // Inter reconstruction
+    inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
+                y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
+                encoder->in.cur_pic);
+
+    // Mark this block as "coded" (can be used for predictions..)
+    picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
 
           
-          if (1) {
+    {
             pixel *base_y  = &encoder->in.cur_pic->y_data[x_ctb*(LCU_WIDTH>>(MAX_DEPTH))   + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH)))  *encoder->in.width];
             pixel *base_u = &encoder->in.cur_pic->u_data[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
             pixel *base_v = &encoder->in.cur_pic->v_data[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
-            uint32_t width = LCU_WIDTH>>depth;
+      uint32_t width = LCU_WIDTH>>depth;
 
-            /* INTRAPREDICTION VARIABLES */
-            int16_t pred[LCU_WIDTH*LCU_WIDTH+1];
-            int16_t predU[LCU_WIDTH*LCU_WIDTH>>2];
-            int16_t predV[LCU_WIDTH*LCU_WIDTH>>2];
+      /* INTRAPREDICTION VARIABLES */
+      int16_t pred[LCU_WIDTH*LCU_WIDTH+1];
+      int16_t predU[LCU_WIDTH*LCU_WIDTH>>2];
+      int16_t predV[LCU_WIDTH*LCU_WIDTH>>2];
 
             pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_ctb*(LCU_WIDTH>>(MAX_DEPTH))   + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH)))  *encoder->in.width];
             pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
             pixel *recbase_v = &encoder->in.cur_pic->v_recdata[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
 
-            /* TODO: dynamic memory allocation */
-            int16_t coeff_y[LCU_WIDTH*LCU_WIDTH*2];
-            int16_t coeff_u[LCU_WIDTH*LCU_WIDTH>>1];
-            int16_t coeff_v[LCU_WIDTH*LCU_WIDTH>>1];
-            int8_t residual = 0;
+      /* TODO: dynamic memory allocation */
+      int16_t coeff_y[LCU_WIDTH*LCU_WIDTH*2];
+      int16_t coeff_u[LCU_WIDTH*LCU_WIDTH>>1];
+      int16_t coeff_v[LCU_WIDTH*LCU_WIDTH>>1];
+      int8_t residual = 0;
 
-            /* Initialize helper structure for transform */
-            transform_info ti;
-            memset(&ti, 0, sizeof(transform_info));
+      /* Initialize helper structure for transform */
+      transform_info ti;
+      memset(&ti, 0, sizeof(transform_info));
 
-            ti.x_ctb = x_ctb; ti.y_ctb = y_ctb;
+      ti.x_ctb = x_ctb; ti.y_ctb = y_ctb;
 
-            /* Base pointers */
-            ti.base =  base_y; ti.base_u = base_u; ti.base_v = base_v;
-            ti.base_stride = encoder->in.width;
+      /* Base pointers */
+      ti.base =  base_y; ti.base_u = base_u; ti.base_v = base_v;
+      ti.base_stride = encoder->in.width;
 
-            // Prediction pointers
-            ti.pred =  pred; ti.pred_u = predU; ti.pred_v = predV;
-            ti.pred_stride = (LCU_WIDTH>>depth);
+      // Prediction pointers
+      ti.pred =  pred; ti.pred_u = predU; ti.pred_v = predV;
+      ti.pred_stride = (LCU_WIDTH>>depth);
 
-            // Reconstruction pointers
-            ti.recbase = recbase_y; ti.recbase_u = recbase_u; ti.recbase_v = recbase_v;
-            ti.recbase_stride = encoder->in.width;
+      // Reconstruction pointers
+      ti.recbase = recbase_y; ti.recbase_u = recbase_u; ti.recbase_v = recbase_v;
+      ti.recbase_stride = encoder->in.width;
 
-            // Coeff pointers
-            ti.coeff[0] = coeff_y; ti.coeff[1] = coeff_u; ti.coeff[2] = coeff_v;
-            ti.block_type = CU_INTER;
+      // Coeff pointers
+      ti.coeff[0] = coeff_y; ti.coeff[1] = coeff_u; ti.coeff[2] = coeff_v;
+      ti.block_type = CU_INTER;
        
-            // Handle transforms, quant and reconstruction
-            ti.idx = 0;
-            encode_transform_tree(encoder,&ti, depth);
+      // Handle transforms, quant and reconstruction
+      ti.idx = 0;
+      encode_transform_tree(encoder,&ti, depth);
 
-            // Coded block pattern
-            ti.cb_top[0] = (ti.cb[0] & 0x1 || ti.cb[1] & 0x1 || ti.cb[2] & 0x1 || ti.cb[3] & 0x1)?1:0;
-            ti.cb_top[1] = (ti.cb[0] & 0x2 || ti.cb[1] & 0x2 || ti.cb[2] & 0x2 || ti.cb[3] & 0x2)?1:0;
-            ti.cb_top[2] = (ti.cb[0] & 0x4 || ti.cb[1] & 0x4 || ti.cb[2] & 0x4 || ti.cb[3] & 0x4)?1:0;
+      // Coded block pattern
+      ti.cb_top[0] = (ti.cb[0] & 0x1 || ti.cb[1] & 0x1 || ti.cb[2] & 0x1 || ti.cb[3] & 0x1)?1:0;
+      ti.cb_top[1] = (ti.cb[0] & 0x2 || ti.cb[1] & 0x2 || ti.cb[2] & 0x2 || ti.cb[3] & 0x2)?1:0;
+      ti.cb_top[2] = (ti.cb[0] & 0x4 || ti.cb[1] & 0x4 || ti.cb[2] & 0x4 || ti.cb[3] & 0x4)?1:0;
 
-            residual = ti.cb_top[0] | ti.cb_top[1] | ti.cb_top[2];
-            if(depth == 0)  {
-              picture_set_block_residual(encoder->in.cur_pic,x_ctb    ,y_ctb    ,depth+1,ti.cb[0] & 0x1);
-              picture_set_block_residual(encoder->in.cur_pic,x_ctb + 4,y_ctb    ,depth+1,ti.cb[1] & 0x1);
-              picture_set_block_residual(encoder->in.cur_pic,x_ctb    ,y_ctb + 4,depth+1,ti.cb[2] & 0x1);
-              picture_set_block_residual(encoder->in.cur_pic,x_ctb + 4,y_ctb + 4,depth+1,ti.cb[3] & 0x1);
-            } else  {
-              picture_set_block_residual(encoder->in.cur_pic,x_ctb,y_ctb,depth,ti.cb_top[0]);
-            }
+      residual = ti.cb_top[0] | ti.cb_top[1] | ti.cb_top[2];
+      if(depth == 0)  {
+        picture_set_block_residual(encoder->in.cur_pic,x_ctb    ,y_ctb    ,depth+1,ti.cb[0] & 0x1);
+        picture_set_block_residual(encoder->in.cur_pic,x_ctb + 4,y_ctb    ,depth+1,ti.cb[1] & 0x1);
+        picture_set_block_residual(encoder->in.cur_pic,x_ctb    ,y_ctb + 4,depth+1,ti.cb[2] & 0x1);
+        picture_set_block_residual(encoder->in.cur_pic,x_ctb + 4,y_ctb + 4,depth+1,ti.cb[3] & 0x1);
+      } else  {
+        picture_set_block_residual(encoder->in.cur_pic,x_ctb,y_ctb,depth,ti.cb_top[0]);
+      }
             
 
-            cabac.ctx = &g_cu_qt_root_cbf_model;
-            CABAC_BIN(&cabac, residual, "rqt_root_cbf");
-            // Code (possible) coeffs to bitstream
-            ti.idx = 0;
-            if(residual) {
-              encode_transform_coeff(encoder, &ti,depth, 0);
-            }
-          }
-        }
+      cabac.ctx = &g_cu_qt_root_cbf_model;
+      CABAC_BIN(&cabac, residual, "rqt_root_cbf");
+      // Code (possible) coeffs to bitstream
+      ti.idx = 0;
+      if(residual) {
+        encode_transform_coeff(encoder, &ti,depth, 0);
+      }
+    }
+        
 
     // END for each part
   } else if (cur_cu->type == CU_INTRA) {

From 52335adda04180f3be95f66a256af4288beb9d2c Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 11 Oct 2013 11:59:10 +0300
Subject: [PATCH 02/19] Split merge candidate derivation to its own function

---
 src/inter.c | 56 ++++++++++++++++++++++++++++++++++-------------------
 src/inter.h |  2 ++
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/src/inter.c b/src/inter.c
index e2a26b3e..bbdc41a0 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -227,18 +227,21 @@ void inter_recon(picture* ref,int32_t xpos, int32_t ypos,int32_t width, const in
 }
 
 /**
- * \brief Get MV prediction for current block
+ * \brief Get merge candidates for current block
  * \param encoder encoder control struct to use
  * \param x_cu block x position in SCU
  * \param y_cu block y position in SCU
  * \param depth current block depth
- * \param mv_pred[2][2] 2x motion vector prediction
+ * \param b0 candidate b0
+ * \param b1 candidate b1
+ * \param b2 candidate b2
+ * \param a0 candidate a0
+ * \param a1 candidate a1
  */
-void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2])
+void inter_get_spatial_merge_candidates(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, 
+                                        cu_info **b0, cu_info **b1,cu_info **b2,cu_info **a0,cu_info **a1)
 {
   uint8_t cur_block_in_scu = (LCU_WIDTH>>depth) / CU_MIN_SIZE_PIXELS; //!< the width of the current block on SCU
-  uint8_t candidates = 0;
-  
   /*
   Predictor block locations
   ____      _______
@@ -248,37 +251,50 @@ void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int
    __|         |
   |A1|_________|
   |A0|
-  */
-  cu_info *b0, *b1, *b2, *a0, *a1;
-
-  b0 = b1 = b2 = a0 = a1 = NULL;
+  */ 
 
   // A0 and A1 availability testing
   if (x_cu != 0) {    
-    a1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-    if (!a1->coded) a1 = NULL;
+    *a1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+    if (!(*a1)->coded) *a1 = NULL;
 
     if (y_cu + cur_block_in_scu < encoder->in.height_in_lcu<<MAX_DEPTH) {
-      a0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-      if (!a0->coded) a0 = NULL;
+      *a0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if (!(*a0)->coded) *a0 = NULL;
     }
   }
 
   // B0, B1 and B2 availability testing
   if (y_cu != 0) {
-
     if (x_cu + cur_block_in_scu < encoder->in.width_in_lcu<<MAX_DEPTH) {
-      b0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-      if (!b0->coded) b0 = NULL;
+      *b0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if (!(*b0)->coded) *b0 = NULL;
     }
-    b1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-    if (!b1->coded) b1 = NULL;
+    *b1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+    if (!(*b1)->coded) *b1 = NULL;
 
     if (x_cu != 0) {
-      b2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-      if(!b2->coded) b2 = NULL;
+      *b2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if(!(*b2)->coded) *b2 = NULL;
     }
   }
+}
+
+/**
+ * \brief Get MV prediction for current block
+ * \param encoder encoder control struct to use
+ * \param x_cu block x position in SCU
+ * \param y_cu block y position in SCU
+ * \param depth current block depth
+ * \param mv_pred[2][2] 2x motion vector prediction
+ */
+void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2])
+{  
+  uint8_t candidates = 0;
+
+  cu_info *b0, *b1, *b2, *a0, *a1;
+  b0 = b1 = b2 = a0 = a1 = NULL;
+  inter_get_spatial_merge_candidates(encoder, x_cu, y_cu, depth, &b0, &b1, &b2, &a0, &a1);
 
   // Left predictors
   if (a0 && a0->type == CU_INTER) {
diff --git a/src/inter.h b/src/inter.h
index 58c35c89..4553f894 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -21,6 +21,8 @@
 void inter_set_block(picture* pic,uint32_t x_cu, uint32_t y_cu, uint8_t depth, cu_info *cur_cu);
 void inter_recon(picture *ref,int32_t xpos, int32_t ypos,int32_t width, const int16_t mv[2], picture* dst);
 
+void inter_get_spatial_merge_candidates(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, 
+                                        cu_info **b0, cu_info **b1,cu_info **b2,cu_info **a0,cu_info **a1);
 void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2]);
 
 #endif

From db266e74ff7afe0f0d563e2d4965b2e5d278164e Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 11 Oct 2013 16:12:04 +0300
Subject: [PATCH 03/19] Added merge mode selection (NOT WORKING!) and a
 function to get candidates

---
 src/encoder.c | 82 ++++++++++++++++++++++++-----------------------
 src/inter.c   | 88 +++++++++++++++++++++++++++++++++++++++++++++++++--
 src/inter.h   |  1 +
 3 files changed, 130 insertions(+), 41 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 142de191..88e7841a 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -741,26 +741,26 @@ void encode_slice_header(encoder_control* encoder)
     for (j = 0; j < ref_negative; j++) {
         WRITE_UE(encoder->stream, 0, "delta_poc_s0_minus1");
         WRITE_U(encoder->stream,1,1, "used_by_curr_pic_s0_flag");
-      }
-
-      //WRITE_UE(encoder->stream, 0, "short_term_ref_pic_set_idx");
     }
 
+    //WRITE_UE(encoder->stream, 0, "short_term_ref_pic_set_idx");
+  }
+
     //end if
   //end if
   if (encoder->sao_enable) {
       WRITE_U(encoder->stream, 1,1, "slice_sao_luma_flag");
       WRITE_U(encoder->stream, 0,1, "slice_sao_chroma_flag");
-    }
+  }
     
   if (encoder->in.cur_pic->slicetype != SLICE_I) {
       WRITE_U(encoder->stream, 0, 1, "num_ref_idx_active_override_flag");
-      WRITE_UE(encoder->stream, 0, "five_minus_max_num_merge_cand");
-    }
+      WRITE_UE(encoder->stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
+  }
 
   if (encoder->in.cur_pic->slicetype == SLICE_B) {
       WRITE_U(encoder->stream, 0, 1, "mvd_l1_zero_flag");
-    }
+  }
 
   // Skip flags that are not present
   // if !entropy_slice_flag
@@ -903,53 +903,57 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     // FOR each part
     // Mergeflag
     uint8_t merge_flag = 0;
-        cabac.ctx = &g_cu_merge_flag_ext_model;
+    int16_t unary_idx = 0;
+    int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
+    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);    
+    for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
+      if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
+         merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
+        //merge_flag = 1;
+        break;
+      }
+    }
+    cabac.ctx = &g_cu_merge_flag_ext_model;
     CABAC_BIN(&cabac, merge_flag, "MergeFlag");
 
     if (merge_flag) { //merge
-      // MergeIndex
-      int16_t unary_idx = 0; //pcCU->getMergeIndex( uiAbsPartIdx );
-      int16_t num_cand  = 0; //pcCU->getSlice()->getMaxNumMergeCand();
-          int32_t ui;
-
       if (num_cand > 1) {
+        int32_t ui;
         for (ui = 0; ui < num_cand - 1; ui++) {
-          int32_t symbol = (ui == unary_idx) ? 0 : 1;
+          int32_t symbol = (ui != unary_idx);
 
           if (ui == 0) {
                 cabac.ctx = &g_cu_merge_idx_ext_model;
                 CABAC_BIN(&cabac, symbol, "MergeIndex");
           } else {
                 CABAC_BIN_EP(&cabac,symbol,"MergeIndex");
-              }
-
-          if (symbol == 0) {
-                break;
-              }
-            }
           }
+
+          if (symbol == 0) break;
+        }
+      }
     } else {
       uint32_t ref_list_idx;
-          int16_t mv_cand[2][2];
+      int16_t mv_cand[2][2];
 
-          /*
-          // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
-          if(encoder->in.cur_pic->slicetype == SLICE_B)
-          {
-            // Code Inter Dir
-            const UInt uiInterDir = pcCU->getInterDir( uiAbsPartIdx ) - 1;
-            const UInt uiCtx      = pcCU->getCtxInterDir( uiAbsPartIdx );
-            ContextModel *pCtx    = m_cCUInterDirSCModel.get( 0 );
-            if (pcCU->getPartitionSize(uiAbsPartIdx) == SIZE_2Nx2N || pcCU->getHeight(uiAbsPartIdx) != 8 )
-            {
-              m_pcBinIf->encodeBin( uiInterDir == 2 ? 1 : 0, *( pCtx + uiCtx ) );
-            }
-            if (uiInterDir < 2)
-            {
-              m_pcBinIf->encodeBin( uiInterDir, *( pCtx + 4 ) );
-            }
-          }
-          */
+      /*
+      // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
+      if(encoder->in.cur_pic->slicetype == SLICE_B)
+      {
+        // Code Inter Dir
+        const UInt uiInterDir = pcCU->getInterDir( uiAbsPartIdx ) - 1;
+        const UInt uiCtx      = pcCU->getCtxInterDir( uiAbsPartIdx );
+        ContextModel *pCtx    = m_cCUInterDirSCModel.get( 0 );
+        if (pcCU->getPartitionSize(uiAbsPartIdx) == SIZE_2Nx2N || pcCU->getHeight(uiAbsPartIdx) != 8 )
+        {
+          m_pcBinIf->encodeBin( uiInterDir == 2 ? 1 : 0, *( pCtx + uiCtx ) );
+        }
+        if (uiInterDir < 2)
+        {
+          m_pcBinIf->encodeBin( uiInterDir, *( pCtx + 4 ) );
+        }
+      }
+      */
 
       for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
             //if(encoder->ref_idx_num[uiRefListIdx] > 0)
diff --git a/src/inter.c b/src/inter.c
index bbdc41a0..2e76b210 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -328,15 +328,99 @@ void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int
   }
 
 #if ENABLE_TEMPORAL_MVP
-  if(candidates < 2) {
+  if(candidates < AMVP_MAX_NUM_CANDS) {
     //TODO: add temporal mv predictor
   }
 #endif
 
   // Fill with (0,0)
-  while (candidates < 2) {
+  while (candidates < AMVP_MAX_NUM_CANDS) {
     mv_cand[candidates][0] = 0;
     mv_cand[candidates][1] = 0;
     candidates++;
   }
 }
+
+/**
+ * \brief Get merge predictions for current block
+ * \param encoder encoder control struct to use
+ * \param x_cu block x position in SCU
+ * \param y_cu block y position in SCU
+ * \param depth current block depth
+ * \param mv_cand[MRG_MAX_NUM_CANDS][2] output: merge candidate motion vectors (the number filled in is returned)
+ */
+uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][2])
+{  
+  uint8_t candidates = 0;
+  uint8_t i = 0;
+  int8_t duplicate = 0;
+
+  cu_info *b0, *b1, *b2, *a0, *a1;
+  b0 = b1 = b2 = a0 = a1 = NULL;
+  inter_get_spatial_merge_candidates(encoder, x_cu, y_cu, depth, &b0, &b1, &b2, &a0, &a1);
+
+#define CHECK_DUPLICATE(X,Y) {duplicate = 0; for(i = 0; i < candidates; i++) { \
+                                               if(mv_cand[i][0] == (X) && mv_cand[i][1] == (Y)) { \
+                                               duplicate = 1; break; } }}
+
+  if (a1 && a1->type == CU_INTER) {
+      mv_cand[candidates][0] = a1->inter.mv[0];
+      mv_cand[candidates][1] = a1->inter.mv[1];
+      candidates++;
+  }
+
+  if (b1 && b1->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(b1->inter.mv[0],b1->inter.mv[1]);
+    if(!duplicate) {
+      mv_cand[candidates][0] = b1->inter.mv[0];
+      mv_cand[candidates][1] = b1->inter.mv[1];
+      candidates++;
+    }
+  }
+
+  if (b0 && b0->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(b0->inter.mv[0],b0->inter.mv[1]);
+    if(!duplicate) {
+      mv_cand[candidates][0] = b0->inter.mv[0];
+      mv_cand[candidates][1] = b0->inter.mv[1];
+      candidates++;
+    }
+  }
+
+  if (a0 && a0->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(a0->inter.mv[0],a0->inter.mv[1]);
+    if(!duplicate) {
+      mv_cand[candidates][0] = a0->inter.mv[0];
+      mv_cand[candidates][1] = a0->inter.mv[1];
+      candidates++;
+    }
+  }
+
+  if(b2 && b2->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(b2->inter.mv[0],b2->inter.mv[1]);
+    if(!duplicate) {
+      mv_cand[candidates][0] = b2->inter.mv[0];
+      mv_cand[candidates][1] = b2->inter.mv[1];
+      candidates++;
+    }
+  }
+
+
+#if ENABLE_TEMPORAL_MVP
+  if(candidates < AMVP_MAX_NUM_CANDS) {
+    //TODO: add temporal mv predictor
+  }
+#endif
+
+  // Fill with (0,0)
+  /*
+  i = candidates;
+  while (candidates < MRG_MAX_NUM_CANDS) {
+    mv_cand[candidates][0] = 0;
+    mv_cand[candidates][1] = 0;
+    candidates++;
+  }
+  */
+  return candidates;
+}
+
diff --git a/src/inter.h b/src/inter.h
index 4553f894..e3c7f4a6 100644
--- a/src/inter.h
+++ b/src/inter.h
@@ -24,5 +24,6 @@ void inter_recon(picture *ref,int32_t xpos, int32_t ypos,int32_t width, const in
 void inter_get_spatial_merge_candidates(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, 
                                         cu_info **b0, cu_info **b1,cu_info **b2,cu_info **a0,cu_info **a1);
 void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2]);
+uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][2]);
 
 #endif

From d9e6d8413daab40d19638f1019d7d661ee446c27 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 15 Oct 2013 17:56:50 +0300
Subject: [PATCH 04/19] Added coeff data to picture-struct

---
 src/encmain.c | 7 +++++++
 src/picture.c | 5 +++++
 src/picture.h | 2 ++
 3 files changed, 14 insertions(+)

diff --git a/src/encmain.c b/src/encmain.c
index 0bd39731..7e6b038d 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -154,6 +154,9 @@ int main(int argc, char *argv[])
 
   init_encoder_input(&encoder->in, input, cfg->width, cfg->height);
 
+  // Init coeff data table
+  encoder->in.cur_pic->coeff = MALLOC(coefficient, cfg->width * cfg->height);
+
   // Start coding cycle while data on input and not on the last frame
   while(!feof(input) && (!cfg->frames || encoder->frame < cfg->frames)) {
     int32_t diff;
@@ -202,6 +205,10 @@ int main(int argc, char *argv[])
     // TODO: reuse memory from old reference
     encoder->in.cur_pic = picture_init(encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu);
 
+    // Copy pointer from the last cur_pic because we don't want to reallocate it
+    encoder->in.cur_pic->coeff = encoder->ref->pics[0]->coeff;
+    encoder->ref->pics[0]->coeff = NULL;
+
     encoder->frame++;
   }
   // Coding finished
diff --git a/src/picture.c b/src/picture.c
index 9b77d23c..5e89aa7b 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -244,6 +244,8 @@ picture *picture_init(int32_t width, int32_t height,
     memset(pic->cu_array[i], 0, sizeof(cu_info) * cu_array_size);
   }
 
+  pic->coeff = NULL;
+
   return pic;
 }
 
@@ -275,6 +277,9 @@ int picture_destroy(picture *pic)
   free(pic->cu_array);
   pic->cu_array = NULL;
 
+  free(pic->coeff);
+  pic->coeff = NULL;
+
   return 1;
 }
 
diff --git a/src/picture.h b/src/picture.h
index 764d621c..9cf4d65d 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -73,6 +73,8 @@ typedef struct
   pixel* u_recdata;     // \brief Pointer to reconstructed U-data.
   pixel* v_recdata;     // \brief Pointer to reconstructed V-data.
 
+  coefficient* coeff;   //!< \brief coefficient pointer
+
   int32_t width;          // \brief Luma pixel array width.
   int32_t height;         // \brief Luma pixel array height.
   int32_t height_in_lcu;  // \brief Picture width in number of LCU's.

From d236d58981fdb61d18d085731c2c072389997af6 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Thu, 17 Oct 2013 15:14:22 +0300
Subject: [PATCH 05/19] Added more data to cu_info and renamed "residual" to
 "coeff_y/u/v" in the struct

---
 src/filter.c  |  4 ++--
 src/picture.c |  6 +++---
 src/picture.h | 40 ++++++++++++++++++++++------------------
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/filter.c b/src/filter.c
index 49883a41..e2e6c04f 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -198,8 +198,8 @@ void filter_deblock_edge_luma(encoder_control *encoder,
         // Intra blocks have strength 2
         if(cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
           strength = 2;          
-          // Non-zero residual and transform boundary
-        } else if(cu_q->residual || cu_p->residual) {
+          // Non-zero residual/coeffs and transform boundary
+        } else if(cu_q->coeff_y || cu_p->coeff_y) {
           strength = 1;
           // Absolute motion vector diff between blocks >= 1 (Integer pixel)
         } else if((abs(cu_q->inter.mv[0] - cu_p->inter.mv[0]) >= 4) || (abs(cu_q->inter.mv[1] - cu_p->inter.mv[1]) >= 4)) {
diff --git a/src/picture.c b/src/picture.c
index 5e89aa7b..1253a083 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -26,10 +26,10 @@
  * \param x_scu  x SCU position (smallest CU)
  * \param y_scu  y SCU position (smallest CU)
  * \param depth  current CU depth
- * \param residual  residual status
+ * \param coeff_y  residual status
  */
 void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
-                                uint8_t depth, int8_t residual)
+                                uint8_t depth, int8_t coeff_y)
 {
   uint32_t x, y;
   int width_in_scu = pic->width_in_lcu << MAX_DEPTH;
@@ -38,7 +38,7 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
   for (y = y_scu; y < y_scu + block_scu_width; ++y) {
     int cu_row = y * width_in_scu;
     for (x = x_scu; x < x_scu + block_scu_width; ++x) {
-      pic->cu_array[MAX_DEPTH][cu_row + x].residual = residual;
+      pic->cu_array[MAX_DEPTH][cu_row + x].coeff_y = coeff_y;
     }
   }
 }
diff --git a/src/picture.h b/src/picture.h
index 9cf4d65d..12088a89 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -52,10 +52,14 @@ typedef struct
  */
 typedef struct
 {  
-  int8_t type;
-  int8_t depth;
-  int8_t coded;
-  int8_t residual;
+  int8_t type;       //!< \brief block type, CU_INTER / CU_INTRA
+  int8_t depth;      //!< \brief depth / size of this block
+  int8_t part_size;  //!< \brief Currently only 2Nx2N, TODO: AMP/SMP/NxN parts
+  int8_t tr_depth;   //!< \brief transform depth
+  int8_t coded;      //!< \brief flag to indicate this block is coded and reconstructed
+  int8_t coeff_y;    //!< \brief is there coded coeffs Y
+  int8_t coeff_u;    //!< \brief is there coded coeffs U
+  int8_t coeff_v;    //!< \brief is there coded coeffs V
   cu_info_intra intra;
   cu_info_inter inter;
 } cu_info;
@@ -65,22 +69,22 @@ typedef struct
  */
 typedef struct
 {
-  pixel* y_data;        // \brief Pointer to luma pixel array.
-  pixel* u_data;        // \brief Pointer to chroma U pixel array.
-  pixel* v_data;        // \brief Pointer to chroma V pixel array.
+  pixel* y_data;        //!< \brief Pointer to luma pixel array.
+  pixel* u_data;        //!< \brief Pointer to chroma U pixel array.
+  pixel* v_data;        //!< \brief Pointer to chroma V pixel array.
 
-  pixel* y_recdata;     // \brief Pointer to reconstructed Y-data.
-  pixel* u_recdata;     // \brief Pointer to reconstructed U-data.
-  pixel* v_recdata;     // \brief Pointer to reconstructed V-data.
+  pixel* y_recdata;     //!< \brief Pointer to reconstructed Y-data.
+  pixel* u_recdata;     //!< \brief Pointer to reconstructed U-data.
+  pixel* v_recdata;     //!< \brief Pointer to reconstructed V-data.
 
   coefficient* coeff;   //!< \brief coefficient pointer
 
-  int32_t width;          // \brief Luma pixel array width.
-  int32_t height;         // \brief Luma pixel array height.
-  int32_t height_in_lcu;  // \brief Picture width in number of LCU's.
-  int32_t width_in_lcu;   // \brief Picture height in number of LCU's.
-  uint8_t referenced;     // \brief Whether this picture is referenced.
-  cu_info** cu_array;           // \brief Info for each CU at each depth.
+  int32_t width;          //!< \brief Luma pixel array width.
+  int32_t height;         //!< \brief Luma pixel array height.
+  int32_t height_in_lcu;  //!< \brief Picture height in number of LCU's.
+  int32_t width_in_lcu;   //!< \brief Picture width in number of LCU's.
+  uint8_t referenced;     //!< \brief Whether this picture is referenced.
+  cu_info** cu_array;     //!< \brief Info for each CU at each depth.
   uint8_t type;
   uint8_t slicetype;
 } picture;
@@ -90,8 +94,8 @@ typedef struct
  */
 typedef struct
 {
-  picture** pics;          // \brief Pointer to array of picture pointers.
-  unsigned int size;       // \brief Array size.
+  picture** pics;          //!< \brief Pointer to array of picture pointers.
+  unsigned int size;       //!< \brief Array size.
   unsigned int used_size;
 } picture_list;
 

From dda53f48a7f70fd79047e5b9d03cdf429af8c828 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 18 Oct 2013 11:39:13 +0300
Subject: [PATCH 06/19] Refactoring encoder transform/quant related functions,
 cu_info and picture

---
 src/encmain.c |  18 ++-
 src/encoder.c | 368 ++++++++++++++++----------------------------------
 src/encoder.h |  37 +----
 src/global.h  |   1 +
 src/picture.c |  34 +++--
 src/picture.h |  16 ++-
 6 files changed, 167 insertions(+), 307 deletions(-)

diff --git a/src/encmain.c b/src/encmain.c
index 7e6b038d..d64f8860 100644
--- a/src/encmain.c
+++ b/src/encmain.c
@@ -155,7 +155,14 @@ int main(int argc, char *argv[])
   init_encoder_input(&encoder->in, input, cfg->width, cfg->height);
 
   // Init coeff data table
-  encoder->in.cur_pic->coeff = MALLOC(coefficient, cfg->width * cfg->height);
+  encoder->in.cur_pic->coeff_y = MALLOC(coefficient, cfg->width * cfg->height);
+  encoder->in.cur_pic->coeff_u = MALLOC(coefficient, (cfg->width * cfg->height) >> 2);
+  encoder->in.cur_pic->coeff_v = MALLOC(coefficient, (cfg->width * cfg->height) >> 2);
+
+  // Init predicted data table
+  encoder->in.cur_pic->pred_y = MALLOC(pixel, cfg->width * cfg->height);
+  encoder->in.cur_pic->pred_u = MALLOC(pixel, (cfg->width * cfg->height) >> 2);
+  encoder->in.cur_pic->pred_v = MALLOC(pixel, (cfg->width * cfg->height) >> 2);
 
   // Start coding cycle while data on input and not on the last frame
   while(!feof(input) && (!cfg->frames || encoder->frame < cfg->frames)) {
@@ -206,8 +213,13 @@ int main(int argc, char *argv[])
     encoder->in.cur_pic = picture_init(encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu);
 
     // Copy pointer from the last cur_pic because we don't want to reallocate it
-    encoder->in.cur_pic->coeff = encoder->ref->pics[0]->coeff;
-    encoder->ref->pics[0]->coeff = NULL;
+    MOVE_POINTER(encoder->in.cur_pic->coeff_y,encoder->ref->pics[0]->coeff_y);
+    MOVE_POINTER(encoder->in.cur_pic->coeff_u,encoder->ref->pics[0]->coeff_u);
+    MOVE_POINTER(encoder->in.cur_pic->coeff_v,encoder->ref->pics[0]->coeff_v);
+    
+    MOVE_POINTER(encoder->in.cur_pic->pred_y,encoder->ref->pics[0]->pred_y);
+    MOVE_POINTER(encoder->in.cur_pic->pred_u,encoder->ref->pics[0]->pred_u);
+    MOVE_POINTER(encoder->in.cur_pic->pred_v,encoder->ref->pics[0]->pred_v);
 
     encoder->frame++;
   }
diff --git a/src/encoder.c b/src/encoder.c
index 88e7841a..507e7d4b 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -887,7 +887,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
   // Prediction mode
   if (encoder->in.cur_pic->slicetype != SLICE_I) {
     cabac.ctx = &g_cu_pred_mode_model;
-    CABAC_BIN(&cabac, (cur_cu->type == CU_INTRA) ? 1 : 0, "PredMode");
+    CABAC_BIN(&cabac, (cur_cu->type == CU_INTRA), "PredMode");
   }
 
   // Signal PartSize on max depth
@@ -1061,79 +1061,20 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     // Mark this block as "coded" (can be used for predictions..)
     picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
 
-          
-    {
-            pixel *base_y  = &encoder->in.cur_pic->y_data[x_ctb*(LCU_WIDTH>>(MAX_DEPTH))   + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH)))  *encoder->in.width];
-            pixel *base_u = &encoder->in.cur_pic->u_data[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
-            pixel *base_v = &encoder->in.cur_pic->v_data[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
-      uint32_t width = LCU_WIDTH>>depth;
-
-      /* INTRAPREDICTION VARIABLES */
-      int16_t pred[LCU_WIDTH*LCU_WIDTH+1];
-      int16_t predU[LCU_WIDTH*LCU_WIDTH>>2];
-      int16_t predV[LCU_WIDTH*LCU_WIDTH>>2];
-
-            pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_ctb*(LCU_WIDTH>>(MAX_DEPTH))   + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH)))  *encoder->in.width];
-            pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
-            pixel *recbase_v = &encoder->in.cur_pic->v_recdata[x_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)) + (y_ctb*(LCU_WIDTH>>(MAX_DEPTH+1)))*(encoder->in.width>>1)];
-
-      /* TODO: dynamic memory allocation */
-      int16_t coeff_y[LCU_WIDTH*LCU_WIDTH*2];
-      int16_t coeff_u[LCU_WIDTH*LCU_WIDTH>>1];
-      int16_t coeff_v[LCU_WIDTH*LCU_WIDTH>>1];
-      int8_t residual = 0;
-
-      /* Initialize helper structure for transform */
-      transform_info ti;
-      memset(&ti, 0, sizeof(transform_info));
-
-      ti.x_ctb = x_ctb; ti.y_ctb = y_ctb;
-
-      /* Base pointers */
-      ti.base =  base_y; ti.base_u = base_u; ti.base_v = base_v;
-      ti.base_stride = encoder->in.width;
-
-      // Prediction pointers
-      ti.pred =  pred; ti.pred_u = predU; ti.pred_v = predV;
-      ti.pred_stride = (LCU_WIDTH>>depth);
-
-      // Reconstruction pointers
-      ti.recbase = recbase_y; ti.recbase_u = recbase_u; ti.recbase_v = recbase_v;
-      ti.recbase_stride = encoder->in.width;
-
-      // Coeff pointers
-      ti.coeff[0] = coeff_y; ti.coeff[1] = coeff_u; ti.coeff[2] = coeff_v;
-      ti.block_type = CU_INTER;
-       
-      // Handle transforms, quant and reconstruction
-      ti.idx = 0;
-      encode_transform_tree(encoder,&ti, depth);
-
-      // Coded block pattern
-      ti.cb_top[0] = (ti.cb[0] & 0x1 || ti.cb[1] & 0x1 || ti.cb[2] & 0x1 || ti.cb[3] & 0x1)?1:0;
-      ti.cb_top[1] = (ti.cb[0] & 0x2 || ti.cb[1] & 0x2 || ti.cb[2] & 0x2 || ti.cb[3] & 0x2)?1:0;
-      ti.cb_top[2] = (ti.cb[0] & 0x4 || ti.cb[1] & 0x4 || ti.cb[2] & 0x4 || ti.cb[3] & 0x4)?1:0;
-
-      residual = ti.cb_top[0] | ti.cb_top[1] | ti.cb_top[2];
-      if(depth == 0)  {
-        picture_set_block_residual(encoder->in.cur_pic,x_ctb    ,y_ctb    ,depth+1,ti.cb[0] & 0x1);
-        picture_set_block_residual(encoder->in.cur_pic,x_ctb + 4,y_ctb    ,depth+1,ti.cb[1] & 0x1);
-        picture_set_block_residual(encoder->in.cur_pic,x_ctb    ,y_ctb + 4,depth+1,ti.cb[2] & 0x1);
-        picture_set_block_residual(encoder->in.cur_pic,x_ctb + 4,y_ctb + 4,depth+1,ti.cb[3] & 0x1);
-      } else  {
-        picture_set_block_residual(encoder->in.cur_pic,x_ctb,y_ctb,depth,ti.cb_top[0]);
-      }
-            
+    encode_transform_tree(encoder,x_ctb, y_ctb, depth);
 
+    // Only need to signal coded block flag if not skipped or merged
+    // skip = no coded residual, merge = coded residual
+    if (!cur_cu->merged) {
       cabac.ctx = &g_cu_qt_root_cbf_model;
-      CABAC_BIN(&cabac, residual, "rqt_root_cbf");
-      // Code (possible) coeffs to bitstream
-      ti.idx = 0;
-      if(residual) {
-        encode_transform_coeff(encoder, &ti,depth, 0);
-      }
+      CABAC_BIN(&cabac, cur_cu->coeff_y | cur_cu->coeff_u | cur_cu->coeff_v, "rqt_root_cbf");
     }
-        
+    // Code (possible) coeffs to bitstream
+     
+    if(cur_cu->coeff_y | cur_cu->coeff_u | cur_cu->coeff_v) {
+      encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0);
+    }
+
 
     // END for each part
   } else if (cur_cu->type == CU_INTRA) {
@@ -1149,9 +1090,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     uint32_t width = LCU_WIDTH>>depth;
 
     // INTRAPREDICTION VARIABLES
-    int16_t pred_y[LCU_WIDTH * LCU_WIDTH + 1];
-    int16_t pred_u[LCU_WIDTH * LCU_WIDTH >> 2];
-    int16_t pred_v[LCU_WIDTH * LCU_WIDTH >> 2];
+    int16_t pred_y[LCU_WIDTH * LCU_WIDTH];
 
     pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))     + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
     pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
@@ -1251,8 +1190,8 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
           if (intra_pred_mode == allowed_chroma_dir[i]) {
             allowed_chroma_dir[i] = 34; /* VER+8 mode */
               break;
-            }
           }
+        }
 
         for (i = 0; i < 4; i++) {
           if (intra_pred_mode_chroma_temp == allowed_chroma_dir[i]) {
@@ -1269,54 +1208,8 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     
     // Coeff
     // Transform tree
-      {
-        // TODO: dynamic memory allocation
-        int16_t coeff_y[LCU_WIDTH * LCU_WIDTH * 2];
-        int16_t coeff_u[LCU_WIDTH * LCU_WIDTH >> 1];
-        int16_t coeff_v[LCU_WIDTH * LCU_WIDTH >> 1];
-
-        // Initialize helper structure for transform
-        transform_info ti;
-        memset(&ti, 0, sizeof(transform_info));
-
-        ti.x_ctb = x_ctb; ti.y_ctb = y_ctb;
-
-        // Base pointers
-        ti.base =  base_y; ti.base_u = base_u; ti.base_v = base_v;
-        ti.base_stride = encoder->in.width;
-
-        // Prediction pointers
-        ti.pred =  pred_y; ti.pred_u = pred_u; ti.pred_v = pred_v;
-        ti.pred_stride = (LCU_WIDTH>>depth);
-
-        // Reconstruction pointers
-        ti.recbase = recbase_y; ti.recbase_u = recbase_u; ti.recbase_v = recbase_v;
-        ti.recbase_stride = encoder->in.width;
-
-        // Coeff pointers
-        ti.coeff[0] = coeff_y; ti.coeff[1] = coeff_u; ti.coeff[2] = coeff_v;
-
-        // Prediction info
-        ti.intra_pred_mode = intra_pred_mode;
-        ti.intra_pred_mode_chroma = intra_pred_mode_chroma;
-        
-        // Handle transforms, quant and reconstruction
-        ti.idx = 0;
-        ti.block_type = CU_INTRA;
-        encode_transform_tree(encoder,&ti, depth);
-
-        // Coded block pattern
-        ti.cb_top[0] = (ti.cb[0] & 0x1 || ti.cb[1] & 0x1 || ti.cb[2] & 0x1
-                        || ti.cb[3] & 0x1) ? 1 : 0;
-        ti.cb_top[1] = (ti.cb[0] & 0x2 || ti.cb[1] & 0x2 || ti.cb[2] & 0x2
-                        || ti.cb[3] & 0x2) ? 1 : 0;
-        ti.cb_top[2] = (ti.cb[0] & 0x4 || ti.cb[1] & 0x4 || ti.cb[2] & 0x4
-                        || ti.cb[3] & 0x4) ? 1 : 0;
-        
-        // Code (possible) coeffs to bitstream
-        ti.idx = 0;
-        encode_transform_coeff(encoder, &ti,depth, 0);
-      }
+    encode_transform_tree(encoder, x_ctb, y_ctb, depth);
+    encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0);
     // end Transform tree
     // end Coeff
 
@@ -1374,80 +1267,44 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
   
 }
 
-void encode_transform_tree(encoder_control *encoder, transform_info *ti,
-                           uint8_t depth)
+void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu, uint8_t depth)
 {
   // we have 64>>depth transform size
   int x,y,i;
   int32_t width = LCU_WIDTH>>depth;
-
-  if (depth == 0) { // Split 64x64
-    // Prepare for multi-level splitting
-    ti->split[ti->idx] = 1<<depth;
-  }
+  cu_info *cur_cu = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
 
   // Split transform and increase depth
-  if (ti->split[ti->idx] & (1 << depth)) {
-    uint8_t change = 1<<(MAX_DEPTH-1-depth);    
-    ti->idx = 0; encode_transform_tree(encoder,ti,depth+1);
-    ti->x_ctb += change;
-    ti->idx = 1; encode_transform_tree(encoder,ti,depth+1);
-    ti->x_ctb -= change; ti->y_ctb += change;
-    ti->idx = 2; encode_transform_tree(encoder,ti,depth+1);
-    ti->x_ctb += change;
-    ti->idx = 3; encode_transform_tree(encoder,ti,depth+1);
+  if (depth == 0 || cur_cu->tr_depth > depth) {
+    uint8_t offset = 1<<(MAX_DEPTH-1-depth);
+    encode_transform_tree(encoder, x_cu, y_cu, depth+1);
+    encode_transform_tree(encoder, x_cu + offset, y_cu, depth+1);
+    encode_transform_tree(encoder, x_cu, y_cu + offset, depth+1);
+    encode_transform_tree(encoder, x_cu + offset, y_cu + offset, depth+1);
     return;
   }
   
   {
-    uint8_t cb_y = 0, cb_u = 0, cb_v = 0;
-    int32_t coeff_fourth = ((LCU_WIDTH>>(depth))*(LCU_WIDTH>>(depth)))+1;
+    // INTRAPREDICTION VARIABLES
+    pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+    pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    pixel *recbase_v = &encoder->in.cur_pic->v_recdata[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];    
+    int32_t recbase_stride = encoder->in.width;
 
-    int32_t base_stride    = ti->base_stride;
-    int32_t recbase_stride = ti->recbase_stride;
-    int32_t pred_stride    = ti->pred_stride;
+    pixel *base_y    = &encoder->in.cur_pic->y_data[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+    pixel *base_u    = &encoder->in.cur_pic->u_data[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    pixel *base_v    = &encoder->in.cur_pic->v_data[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];    
+    int32_t base_stride = encoder->in.width;
 
-    int32_t recbase_offset[4] = {
-      0, width, ti->recbase_stride * width,
-      ti->recbase_stride * width + width
-    };
-    int32_t base_offset[4]    = {
-      0, width, ti->base_stride * width,
-      ti->base_stride * width + width 
-    };
-    int32_t pred_offset[4]    = {
-      0, width, ti->pred_stride * width, 
-      ti->pred_stride * width + width
-    };
-    
-    int32_t recbase_offset_c[4] = {
-      0, width >> 1, (ti->recbase_stride >> 1) * (width >> 1),
-      (ti->recbase_stride >> 1) *(width >> 1) + (width >> 1) 
-    };
-    int32_t base_offset_c[4]    = {
-      0, width >> 1, (ti->base_stride >> 1) * (width >> 1),
-      (ti->base_stride >> 1) * (width >> 1)   + (width >> 1) 
-    };
-    int32_t pred_offset_c[4]    = {
-      0, width >> 1, (ti->pred_stride >> 1) * (width >> 1), 
-      (ti->pred_stride >> 1) * (width >> 1)   + (width >> 1) 
-    };
-    
-    pixel *base_y    = &ti->base[base_offset[ti->idx]];
-    pixel *base_u    = &ti->base_u[base_offset_c[ti->idx]];
-    pixel *base_v    = &ti->base_v[base_offset_c[ti->idx]];
-    
-    pixel *recbase_y = &ti->recbase[recbase_offset[ti->idx]];
-    pixel *recbase_u = &ti->recbase_u[recbase_offset_c[ti->idx]];
-    pixel *recbase_v = &ti->recbase_v[recbase_offset_c[ti->idx]];
-    
-    int16_t *pred_y    = &ti->pred[pred_offset[ti->idx]];
-    int16_t *pred_u    = &ti->pred_u[pred_offset_c[ti->idx]];
-    int16_t *pred_v    = &ti->pred_v[pred_offset_c[ti->idx]];
-    
-    int16_t *coeff_y   = &ti->coeff[0][ti->idx * coeff_fourth];
-    int16_t *coeff_u   = &ti->coeff[1][ti->idx * coeff_fourth >> 1];
-    int16_t *coeff_v   = &ti->coeff[2][ti->idx * coeff_fourth >> 1];
+    pixel *pred_y    = &encoder->in.cur_pic->pred_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+    pixel *pred_u    = &encoder->in.cur_pic->pred_u[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    pixel *pred_v    = &encoder->in.cur_pic->pred_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    int32_t pred_stride = encoder->in.width;
+
+    int16_t *coeff_y   = &encoder->in.cur_pic->coeff_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+    int16_t *coeff_u   = &encoder->in.cur_pic->coeff_u[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    int16_t *coeff_v   = &encoder->in.cur_pic->coeff_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    int32_t coeff_stride = encoder->in.width;
 
     // Quant and transform here...
     int16_t block[LCU_WIDTH*LCU_WIDTH>>2];
@@ -1461,7 +1318,7 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
 
     uint32_t ac_sum = 0;
     uint32_t ctx_idx;
-    uint32_t scan_idx_luma = SCAN_DIAG;
+    uint32_t scan_idx_luma   = SCAN_DIAG;
     uint32_t scan_idx_chroma = SCAN_DIAG;
     uint8_t dir_mode;
     #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
@@ -1478,20 +1335,21 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
       default: ctx_idx = 0; break;
     }
 
-    if(ti->block_type == CU_INTRA)
+    if(cur_cu->type == CU_INTRA)
     {
       //if multiple scans supported for transform size
       if (ctx_idx > 3 && ctx_idx < 6) {
-        scan_idx_luma = abs((int32_t) ti->intra_pred_mode - 26) < 5 ? 1 : (abs((int32_t)ti->intra_pred_mode - 10) < 5 ? 2 : 0);
+        scan_idx_luma = abs((int32_t) cur_cu->intra.mode - 26) < 5 ? 1 : (abs((int32_t)cur_cu->intra.mode - 10) < 5 ? 2 : 0);
       }
-
+      // TODO : chroma intra prediction
+      cur_cu->intra.mode_chroma = 36;
       // Chroma scanmode
       ctx_idx++;
-      dir_mode = ti->intra_pred_mode_chroma;
+      dir_mode = cur_cu->intra.mode_chroma; 
 
       if (dir_mode == 36) {
         // TODO: support NxN
-        dir_mode = ti->intra_pred_mode;
+        dir_mode = cur_cu->intra.mode;
       }
 
       if (ctx_idx > 4 && ctx_idx < 7) { // if multiple scans supported for transform size
@@ -1499,51 +1357,52 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
       }
 
       // Build reconstructed block to use in prediction with extrapolated borders
-      intra_build_reference_border(encoder->in.cur_pic, ti->x_ctb, ti->y_ctb,
+      intra_build_reference_border(encoder->in.cur_pic, x_cu, y_cu,
                                    (LCU_WIDTH >> (depth)) * 2 + 8, rec, (LCU_WIDTH >> (depth)) * 2 + 8, 0);
       intra_recon(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8,
-                  ti->x_ctb * (LCU_WIDTH >> (MAX_DEPTH)), ti->y_ctb * (LCU_WIDTH >> (MAX_DEPTH)),
-                  width, pred_y, pred_stride, ti->intra_pred_mode, 0);
+                  x_cu * (LCU_WIDTH >> (MAX_DEPTH)), y_cu * (LCU_WIDTH >> (MAX_DEPTH)),
+                  width, pred_y, pred_stride, cur_cu->intra.mode, 0);
 
       // Filter DC-prediction
-      if (ti->intra_pred_mode == 1 && width < 32) {
+      if (cur_cu->intra.mode == 1 && width < 32) {
         intra_dc_pred_filtering(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8, pred_y,
                                 width, LCU_WIDTH >> depth, LCU_WIDTH >> depth);
       }
-
-      if (ti->intra_pred_mode_chroma != 36
-          && ti->intra_pred_mode_chroma == ti->intra_pred_mode) {
-          ti->intra_pred_mode_chroma = 36;
-        }
+      
+      // TODO : chroma intra prediction
+      if (cur_cu->intra.mode_chroma != 36
+          && cur_cu->intra.mode_chroma == cur_cu->intra.mode) {
+          cur_cu->intra.mode_chroma = 36;
+      }
     
-      intra_build_reference_border(encoder->in.cur_pic, ti->x_ctb, ti->y_ctb,
+      intra_build_reference_border(encoder->in.cur_pic, x_cu, y_cu,
                                    (LCU_WIDTH >> (depth + 1)) * 2 + 8, rec,
                                    (LCU_WIDTH >> (depth + 1)) * 2 + 8,
                                    1);
       intra_recon(rec_shift_u, 
                   (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                  ti->x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
-                  ti->y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                  x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                  y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
                   width >> 1,
                   pred_u,
                   pred_stride >> 1,
-                  ti->intra_pred_mode_chroma != 36 ? ti->intra_pred_mode_chroma : ti->intra_pred_mode,
+                  cur_cu->intra.mode_chroma != 36 ? cur_cu->intra.mode_chroma : cur_cu->intra.mode,
                   1);
-      intra_build_reference_border(encoder->in.cur_pic, ti->x_ctb, ti->y_ctb,
+      intra_build_reference_border(encoder->in.cur_pic, x_cu, y_cu,
                                    (LCU_WIDTH >> (depth + 1)) * 2 + 8,
                                    rec, (LCU_WIDTH >> (depth + 1)) * 2 + 8,
                                    2);
       intra_recon(rec_shift_u, (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                  ti->x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
-                  ti->y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                  x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                  y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
                   width >> 1,
                   pred_v,
                   pred_stride >> 1,
-                  ti->intra_pred_mode_chroma != 36 ? ti->intra_pred_mode_chroma : ti->intra_pred_mode,
+                  cur_cu->intra.mode_chroma != 36 ? cur_cu->intra.mode_chroma : cur_cu->intra.mode,
                   1);
 
     // This affects reconstruction, do after that
-      picture_set_block_coded(encoder->in.cur_pic, ti->x_ctb, ti->y_ctb, depth, 1);
+      picture_set_block_coded(encoder->in.cur_pic, x_cu, y_cu, depth, 1);
     } else  { // Inter mode
       for(y = 0; y < LCU_WIDTH>>depth; y++) {
         for(x = 0; x < LCU_WIDTH>>depth; x++) {
@@ -1582,21 +1441,21 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
 
     // Transform and quant residual to coeffs
     transform2d(block,pre_quant_coeff,width,0);
-    quant(encoder, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, ti->block_type);
+    quant(encoder, pre_quant_coeff, coeff_y, width, width, &ac_sum, 0, scan_idx_luma, cur_cu->type);
 
     // Check for non-zero coeffs
     for (i = 0; i < width * width; i++) {
       if (coeff_y[i] != 0) {
         // Found one, we can break here
-        cb_y = 1;
+        cur_cu->coeff_y = 1;
         break;
       }
     }
         
     // if non-zero coeffs
-    if (cb_y) {
+    if (cur_cu->coeff_y) {
       // RECONSTRUCT for predictions
-      dequant(encoder, coeff_y, pre_quant_coeff, width, width, 0, ti->block_type);
+      dequant(encoder, coeff_y, pre_quant_coeff, width, width, 0, cur_cu->type);
       itransform2d(block,pre_quant_coeff,width,0);
 
       i = 0;
@@ -1614,9 +1473,9 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
       for (y = 0; y < LCU_WIDTH >> depth; y++) {
         for (x = 0; x < LCU_WIDTH >> depth; x++) {
           recbase_y[x + y * recbase_stride] = (uint8_t)CLIP(0, 255, pred_y[x + y * pred_stride]);
-    }
         }
       }
+    }
 
     if (encoder->in.video_format != FORMAT_400) {
       // Chroma U
@@ -1633,12 +1492,12 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
 
       transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
       quant(encoder, pre_quant_coeff, coeff_u, width >> 1, width >> 1, &ac_sum, 2,
-            scan_idx_chroma, ti->block_type);
+            scan_idx_chroma, cur_cu->type);
 
       for (i = 0; i < width *width >> 2; i++) {
         if (coeff_u[i] != 0) {
           // Found one, we can break here
-          cb_u = 1;
+          cur_cu->coeff_u = 1;
           break;
         }
       }
@@ -1657,19 +1516,19 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
 
       transform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
       quant(encoder, pre_quant_coeff, coeff_v, width >> 1, width >> 1, &ac_sum, 3,
-            scan_idx_chroma, ti->block_type);
+            scan_idx_chroma, cur_cu->type);
 
       for (i = 0; i < width *width >> 2; i++) {
         if (coeff_v[i] != 0) {
           // Found one, we can break here
-          cb_v = 1;
+          cur_cu->coeff_v = 1;
           break;
         }
       }
           
-      if (cb_u) {
+      if (cur_cu->coeff_u) {
         // RECONSTRUCT for predictions
-        dequant(encoder, coeff_u, pre_quant_coeff, width >> 1, width >> 1, 2, ti->block_type);
+        dequant(encoder, coeff_u, pre_quant_coeff, width >> 1, width >> 1, 2, cur_cu->type);
         itransform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
 
         i = 0;
@@ -1689,13 +1548,13 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
           for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
             recbase_u[x + y * (recbase_stride >> 1)] = (uint8_t)CLIP(0, 255,
                                                                      pred_u[x + y * (pred_stride >> 1)]);
-      }
           }
         }
+      }
       
-      if (cb_v) {
+      if (cur_cu->coeff_v) {
         // RECONSTRUCT for predictions
-        dequant(encoder, coeff_v, pre_quant_coeff, width >> 1, width >> 1, 3, ti->block_type);
+        dequant(encoder, coeff_v, pre_quant_coeff, width >> 1, width >> 1, 3, cur_cu->type);
         itransform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
 
         i = 0;
@@ -1715,25 +1574,23 @@ void encode_transform_tree(encoder_control *encoder, transform_info *ti,
           for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
             recbase_v[x + y * (recbase_stride >> 1)] = (uint8_t)CLIP(0, 255,
                                                                      pred_v[x + y * (pred_stride >> 1)]);
-      }
           }
         }
       }
-    
-    // Store coded block pattern
-    ti->cb[ti->idx] = cb_y | (cb_u << 1) | (cb_v << 2);
-    // END INTRAPREDICTION
+    }
+
     return;
   }
 
   // end Residual Coding
 }
 
-void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
+void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
                             int8_t depth, int8_t tr_depth)
 {
+  cu_info *cur_cu = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
   int8_t width = LCU_WIDTH>>depth;
-  int8_t split = ((ti->split[ti->idx]&(1<<depth))||!depth)?1:0;
+  int8_t split = (cur_cu->tr_depth > depth||!depth);
   int8_t cb_y, cb_u, cb_v;
   int32_t coeff_fourth = ((LCU_WIDTH>>(depth))*(LCU_WIDTH>>(depth)))+1;
   
@@ -1747,39 +1604,42 @@ void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
   // Chroma data is also signaled BEFORE transform split
   // Chroma data is not signaled if it was set to 0 before split
   if (encoder->in.video_format != FORMAT_400) {
+    uint8_t offset = 1<<(MAX_DEPTH-1-depth);
+
+    cu_info *cur_cu_idx_2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    cu_info *cur_cu_idx_3 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    cu_info *cur_cu_idx_4 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+
     // Non-zero chroma U Tcoeffs
-    int8_t cb_flag = (tr_depth == 0) ? ti->cb_top[1] : ((ti->cb[ti->idx] & 0x2) ? 1
-                                                        : 0);
+    int8_t cb_flag = (!split) ? cur_cu->coeff_u : (cur_cu->coeff_u | cur_cu_idx_2->coeff_u | cur_cu_idx_3->coeff_u  | cur_cu_idx_4->coeff_u);
     cabac.ctx = &g_qt_cbf_model_chroma[tr_depth];
 
-    if (tr_depth == 0 || ti->cb_top[1]) {
+    if (tr_depth == 0 /*|| ti->cb_top[1]*/) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_u");
     }
 
     // Non-zero chroma V Tcoeffs
     // NOTE: Using the same ctx as before
-    cb_flag = (tr_depth == 0) ? ti->cb_top[2] : ((ti->cb[ti->idx] & 0x4) ? 1 : 0);
+    cb_flag = (!split) ? cur_cu->coeff_v : (cur_cu->coeff_v | cur_cu_idx_2->coeff_v | cur_cu_idx_3->coeff_v  | cur_cu_idx_4->coeff_v);
 
-    if (tr_depth == 0 || ti->cb_top[2]) {
+    if (tr_depth == 0 /*|| ti->cb_top[2]*/) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_v");
     }
   }
   
   if (split) {
-    ti->idx = 0; encode_transform_coeff(encoder, ti, depth + 1, tr_depth + 1);
-    ti->idx = 1; encode_transform_coeff(encoder, ti, depth + 1, tr_depth + 1);
-    ti->idx = 2; encode_transform_coeff(encoder, ti, depth + 1, tr_depth + 1);
-    ti->idx = 3; encode_transform_coeff(encoder, ti, depth + 1, tr_depth + 1);
+    uint8_t offset = 1<<(MAX_DEPTH-1-depth);
+    encode_transform_coeff(encoder, x_cu, y_cu, depth + 1, tr_depth + 1);
+    encode_transform_coeff(encoder, x_cu + offset, y_cu,  depth + 1, tr_depth + 1);
+    encode_transform_coeff(encoder, x_cu, y_cu + offset,  depth + 1, tr_depth + 1);
+    encode_transform_coeff(encoder, x_cu + offset, y_cu + offset,  depth + 1, tr_depth + 1);
     return;
   }
 
-  cb_y = ti->cb[ti->idx] & 0x1;
-  cb_u = (ti->cb[ti->idx] & 0x2) ? 1 : 0;
-  cb_v = (ti->cb[ti->idx] & 0x4) ? 1 : 0;
-  if(ti->block_type == CU_INTRA || tr_depth || cb_u || cb_v) {
+  if(cur_cu->type == CU_INTRA || tr_depth || cb_u || cb_v) {
       // Non-zero luma Tcoeffs
-      cabac.ctx = &g_qt_cbf_model_luma[tr_depth ? 0 : 1];
-      CABAC_BIN(&cabac, cb_y, "cbf_luma");
+      cabac.ctx = &g_qt_cbf_model_luma[!tr_depth];
+      CABAC_BIN(&cabac, cur_cu->coeff_y, "cbf_luma");
   }
 
 
@@ -1802,8 +1662,8 @@ void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
 
     // CoeffNxN
     // Residual Coding
-    if (cb_y) {
-      if (ti->block_type == CU_INTER) {
+    if (cur_cu->coeff_y) {
+      if (cur_cu->type == CU_INTER) {
         scan_idx = SCAN_DIAG;
       } else {
         // Luma (Intra) scanmode
@@ -1818,18 +1678,18 @@ void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
       encode_coeff_nxn(encoder, &ti->coeff[0][ti->idx * coeff_fourth], width, 0, scan_idx);
     }
 
-    if (cb_u || cb_v) {
+    if (cur_cu->coeff_u || cur_cu->coeff_v) {
       int8_t chroma_width = width >> 1;
-      if(ti->block_type == CU_INTER) {
+      if(cur_cu->type == CU_INTER) {
         scan_idx = SCAN_DIAG;
       } else {
         // Chroma scanmode
         ctx_idx++;
-        dir_mode = ti->intra_pred_mode_chroma;
+        dir_mode = cur_cu->intra.mode_chroma;
 
         if (dir_mode == 36) {
           // TODO: support NxN
-          dir_mode = ti->intra_pred_mode;
+          dir_mode = cur_cu->intra.mode;
         }
 
         scan_idx = SCAN_DIAG;
@@ -1839,12 +1699,12 @@ void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
         }
       }
 
-      if (cb_u) {
+      if (cur_cu->coeff_u) {
         encode_coeff_nxn(encoder, &ti->coeff[1][ti->idx * coeff_fourth >> 1],
                          chroma_width, 2, scan_idx);
       }
 
-      if (cb_v) {
+      if (cur_cu->coeff_v) {
         encode_coeff_nxn(encoder, &ti->coeff[2][ti->idx * coeff_fourth >> 1],
                          chroma_width, 2, scan_idx);
       }
@@ -1852,7 +1712,7 @@ void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
   }
 }
 
-void encode_coeff_nxn(encoder_control *encoder, int16_t *coeff, uint8_t width,
+void encode_coeff_nxn(encoder_control *encoder, coefficient *coeff, uint8_t width,
                       uint8_t type, int8_t scan_mode)
 {
   int c1 = 1;
diff --git a/src/encoder.h b/src/encoder.h
index 2d37686f..f88046ad 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -67,39 +67,6 @@ typedef struct
   int8_t tc_offset_div2;   // \brief (deblocking)tc offset (div 2), range -6...6
 } encoder_control;
 
-typedef struct
-{
-  int8_t idx;
-  pixel *base;
-  pixel *base_u;
-  pixel *base_v;
-  
-  pixel *recbase;
-  pixel *recbase_u;
-  pixel *recbase_v;
-  
-  int16_t *pred;
-  int16_t *pred_u;
-  int16_t *pred_v;
-
-  int32_t base_stride;
-  int32_t recbase_stride;
-  int32_t pred_stride;
-  
-  // TODO: unify luma+chroma arrays
-  int16_t *coeff[3];
-  int8_t cb_top[3];
-  int8_t cb[4];
-  int8_t intra_pred_mode;
-  int8_t intra_pred_mode_chroma;
-  int32_t split[4];
-
-  int8_t block_type;
-
-  int32_t x_ctb,y_ctb;
-
-} transform_info;
-
 void init_tables(void);
 void init_encoder_control(encoder_control *control, bitstream *output);
 void init_encoder_input(encoder_input *input, FILE* inputfile,
@@ -119,9 +86,9 @@ void encode_last_significant_xy(encoder_control *encoder, uint8_t lastpos_x,
                                 uint8_t type, uint8_t scan);
 void encode_coeff_nxn(encoder_control *encoder, int16_t *coeff, uint8_t width,
                       uint8_t type, int8_t scan_mode);
-void encode_transform_tree(encoder_control *encoder, transform_info *ti,
+void encode_transform_tree(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
                            uint8_t depth);
-void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
+void encode_transform_coeff(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
                             int8_t depth, int8_t tr_depth);
 
 extern int16_t g_lambda_cost[55];
diff --git a/src/global.h b/src/global.h
index d54a322c..ced9716d 100644
--- a/src/global.h
+++ b/src/global.h
@@ -108,5 +108,6 @@ typedef int16_t coefficient;
 #endif
 
 #define FREE_POINTER(pointer) { free(pointer); pointer = NULL; }
+#define MOVE_POINTER(dst_pointer,src_pointer) { dst_pointer = src_pointer; src_pointer = NULL; }
 
 #endif
\ No newline at end of file
diff --git a/src/picture.c b/src/picture.c
index 1253a083..d88e56a9 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -244,7 +244,8 @@ picture *picture_init(int32_t width, int32_t height,
     memset(pic->cu_array[i], 0, sizeof(cu_info) * cu_array_size);
   }
 
-  pic->coeff = NULL;
+  pic->coeff_y = NULL; pic->coeff_u = NULL; pic->coeff_v = NULL;
+  pic->pred_y = NULL; pic->pred_u = NULL; pic->pred_v = NULL;
 
   return pic;
 }
@@ -277,8 +278,13 @@ int picture_destroy(picture *pic)
   free(pic->cu_array);
   pic->cu_array = NULL;
 
-  free(pic->coeff);
-  pic->coeff = NULL;
+  FREE_POINTER(pic->coeff_y);
+  FREE_POINTER(pic->coeff_u);
+  FREE_POINTER(pic->coeff_v);
+
+  FREE_POINTER(pic->pred_y);
+  FREE_POINTER(pic->pred_u);
+  FREE_POINTER(pic->pred_v);
 
   return 1;
 }
@@ -414,7 +420,7 @@ unsigned satd_16bit_8x8_general(int16_t *piOrg, int32_t iStrideOrg, int16_t *piC
       } \
     } \
     return sum; \
-  }
+    }
 
 // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64
 SATD_NXN(8, int16_t, 16bit)
@@ -422,7 +428,7 @@ SATD_NXN(16, int16_t, 16bit)
 SATD_NXN(32, int16_t, 16bit)
 SATD_NXN(64, int16_t, 16bit)
 
-
+  for (y = 0; y < 32; y += 8) {
 // Function macro for defining SAD calculating functions 
 // for fixed size blocks.
 #define SAD_NXN(n, pixel_type, suffix) \
@@ -438,7 +444,7 @@ SATD_NXN(64, int16_t, 16bit)
       } \
     } \
     return sum; \
-  }
+    }
 
 // These macros define sad_16bit_nxn functions for n = 4, 8, 16, 32, 64
 // with function signatures of cost_16bit_nxn_func.
@@ -469,9 +475,9 @@ cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n)
     return &satd_16bit_64x64;
   default:
     return NULL;
+    }
   }
-}
-
+  
 /**
  * \brief  Get a function that calculates SAD for NxN block.
  * 
@@ -480,7 +486,7 @@ cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n)
  * \returns  Pointer to cost_16bit_nxn_func.
  */
 cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n)
-{
+  {
   switch (n) {
   case 4:
     return &sad_16bit_4x4;
@@ -494,7 +500,7 @@ cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n)
     return &sad_16bit_64x64;
   default:
     return NULL;
-  }
+  }  
 }
 
 /**
@@ -510,7 +516,7 @@ unsigned satd_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
 {
   cost_16bit_nxn_func sad_func = get_satd_16bit_nxn_func(n);
   return sad_func(block1, block2);
-}
+  }
 
 /**
  * \brief Calculate SAD for NxN block of size N.
@@ -532,10 +538,10 @@ unsigned sad_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
     for (row = 0; row < n; row += n) {
       for (x = 0; x < n; ++x) {
         sum += abs(block1[row + x] - block2[row + x]);
-      }
-    }
-    return sum;
   }
+    }
+  return sum;
+}
 }
 
 /**
diff --git a/src/picture.h b/src/picture.h
index 12088a89..1ad6eced 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -32,6 +32,7 @@ enum { REF_PIC_LIST_0 = 0, REF_PIC_LIST_1 = 1, REF_PIC_LIST_X = 100 };
 typedef struct
 {
   int8_t mode;
+  int8_t mode_chroma;
   uint32_t cost;
 } cu_info_intra;
 
@@ -57,9 +58,15 @@ typedef struct
   int8_t part_size;  //!< \brief Currently only 2Nx2N, TODO: AMP/SMP/NxN parts
   int8_t tr_depth;   //!< \brief transform depth
   int8_t coded;      //!< \brief flag to indicate this block is coded and reconstructed
+  int8_t skipped;    //!< \brief flag to indicate this block is skipped
+  int8_t merged;     //!< \brief flag to indicate this block is merged
   int8_t coeff_y;    //!< \brief is there coded coeffs Y
   int8_t coeff_u;    //!< \brief is there coded coeffs U
   int8_t coeff_v;    //!< \brief is there coded coeffs V
+
+  int8_t coeff_top_y;    //!< \brief is there coded coeffs Y in top level
+  int8_t coeff_top_u;    //!< \brief is there coded coeffs U in top level
+  int8_t coeff_top_v;    //!< \brief is there coded coeffs V in top level
   cu_info_intra intra;
   cu_info_inter inter;
 } cu_info;
@@ -77,7 +84,13 @@ typedef struct
   pixel* u_recdata;     //!< \brief Pointer to reconstructed U-data.
   pixel* v_recdata;     //!< \brief Pointer to reconstructed V-data.
 
-  coefficient* coeff;   //!< \brief coefficient pointer
+  pixel* pred_y;        //!< \brief Pointer to predicted Y
+  pixel* pred_u;        //!< \brief Pointer to predicted U
+  pixel* pred_v;        //!< \brief Pointer to predicted V
+
+  coefficient* coeff_y;   //!< \brief coefficient pointer Y
+  coefficient* coeff_u;   //!< \brief coefficient pointer U
+  coefficient* coeff_v;   //!< \brief coefficient pointer V
 
   int32_t width;          //!< \brief Luma pixel array width.
   int32_t height;         //!< \brief Luma pixel array height.
@@ -121,6 +134,7 @@ int picture_list_rem(picture_list *list, int n, int8_t destroy);
 
 typedef unsigned (*cost_16bit_nxn_func)(int16_t *block1, int16_t *block2);
 
+
 cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n);
 cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n);
 

From c44f0ff5406718e53ecaba3e5cdd0876c6fdf2ac Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 18 Oct 2013 14:23:21 +0300
Subject: [PATCH 07/19] Refactoring: all int16_t pixel info in intra to pixel
 typedef

---
 src/encoder.c | 91 +++++++++++++++++++++++++++++++++++++++------------
 src/global.h  |  2 +-
 src/intra.c   | 49 ++++++++++++++-------------
 src/intra.h   | 16 ++++-----
 src/picture.c | 27 ++++++++-------
 src/picture.h |  6 ++--
 src/search.c  | 15 ++++-----
 7 files changed, 125 insertions(+), 81 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 507e7d4b..a3dd0b0a 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1090,15 +1090,15 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     uint32_t width = LCU_WIDTH>>depth;
 
     // INTRAPREDICTION VARIABLES
-    int16_t pred_y[LCU_WIDTH * LCU_WIDTH];
+    pixel pred_y[LCU_WIDTH * LCU_WIDTH];
 
     pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))     + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
     pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
     pixel *recbase_v = &encoder->in.cur_pic->v_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
 
     // SEARCH BEST INTRA MODE (AGAIN)
-    int16_t rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
-    int16_t *rec_shift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
+    pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
+    pixel *rec_shift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
     intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
                                  (LCU_WIDTH >> (depth)) * 2 + 8, rec, 
                                  (LCU_WIDTH >> (depth)) * 2 + 8, 0);
@@ -1301,9 +1301,12 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     pixel *pred_v    = &encoder->in.cur_pic->pred_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
     int32_t pred_stride = encoder->in.width;
 
-    int16_t *coeff_y   = &encoder->in.cur_pic->coeff_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
-    int16_t *coeff_u   = &encoder->in.cur_pic->coeff_u[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
-    int16_t *coeff_v   = &encoder->in.cur_pic->coeff_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
+    coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
+    coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
+    coefficient *orig_coeff_y   = &encoder->in.cur_pic->coeff_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+    coefficient *orig_coeff_u   = &encoder->in.cur_pic->coeff_u[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    coefficient *orig_coeff_v   = &encoder->in.cur_pic->coeff_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
     int32_t coeff_stride = encoder->in.width;
 
     // Quant and transform here...
@@ -1312,9 +1315,9 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 
     // INTRA PREDICTION
     // TODO: split to a function!
-    int16_t rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
-    int16_t *rec_shift  = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
-    int16_t *rec_shift_u = &rec[(LCU_WIDTH >> (depth + 1)) * 2 + 8 + 1];
+    pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
+    pixel *rec_shift  = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
+    pixel *rec_shift_u = &rec[(LCU_WIDTH >> (depth + 1)) * 2 + 8 + 1];
 
     uint32_t ac_sum = 0;
     uint32_t ctx_idx;
@@ -1454,14 +1457,21 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
         
     // if non-zero coeffs
     if (cur_cu->coeff_y) {
+      i = 0;
+      for (y = 0; y < width; y++) {
+        for (x = 0; x < width; x++) {
+          orig_coeff_y[x + y * coeff_stride] = coeff_y[i];
+          i++;
+        }
+      }
       // RECONSTRUCT for predictions
       dequant(encoder, coeff_y, pre_quant_coeff, width, width, 0, cur_cu->type);
       itransform2d(block,pre_quant_coeff,width,0);
 
       i = 0;
 
-      for (y = 0; y < LCU_WIDTH >> depth; y++) {
-        for (x = 0; x < LCU_WIDTH >> depth; x++) {
+      for (y = 0; y < width; y++) {
+        for (x = 0; x < width; x++) {
           int16_t val = block[i++] + pred_y[x + y * pred_stride];
           //TODO: support 10+bits
           recbase_y[x + y * recbase_stride] = (uint8_t)CLIP(0, 255, val);
@@ -1470,8 +1480,8 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       // END RECONTRUCTION
     } else {
       // without coeffs, we only use the prediction
-      for (y = 0; y < LCU_WIDTH >> depth; y++) {
-        for (x = 0; x < LCU_WIDTH >> depth; x++) {
+      for (y = 0; y < width; y++) {
+        for (x = 0; x < width; x++) {
           recbase_y[x + y * recbase_stride] = (uint8_t)CLIP(0, 255, pred_y[x + y * pred_stride]);
         }
       }
@@ -1526,7 +1536,18 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
         }
       }
           
-      if (cur_cu->coeff_u) {
+      if (cur_cu->coeff_u || cur_cu->coeff_v) { 
+        i = 0;
+        for (y = 0; y < width>>1; y++) {
+          for (x = 0; x < width>>1; x++) {
+            orig_coeff_u[x + y * (coeff_stride>>1)] = coeff_u[i];
+            orig_coeff_v[x + y * (coeff_stride>>1)] = coeff_v[i];
+            i++;
+          }
+        }
+      }
+
+      if (cur_cu->coeff_u) {        
         // RECONSTRUCT for predictions
         dequant(encoder, coeff_u, pre_quant_coeff, width >> 1, width >> 1, 2, cur_cu->type);
         itransform2d(block,pre_quant_coeff,LCU_WIDTH>>(depth+1),65535);
@@ -1590,8 +1611,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 {
   cu_info *cur_cu = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
   int8_t width = LCU_WIDTH>>depth;
-  int8_t split = (cur_cu->tr_depth > depth||!depth);
-  int8_t cb_y, cb_u, cb_v;
+  int8_t split = 0;//(cur_cu->tr_depth > depth||!depth);
   int32_t coeff_fourth = ((LCU_WIDTH>>(depth))*(LCU_WIDTH>>(depth)))+1;
   
   if (depth != 0 && depth != MAX_DEPTH + 1) {
@@ -1636,7 +1656,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     return;
   }
 
-  if(cur_cu->type == CU_INTRA || tr_depth || cb_u || cb_v) {
+  if(cur_cu->type == CU_INTRA || tr_depth || cur_cu->coeff_u || cur_cu->coeff_v) {
       // Non-zero luma Tcoeffs
       cabac.ctx = &g_qt_cbf_model_luma[!tr_depth];
       CABAC_BIN(&cabac, cur_cu->coeff_y, "cbf_luma");
@@ -1644,10 +1664,39 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 
 
   {
+    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
+    coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
+    coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
+    int32_t coeff_stride = encoder->in.width;
+
     uint32_t ctx_idx;
     uint32_t scan_idx = SCAN_DIAG;
     uint32_t dir_mode;
 
+    if (cur_cu->coeff_y) {
+      int x,y;
+      coefficient *orig_pos = &encoder->in.cur_pic->coeff_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+      for (y = 0; y < width; y++) {
+        for (x = 0; x < width; x++) {
+          coeff_y[x+y*width] = orig_pos[x];
+        }
+        orig_pos += coeff_stride;
+      }
+    }
+    if (cur_cu->coeff_u || cur_cu->coeff_v) {
+      int x,y;
+      coefficient *orig_pos_u = &encoder->in.cur_pic->coeff_u[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+      coefficient *orig_pos_v = &encoder->in.cur_pic->coeff_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+      for (y = 0; y < (width>>1); y++) {
+        for (x = 0; x < (width>>1); x++) {
+          coeff_u[x+y*(width>>1)] = orig_pos_u[x];
+          coeff_v[x+y*(width>>1)] = orig_pos_v[x];
+        }
+        orig_pos_u += coeff_stride>>1;
+        orig_pos_v += coeff_stride>>1;
+      }
+    }      
+
     switch (width) {
       case  2: ctx_idx = 6; break;
       case  4: ctx_idx = 5; break;
@@ -1667,7 +1716,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
         scan_idx = SCAN_DIAG;
       } else {
         // Luma (Intra) scanmode
-        dir_mode = ti->intra_pred_mode;
+        dir_mode = cur_cu->intra.mode;
 
         //if multiple scans supported for transform size
         if (ctx_idx > 3 && ctx_idx < 6) {
@@ -1675,7 +1724,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
         }
       }
 
-      encode_coeff_nxn(encoder, &ti->coeff[0][ti->idx * coeff_fourth], width, 0, scan_idx);
+      encode_coeff_nxn(encoder, coeff_y, width, 0, scan_idx);
     }
 
     if (cur_cu->coeff_u || cur_cu->coeff_v) {
@@ -1700,12 +1749,12 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       }
 
       if (cur_cu->coeff_u) {
-        encode_coeff_nxn(encoder, &ti->coeff[1][ti->idx * coeff_fourth >> 1],
+        encode_coeff_nxn(encoder, coeff_u,
                          chroma_width, 2, scan_idx);
       }
 
       if (cur_cu->coeff_v) {
-        encode_coeff_nxn(encoder, &ti->coeff[2][ti->idx * coeff_fourth >> 1],
+        encode_coeff_nxn(encoder, coeff_v,
                          chroma_width, 2, scan_idx);
       }
     }
diff --git a/src/global.h b/src/global.h
index ced9716d..3c37ebf7 100644
--- a/src/global.h
+++ b/src/global.h
@@ -45,7 +45,7 @@ typedef int16_t coefficient;
 #define LCU_WIDTH 64 /*!< Largest Coding Unit (IT'S 64x64, DO NOT TOUCH!) */
 
 #define MAX_INTER_SEARCH_DEPTH 3
-#define MIN_INTER_SEARCH_DEPTH 0
+#define MIN_INTER_SEARCH_DEPTH 1
 
 #define MAX_INTRA_SEARCH_DEPTH 3 /*!< Max search depth -> min block size (3 == 8x8) */
 #define MIN_INTRA_SEARCH_DEPTH 1 /*!< Min search depth -> max block size (0 == 64x64) */
diff --git a/src/intra.c b/src/intra.c
index f2953227..fc2372d8 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -77,7 +77,7 @@ int8_t intra_get_block_mode(picture *pic, uint32_t x_cu, uint32_t y_cu, uint8_t
  * \param width block width
  * \returns DC prediction
 */
-int16_t intra_get_dc_pred(int16_t *pic, uint16_t picwidth, uint32_t xpos, uint32_t ypos, uint8_t width)
+int16_t intra_get_dc_pred(pixel *pic, uint16_t picwidth, uint32_t xpos, uint32_t ypos, uint8_t width)
 {
   int32_t i, sum = 0;
 
@@ -155,11 +155,11 @@ int8_t intra_get_dir_luma_predictor(picture* pic, uint32_t x_cu, uint32_t y_cu,
  * \param preds output buffer for 3 predictions 
  * \returns (predictions are found)?1:0
  */
-void intra_filter(int16_t *ref, int32_t stride,int32_t width, int8_t mode)
+void intra_filter(pixel *ref, int32_t stride,int32_t width, int8_t mode)
 {
   #define FWIDTH (LCU_WIDTH*2+1)
-  int16_t filtered[FWIDTH * FWIDTH]; //!< temporary buffer for filtered samples
-  int16_t *filteredShift = &filtered[FWIDTH+1]; //!< pointer to temporary buffer with offset (1,1)
+  pixel filtered[FWIDTH * FWIDTH]; //!< temporary buffer for filtered samples
+  pixel *filteredShift = &filtered[FWIDTH+1]; //!< pointer to temporary buffer with offset (1,1)
   int x,y;
 
   if (!mode) {
@@ -213,8 +213,8 @@ void intra_filter(int16_t *ref, int32_t stride,int32_t width, int8_t mode)
 
  This function derives the prediction samples for planar mode (intra coding).
 */
-int16_t intra_prediction(pixel *orig, int32_t origstride, int16_t *rec, int32_t recstride, uint32_t xpos,
-                         uint32_t ypos, uint32_t width, int16_t *dst, int32_t dststride, uint32_t *sad_out)
+int16_t intra_prediction(pixel *orig, int32_t origstride, pixel *rec, int32_t recstride, uint32_t xpos,
+                         uint32_t ypos, uint32_t width, pixel *dst, int32_t dststride, uint32_t *sad_out)
 {
   uint32_t best_sad = 0xffffffff;
   uint32_t sad = 0;
@@ -225,11 +225,11 @@ int16_t intra_prediction(pixel *orig, int32_t origstride, int16_t *rec, int32_t
 
   // Temporary block arrays
   // TODO: alloc with alignment
-  int16_t pred[LCU_WIDTH * LCU_WIDTH + 1];  
-  int16_t orig_block[LCU_WIDTH * LCU_WIDTH + 1];  
-  int16_t rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
+  pixel pred[LCU_WIDTH * LCU_WIDTH + 1];  
+  pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1];  
+  pixel rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
   
-  int16_t* rec_filtered = &rec_filtered_temp[recstride + 1]; //!< pointer to rec_filtered_temp with offset of (1,1)
+  pixel* rec_filtered = &rec_filtered_temp[recstride + 1]; //!< pointer to rec_filtered_temp with offset of (1,1)
   pixel *orig_shift = &orig[xpos + ypos*origstride];  //!< pointer to orig with offset of (1,1)
   int8_t filter = (width<32); // TODO: chroma support
 
@@ -318,10 +318,10 @@ int16_t intra_prediction(pixel *orig, int32_t origstride, int16_t *rec, int32_t
  * \param chroma chroma-block flag
 
 */
-void intra_recon(int16_t* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,uint32_t width, int16_t* dst,int32_t dststride, int8_t mode, int8_t chroma)
+void intra_recon(pixel* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,uint32_t width, pixel* dst,int32_t dststride, int8_t mode, int8_t chroma)
 {
   int32_t x,y,i;
-  int16_t pred[LCU_WIDTH * LCU_WIDTH];
+  pixel pred[LCU_WIDTH * LCU_WIDTH];
   int8_t filter = !chroma&&(width<32);
   #define COPY_PRED_TO_DST() for(y = 0; y < (int32_t)width; y++)  { for(x = 0; x < (int32_t)width; x++) { dst[x+y*dststride] = pred[x+y*width]; } }
 
@@ -362,12 +362,12 @@ void intra_recon(int16_t* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,u
  * \param chroma signaling if chroma is used, 0 = luma, 1 = U and 2 = V    
  *
  */
-void intra_build_reference_border(picture *pic, int32_t x_cu, int32_t y_cu,int16_t outwidth, int16_t *dst, int32_t dststride, int8_t chroma)
+void intra_build_reference_border(picture *pic, int32_t x_cu, int32_t y_cu,int16_t outwidth, pixel *dst, int32_t dststride, int8_t chroma)
 {
   int32_t left_column; //!< left column iterator
-  int16_t val;         //!< variable to store extrapolated value
+  pixel val;         //!< variable to store extrapolated value
   int32_t i;           //!< index iterator
-  int16_t dc_val       = 1<<(g_bitdepth-1); //!< default predictor value
+  pixel dc_val       = 1<<(g_bitdepth-1); //!< default predictor value
   int32_t top_row;     //!< top row iterator
   int32_t src_width    = (pic->width>>(chroma?1:0)); //!< source picture width
   int32_t src_height   = (pic->height>>(chroma?1:0));//!< source picture height
@@ -443,7 +443,7 @@ const int32_t inv_ang_table[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 256};
  * \brief this functions constructs the angular intra prediction from border samples
  *
  */
-void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int32_t dst_stride, int32_t width,
+void intra_get_angular_pred(pixel* src, int32_t src_stride, pixel* dst, int32_t dst_stride, int32_t width,
                            int32_t height, int32_t dir_mode, int8_t left_avail,int8_t top_avail, int8_t filter)
 {
   int32_t k,l;
@@ -460,10 +460,10 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3
   int32_t inv_angle       = inv_ang_table[abs_ang];
 
   // Do angular predictions
-  int16_t *ref_main;
-  int16_t *ref_side;
-  int16_t  ref_above[2 * LCU_WIDTH + 1];
-  int16_t  ref_left[2 * LCU_WIDTH + 1];
+  pixel *ref_main;
+  pixel *ref_side;
+  pixel  ref_above[2 * LCU_WIDTH + 1];
+  pixel  ref_left[2 * LCU_WIDTH + 1];
 
   abs_ang           = ang_table[abs_ang];
   intra_pred_angle  = sign_ang * abs_ang;
@@ -522,7 +522,7 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3
         // Do linear filtering
         for (l = 0; l < blk_size; l++) {
           ref_main_index        = l + delta_int + 1;
-          dst[k * dst_stride + l] = (int16_t) ( (minus_delta_fract * ref_main[ref_main_index]
+          dst[k * dst_stride + l] = (pixel) ( (minus_delta_fract * ref_main[ref_main_index]
                                                  + delta_fract * ref_main[ref_main_index + 1] + 16) >> 5);
         }
       } else {
@@ -536,7 +536,7 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3
 
   // Flip the block if this is the horizontal mode
   if (mode_hor) {
-    int16_t tmp;
+    pixel tmp;
     for (k=0;k<blk_size-1;k++) {
       for (l=k+1;l<blk_size;l++) {
         tmp                 = dst[k * dst_stride + l];
@@ -551,7 +551,7 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3
 
 
 
-void intra_dc_pred_filtering(int16_t *src, int32_t src_stride, int16_t *dst, int32_t dst_stride, int32_t width, int32_t height )
+void intra_dc_pred_filtering(pixel *src, int32_t src_stride, pixel *dst, int32_t dst_stride, int32_t width, int32_t height )
 {
   int32_t x, y, dst_stride2, src_stride2;
 
@@ -580,9 +580,8 @@ void intra_dc_pred_filtering(int16_t *src, int32_t src_stride, int16_t *dst, int
  
   This function derives the prediction samples for planar mode (intra coding).
 */
-void intra_get_planar_pred(int16_t* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, int16_t* dst,int32_t dststride)
+void intra_get_planar_pred(pixel* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, pixel* dst,int32_t dststride)
 {
-  int16_t dc_val = 1<<(g_bitdepth-1);
   int32_t k, l, bottom_left, top_right;
   int32_t hor_pred;
   int32_t left_column[LCU_WIDTH+1], top_row[LCU_WIDTH+1], bottom_row[LCU_WIDTH+1], right_column[LCU_WIDTH+1];
diff --git a/src/intra.h b/src/intra.h
index c99ba312..627da971 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -21,19 +21,19 @@ void intra_set_block_mode(picture* pic,uint32_t x_ctb, uint32_t y_ctb, uint8_t d
 int8_t intra_get_block_mode(picture* pic, uint32_t x_ctb, uint32_t y_ctb, uint8_t depth);
 
 int8_t intra_get_dir_luma_predictor(picture* pic,uint32_t x_ctb, uint32_t y_ctb, uint8_t depth, int8_t* preds);
-void intra_dc_pred_filtering(int16_t* src, int32_t src_stride, int16_t* dst, int32_t dst_stride, int32_t width, int32_t height );
+void intra_dc_pred_filtering(pixel* src, int32_t src_stride, pixel* dst, int32_t dst_stride, int32_t width, int32_t height );
 
-void intra_build_reference_border(picture* pic, int32_t x_ctb, int32_t y_ctb, int16_t out_width, int16_t* dst, int32_t dst_stride, int8_t chroma);
-void intra_filter(int16_t* ref, int32_t stride, int32_t width, int8_t mode);
+void intra_build_reference_border(picture* pic, int32_t x_ctb, int32_t y_ctb, int16_t out_width, pixel* dst, int32_t dst_stride, int8_t chroma);
+void intra_filter(pixel* ref, int32_t stride, int32_t width, int8_t mode);
 
 /* Predictions */
-int16_t intra_prediction(pixel* orig, int32_t orig_stride, int16_t* rec, int32_t rec_stride,  uint32_t x_pos, uint32_t ypos, uint32_t width, int16_t* dst, int32_t dst_stride, uint32_t *sad);
+int16_t intra_prediction(pixel* orig, int32_t orig_stride, pixel* rec, int32_t rec_stride,  uint32_t x_pos, uint32_t ypos, uint32_t width, pixel* dst, int32_t dst_stride, uint32_t *sad);
 
-int16_t intra_get_dc_pred(int16_t* pic, uint16_t pic_width, uint32_t x_pos, uint32_t y_pos, uint8_t width);
-void intra_get_planar_pred(int16_t* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, int16_t* dst,int32_t dststride);
-void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* p_dst, int32_t dst_stride, int32_t width, int32_t height, int32_t dir_mode, int8_t left_avail,int8_t top_avail, int8_t filter);
+int16_t intra_get_dc_pred(pixel* pic, uint16_t pic_width, uint32_t x_pos, uint32_t y_pos, uint8_t width);
+void intra_get_planar_pred(pixel* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, pixel* dst,int32_t dststride);
+void intra_get_angular_pred(pixel* src, int32_t src_stride, pixel* p_dst, int32_t dst_stride, int32_t width, int32_t height, int32_t dir_mode, int8_t left_avail,int8_t top_avail, int8_t filter);
 
-void intra_recon(int16_t* rec, uint32_t rec_stride, uint32_t x_pos, uint32_t y_pos, uint32_t width, int16_t* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
+void intra_recon(pixel* rec, uint32_t rec_stride, uint32_t x_pos, uint32_t y_pos, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
 
 
 #endif
diff --git a/src/picture.c b/src/picture.c
index d88e56a9..3983d392 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -313,7 +313,7 @@ double image_psnr(pixel *frame1, pixel *frame2, int32_t x, int32_t y)
 /**
  * \brief  Calculate SATD between two 8x8 blocks inside bigger arrays.
  */
-unsigned satd_16bit_8x8_general(int16_t *piOrg, int32_t iStrideOrg, int16_t *piCur, int32_t iStrideCur)
+unsigned satd_16bit_8x8_general(pixel *piOrg, int32_t iStrideOrg, pixel *piCur, int32_t iStrideCur)
 {
   int32_t k, i, j, jj, sad=0;
   int32_t diff[64], m1[8][8], m2[8][8], m3[8][8];
@@ -423,12 +423,11 @@ unsigned satd_16bit_8x8_general(int16_t *piOrg, int32_t iStrideOrg, int16_t *piC
     }
 
 // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64
-SATD_NXN(8, int16_t, 16bit)
-SATD_NXN(16, int16_t, 16bit)
-SATD_NXN(32, int16_t, 16bit)
-SATD_NXN(64, int16_t, 16bit)
+SATD_NXN(8, pixel, 16bit)
+SATD_NXN(16, pixel, 16bit)
+SATD_NXN(32, pixel, 16bit)
+SATD_NXN(64, pixel, 16bit)
 
-  for (y = 0; y < 32; y += 8) {
 // Function macro for defining SAD calculating functions 
 // for fixed size blocks.
 #define SAD_NXN(n, pixel_type, suffix) \
@@ -444,16 +443,16 @@ SATD_NXN(64, int16_t, 16bit)
       } \
     } \
     return sum; \
-    }
+  }
 
 // These macros define sad_16bit_nxn functions for n = 4, 8, 16, 32, 64
 // with function signatures of cost_16bit_nxn_func.
 // They are used through get_sad_16bit_nxn_func.
-SAD_NXN(4, int16_t, 16bit)
-SAD_NXN(8, int16_t, 16bit)
-SAD_NXN(16, int16_t, 16bit)
-SAD_NXN(32, int16_t, 16bit)
-SAD_NXN(64, int16_t, 16bit)
+SAD_NXN(4, pixel, 16bit)
+SAD_NXN(8, pixel, 16bit)
+SAD_NXN(16, pixel, 16bit)
+SAD_NXN(32, pixel, 16bit)
+SAD_NXN(64, pixel, 16bit)
 
 /**
  * \brief  Get a function that calculates SATD for NxN block.
@@ -512,7 +511,7 @@ cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n)
  * 
  * \returns       Sum of Absolute Transformed Differences (SATD)
  */
-unsigned satd_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
+unsigned satd_nxn_16bit(pixel *block1, pixel *block2, unsigned n)
 {
   cost_16bit_nxn_func sad_func = get_satd_16bit_nxn_func(n);
   return sad_func(block1, block2);
@@ -527,7 +526,7 @@ unsigned satd_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
  * 
  * \returns       Sum of Absolute Differences
  */
-unsigned sad_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
+unsigned sad_nxn_16bit(pixel *block1, pixel *block2, unsigned n)
 {
   cost_16bit_nxn_func sad_func = get_sad_16bit_nxn_func(n);
   if (sad_func) {
diff --git a/src/picture.h b/src/picture.h
index 1ad6eced..ec892683 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -132,14 +132,14 @@ int picture_list_destroy(picture_list *list);
 int picture_list_add(picture_list *list, picture *pic);
 int picture_list_rem(picture_list *list, int n, int8_t destroy);
 
-typedef unsigned (*cost_16bit_nxn_func)(int16_t *block1, int16_t *block2);
+typedef unsigned (*cost_16bit_nxn_func)(pixel *block1, pixel *block2);
 
 
 cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n);
 cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n);
 
-unsigned satd_16bit_nxn(int16_t *block1, int16_t *block2, unsigned n);
-unsigned sad_16bit_nxn(int16_t *block1, int16_t *block2, unsigned n);
+unsigned satd_16bit_nxn(pixel *block1, pixel *block2, unsigned n);
+unsigned sad_16bit_nxn(pixel *block1, pixel *block2, unsigned n);
 
 unsigned calc_sad(picture *pic, picture *ref, 
                   int pic_x, int pic_y, int ref_x, int ref_y, 
diff --git a/src/search.c b/src/search.c
index 7fe6bacc..e9aaac1f 100644
--- a/src/search.c
+++ b/src/search.c
@@ -171,13 +171,13 @@ void hexagon_search(picture *pic, picture *ref,
  * \brief
  */
 void search_buildReferenceBorder(picture *pic, int32_t x_ctb, int32_t y_ctb,
-                                 int16_t outwidth, int16_t *dst, 
+                                 int16_t outwidth, pixel *dst, 
                                  int32_t dststride, int8_t chroma)
 {
   int32_t left_col; // left column iterator
-  int16_t val;      // variable to store extrapolated value
+  pixel val;      // variable to store extrapolated value
   int32_t i;        // index iterator
-  int16_t dc_val = 1 << (g_bitdepth - 1); // default predictor value
+  pixel dc_val = 1 << (g_bitdepth - 1); // default predictor value
   int32_t top_row;  // top row iterator
   int32_t src_width = (pic->width >> (chroma ? 1 : 0));   // source picture width
   int32_t src_height = (pic->height >> (chroma ? 1 : 0)); // source picture height
@@ -325,12 +325,9 @@ void search_tree(encoder_control *encoder,
     uint32_t width = LCU_WIDTH >> depth;
 
     // INTRAPREDICTION
-    int16_t pred[LCU_WIDTH * LCU_WIDTH + 1];
-    int16_t rec[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8)];
-    int16_t *recShift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
-
-    //int16_t *pred = (int16_t*)malloc(LCU_WIDTH*LCU_WIDTH*sizeof(int16_t));
-    //int16_t *rec = (int16_t*)malloc((LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)*sizeof(int16_t));
+    pixel pred[LCU_WIDTH * LCU_WIDTH + 1];
+    pixel rec[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8)];
+    pixel *recShift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
 
     // Build reconstructed block to use in prediction with extrapolated borders
     search_buildReferenceBorder(encoder->in.cur_pic, x_ctb, y_ctb,

From 927155de2b8fae4997b3ce245c7993c9fdff238d Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 18 Oct 2013 16:23:15 +0300
Subject: [PATCH 08/19] Set correct block residual bit -> fixes deblocking

---
 src/encoder.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/encoder.c b/src/encoder.c
index a3dd0b0a..185265de 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1457,6 +1457,8 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
         
     // if non-zero coeffs
     if (cur_cu->coeff_y) {
+
+      picture_set_block_residual(encoder->in.cur_pic,x_cu,y_cu,depth,1);
       i = 0;
       for (y = 0; y < width; y++) {
         for (x = 0; x < width; x++) {

From bc9ddb64e57a74957d2c26e76b8e703a360a91d8 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 18 Oct 2013 16:54:11 +0300
Subject: [PATCH 09/19] Fixed luma reconstruction overflow/underflow

---
 src/encoder.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 185265de..2b51323a 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1474,9 +1474,9 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 
       for (y = 0; y < width; y++) {
         for (x = 0; x < width; x++) {
-          int16_t val = block[i++] + pred_y[x + y * pred_stride];
+          int val = block[i++] + pred_y[x + y * pred_stride];
           //TODO: support 10+bits
-          recbase_y[x + y * recbase_stride] = (uint8_t)CLIP(0, 255, val);
+          recbase_y[x + y * recbase_stride] = (pixel)CLIP(0, 255, val);
         }
       }
       // END RECONTRUCTION
@@ -1484,7 +1484,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       // without coeffs, we only use the prediction
       for (y = 0; y < width; y++) {
         for (x = 0; x < width; x++) {
-          recbase_y[x + y * recbase_stride] = (uint8_t)CLIP(0, 255, pred_y[x + y * pred_stride]);
+          recbase_y[x + y * recbase_stride] = (pixel)CLIP(0, 255, pred_y[x + y * pred_stride]);
         }
       }
     }

From 2efcc1267882d1ce44e66b3f4db0da00dd793ccf Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Fri, 18 Oct 2013 17:42:16 +0300
Subject: [PATCH 10/19] Fixed DC-filtering stride

---
 src/encoder.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 2b51323a..125019b8 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1369,7 +1369,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       // Filter DC-prediction
       if (cur_cu->intra.mode == 1 && width < 32) {
         intra_dc_pred_filtering(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8, pred_y,
-                                width, LCU_WIDTH >> depth, LCU_WIDTH >> depth);
+                                pred_stride, LCU_WIDTH >> depth, LCU_WIDTH >> depth);
       }
       
       // TODO : chroma intra prediction
@@ -1628,12 +1628,12 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
   if (encoder->in.video_format != FORMAT_400) {
     uint8_t offset = 1<<(MAX_DEPTH-1-depth);
 
-    cu_info *cur_cu_idx_2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
-    cu_info *cur_cu_idx_3 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
-    cu_info *cur_cu_idx_4 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    //cu_info *cur_cu_idx_2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    //cu_info *cur_cu_idx_3 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    //cu_info *cur_cu_idx_4 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
 
     // Non-zero chroma U Tcoeffs
-    int8_t cb_flag = (!split) ? cur_cu->coeff_u : (cur_cu->coeff_u | cur_cu_idx_2->coeff_u | cur_cu_idx_3->coeff_u  | cur_cu_idx_4->coeff_u);
+    int8_t cb_flag = (!split) ? cur_cu->coeff_u : cur_cu->coeff_u;//(cur_cu->coeff_u | cur_cu_idx_2->coeff_u | cur_cu_idx_3->coeff_u  | cur_cu_idx_4->coeff_u);
     cabac.ctx = &g_qt_cbf_model_chroma[tr_depth];
 
     if (tr_depth == 0 /*|| ti->cb_top[1]*/) {
@@ -1642,7 +1642,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 
     // Non-zero chroma V Tcoeffs
     // NOTE: Using the same ctx as before
-    cb_flag = (!split) ? cur_cu->coeff_v : (cur_cu->coeff_v | cur_cu_idx_2->coeff_v | cur_cu_idx_3->coeff_v  | cur_cu_idx_4->coeff_v);
+    cb_flag = (!split) ? cur_cu->coeff_v : cur_cu->coeff_v;//(cur_cu->coeff_v | cur_cu_idx_2->coeff_v | cur_cu_idx_3->coeff_v  | cur_cu_idx_4->coeff_v);
 
     if (tr_depth == 0 /*|| ti->cb_top[2]*/) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_v");

From bcb900371f3f608799c16cd5d1fd36d815d1d3a2 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 22 Oct 2013 12:09:18 +0300
Subject: [PATCH 11/19] Added coeff_top_y/u/v arrays to cu_info and implemented
 derivation logic

---
 src/encoder.c | 58 ++++++++++++++++++++++++++++-----------------------
 src/picture.h |  6 +++---
 2 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 125019b8..97e90584 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -840,8 +840,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
       }
       if (!border || (border_split_x && border_split_y)) {
         encode_coding_tree(encoder, x_ctb + change, y_ctb + change, depth + 1);
-      }
-      
+      }      
       return;
     }
   }
@@ -1057,21 +1056,19 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
                 y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
                 encoder->in.cur_pic);
-
     // Mark this block as "coded" (can be used for predictions..)
     picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
-
     encode_transform_tree(encoder,x_ctb, y_ctb, depth);
 
     // Only need to signal coded block flag if not skipped or merged
     // skip = no coded residual, merge = coded residual
     if (!cur_cu->merged) {
       cabac.ctx = &g_cu_qt_root_cbf_model;
-      CABAC_BIN(&cabac, cur_cu->coeff_y | cur_cu->coeff_u | cur_cu->coeff_v, "rqt_root_cbf");
+      CABAC_BIN(&cabac, cur_cu->coeff_top_y[depth] | cur_cu->coeff_top_u[depth] | cur_cu->coeff_top_v[depth], "rqt_root_cbf");
     }
     // Code (possible) coeffs to bitstream
      
-    if(cur_cu->coeff_y | cur_cu->coeff_u | cur_cu->coeff_v) {
+    if(cur_cu->coeff_top_y[depth] | cur_cu->coeff_top_u[depth] | cur_cu->coeff_top_v[depth]) {
       encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0);
     }
 
@@ -1277,10 +1274,23 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
   // Split transform and increase depth
   if (depth == 0 || cur_cu->tr_depth > depth) {
     uint8_t offset = 1<<(MAX_DEPTH-1-depth);
+    cu_info *cu_a =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    cu_info *cu_b =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    cu_info *cu_c =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
     encode_transform_tree(encoder, x_cu, y_cu, depth+1);
     encode_transform_tree(encoder, x_cu + offset, y_cu, depth+1);
     encode_transform_tree(encoder, x_cu, y_cu + offset, depth+1);
     encode_transform_tree(encoder, x_cu + offset, y_cu + offset, depth+1);
+
+    // Derive coded coeff flags from the next depth
+    cur_cu->coeff_top_y[depth] = cur_cu->coeff_top_y[depth+1] | cu_a->coeff_top_y[depth+1] | cu_b->coeff_top_y[depth+1]
+                                  | cu_c->coeff_top_y[depth+1];        
+    cur_cu->coeff_top_u[depth] = cur_cu->coeff_top_u[depth+1] | cu_a->coeff_top_u[depth+1] | cu_b->coeff_top_u[depth+1]
+                                  | cu_c->coeff_top_u[depth+1];
+    cur_cu->coeff_top_v[depth] = cur_cu->coeff_top_v[depth+1] | cu_a->coeff_top_v[depth+1] | cu_b->coeff_top_v[depth+1]
+                                  | cu_c->coeff_top_v[depth+1];
+
+
     return;
   }
   
@@ -1301,7 +1311,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     pixel *pred_v    = &encoder->in.cur_pic->pred_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
     int32_t pred_stride = encoder->in.width;
 
-    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
+    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH<<2];
     coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient *orig_coeff_y   = &encoder->in.cur_pic->coeff_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
@@ -1450,7 +1460,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     for (i = 0; i < width * width; i++) {
       if (coeff_y[i] != 0) {
         // Found one, we can break here
-        cur_cu->coeff_y = 1;
+        cur_cu->coeff_top_y[depth] = cur_cu->coeff_y = 1;
         break;
       }
     }
@@ -1509,7 +1519,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       for (i = 0; i < width *width >> 2; i++) {
         if (coeff_u[i] != 0) {
           // Found one, we can break here
-          cur_cu->coeff_u = 1;
+          cur_cu->coeff_top_u[depth] = cur_cu->coeff_u = 1;
           break;
         }
       }
@@ -1533,7 +1543,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
       for (i = 0; i < width *width >> 2; i++) {
         if (coeff_v[i] != 0) {
           // Found one, we can break here
-          cur_cu->coeff_v = 1;
+          cur_cu->coeff_top_v[depth] = cur_cu->coeff_v = 1;
           break;
         }
       }
@@ -1628,23 +1638,19 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
   if (encoder->in.video_format != FORMAT_400) {
     uint8_t offset = 1<<(MAX_DEPTH-1-depth);
 
-    //cu_info *cur_cu_idx_2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
-    //cu_info *cur_cu_idx_3 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
-    //cu_info *cur_cu_idx_4 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
-
     // Non-zero chroma U Tcoeffs
-    int8_t cb_flag = (!split) ? cur_cu->coeff_u : cur_cu->coeff_u;//(cur_cu->coeff_u | cur_cu_idx_2->coeff_u | cur_cu_idx_3->coeff_u  | cur_cu_idx_4->coeff_u);
+    int8_t cb_flag = (!split) ? cur_cu->coeff_u : cur_cu->coeff_top_u[depth];
     cabac.ctx = &g_qt_cbf_model_chroma[tr_depth];
 
-    if (tr_depth == 0 /*|| ti->cb_top[1]*/) {
+    if (tr_depth == 0  || cur_cu->coeff_top_u[depth]) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_u");
     }
 
     // Non-zero chroma V Tcoeffs
     // NOTE: Using the same ctx as before
-    cb_flag = (!split) ? cur_cu->coeff_v : cur_cu->coeff_v;//(cur_cu->coeff_v | cur_cu_idx_2->coeff_v | cur_cu_idx_3->coeff_v  | cur_cu_idx_4->coeff_v);
+    cb_flag = (!split) ? cur_cu->coeff_v : cur_cu->coeff_top_v[depth];
 
-    if (tr_depth == 0 /*|| ti->cb_top[2]*/) {
+    if (tr_depth == 0  || cur_cu->coeff_top_v[depth]) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_v");
     }
   }
@@ -1666,7 +1672,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 
 
   {
-    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
+    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH+1];
     coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
     int32_t coeff_stride = encoder->in.width;
@@ -1800,7 +1806,7 @@ void encode_coeff_nxn(encoder_control *encoder, coefficient *coeff, uint8_t widt
     if (coeff[i] != 0) {
       num_nonzero++;
     }
-  }  
+  }
 
   scan_cg = g_sig_last_scan[scan_mode][log2_block_size > 3 ? log2_block_size - 3 : 0];
 
@@ -1820,7 +1826,7 @@ void encode_coeff_nxn(encoder_control *encoder, coefficient *coeff, uint8_t widt
 
     if (coeff[pos_last] != 0) {
       sig_coeffgroup_flag[(num_blk_side * (POSY >> shift) + (POSX >> shift))] = 1;
-  }
+    }
 
     num_nonzero -= (coeff[pos_last] != 0) ? 1 : 0;
     #undef POSY
@@ -1885,7 +1891,7 @@ void encode_coeff_nxn(encoder_control *encoder, coefficient *coeff, uint8_t widt
                                              log2_block_size, width, type);
           cabac.ctx = &baseCtx[ctx_sig];
           CABAC_BIN(&cabac, sig, "significant_coeff_flag");
-    }
+        }
 
         if (sig) {
           abs_coeff[num_non_zero] = abs(coeff[blk_pos]);
@@ -1894,14 +1900,14 @@ void encode_coeff_nxn(encoder_control *encoder, coefficient *coeff, uint8_t widt
 
           if (last_nz_pos_in_cg == -1) {
             last_nz_pos_in_cg = scan_pos_sig;
-        }
+          }
 
           first_nz_pos_in_cg  = scan_pos_sig;
-          }
         }
+      }
     } else {
       scan_pos_sig = sub_pos - 1;
-      }
+    }
 
     if (num_non_zero > 0) {
       int8_t sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >=
@@ -1912,7 +1918,7 @@ void encode_coeff_nxn(encoder_control *encoder, coefficient *coeff, uint8_t widt
 
       if (c1 == 0) {
         ctx_set++;
-    }
+      }
 
       c1 = 1;
 
diff --git a/src/picture.h b/src/picture.h
index ec892683..2ba93b2f 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -64,9 +64,9 @@ typedef struct
   int8_t coeff_u;    //!< \brief is there coded coeffs U
   int8_t coeff_v;    //!< \brief is there coded coeffs V
 
-  int8_t coeff_top_y;    //!< \brief is there coded coeffs Y in top level
-  int8_t coeff_top_u;    //!< \brief is there coded coeffs U in top level
-  int8_t coeff_top_v;    //!< \brief is there coded coeffs V in top level
+  int8_t coeff_top_y[MAX_DEPTH+1];  //!< \brief is there coded coeffs Y in top level
+  int8_t coeff_top_u[MAX_DEPTH+1];  //!< \brief is there coded coeffs U in top level
+  int8_t coeff_top_v[MAX_DEPTH+1];  //!< \brief is there coded coeffs V in top level
   cu_info_intra intra;
   cu_info_inter inter;
 } cu_info;

From 4cec2963ace35367463d305b4e937190b985a28b Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 22 Oct 2013 12:33:11 +0300
Subject: [PATCH 12/19] Fixed transform splitting to allow 64x64 inter blocks

---
 src/encoder.c | 19 ++++++++++++++-----
 src/global.h  |  2 +-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 97e90584..1a0a7d9a 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1623,7 +1623,7 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 {
   cu_info *cur_cu = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
   int8_t width = LCU_WIDTH>>depth;
-  int8_t split = 0;//(cur_cu->tr_depth > depth||!depth);
+  int8_t split = (cur_cu->tr_depth > depth||!depth);
   int32_t coeff_fourth = ((LCU_WIDTH>>(depth))*(LCU_WIDTH>>(depth)))+1;
   
   if (depth != 0 && depth != MAX_DEPTH + 1) {
@@ -1639,27 +1639,36 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     uint8_t offset = 1<<(MAX_DEPTH-1-depth);
 
     // Non-zero chroma U Tcoeffs
-    int8_t cb_flag = (!split) ? cur_cu->coeff_u : cur_cu->coeff_top_u[depth];
+    int8_t cb_flag = !split ? cur_cu->coeff_u : cur_cu->coeff_top_u[depth];
     cabac.ctx = &g_qt_cbf_model_chroma[tr_depth];
 
-    if (tr_depth == 0  || cur_cu->coeff_top_u[depth]) {
+    if (tr_depth == 0  || cur_cu->coeff_top_u[depth-1]) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_u");
     }
 
     // Non-zero chroma V Tcoeffs
     // NOTE: Using the same ctx as before
-    cb_flag = (!split) ? cur_cu->coeff_v : cur_cu->coeff_top_v[depth];
+    cb_flag = !split ? cur_cu->coeff_v : cur_cu->coeff_top_v[depth];
 
-    if (tr_depth == 0  || cur_cu->coeff_top_v[depth]) {
+    if (tr_depth == 0  || cur_cu->coeff_top_v[depth-1]) {
       CABAC_BIN(&cabac, cb_flag, "cbf_chroma_v");
     }
   }
   
   if (split) {
     uint8_t offset = 1<<(MAX_DEPTH-1-depth);
+    cu_info *cu_a =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    cu_info *cu_b =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+    cu_info *cu_c =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
     encode_transform_coeff(encoder, x_cu, y_cu, depth + 1, tr_depth + 1);
+    cu_a->coeff_top_y[depth] = cur_cu->coeff_top_y[depth]; cu_a->coeff_top_u[depth] = cur_cu->coeff_top_u[depth];
+    cu_a->coeff_top_v[depth] = cur_cu->coeff_top_v[depth];
     encode_transform_coeff(encoder, x_cu + offset, y_cu,  depth + 1, tr_depth + 1);
+    cu_b->coeff_top_y[depth] = cur_cu->coeff_top_y[depth]; cu_b->coeff_top_u[depth] = cur_cu->coeff_top_u[depth];
+    cu_b->coeff_top_v[depth] = cur_cu->coeff_top_v[depth];
     encode_transform_coeff(encoder, x_cu, y_cu + offset,  depth + 1, tr_depth + 1);
+    cu_c->coeff_top_y[depth] = cur_cu->coeff_top_y[depth]; cu_c->coeff_top_u[depth] = cur_cu->coeff_top_u[depth];
+    cu_c->coeff_top_v[depth] = cur_cu->coeff_top_v[depth];
     encode_transform_coeff(encoder, x_cu + offset, y_cu + offset,  depth + 1, tr_depth + 1);
     return;
   }
diff --git a/src/global.h b/src/global.h
index 3c37ebf7..ced9716d 100644
--- a/src/global.h
+++ b/src/global.h
@@ -45,7 +45,7 @@ typedef int16_t coefficient;
 #define LCU_WIDTH 64 /*!< Largest Coding Unit (IT'S 64x64, DO NOT TOUCH!) */
 
 #define MAX_INTER_SEARCH_DEPTH 3
-#define MIN_INTER_SEARCH_DEPTH 1
+#define MIN_INTER_SEARCH_DEPTH 0
 
 #define MAX_INTRA_SEARCH_DEPTH 3 /*!< Max search depth -> min block size (3 == 8x8) */
 #define MIN_INTRA_SEARCH_DEPTH 1 /*!< Min search depth -> max block size (0 == 64x64) */

From 0cce17453ca7f5e506f24fa812f026b66d9ef9a7 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 22 Oct 2013 13:04:58 +0300
Subject: [PATCH 13/19] Simplified chroma-coeff-coded-flag derivation on
 transform split

---
 src/encoder.c | 33 ++++++++++++++-------------------
 src/encoder.h |  2 +-
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 1a0a7d9a..d95b603e 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -1069,7 +1069,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     // Code (possible) coeffs to bitstream
      
     if(cur_cu->coeff_top_y[depth] | cur_cu->coeff_top_u[depth] | cur_cu->coeff_top_v[depth]) {
-      encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0);
+      encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0, 0, 0);
     }
 
 
@@ -1206,7 +1206,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     // Coeff
     // Transform tree
     encode_transform_tree(encoder, x_ctb, y_ctb, depth);
-    encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0);
+    encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0, 0, 0);
     // end Transform tree
     // end Coeff
 
@@ -1619,12 +1619,15 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
 }
 
 void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
-                            int8_t depth, int8_t tr_depth)
+                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v)
 {
   cu_info *cur_cu = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
   int8_t width = LCU_WIDTH>>depth;
   int8_t split = (cur_cu->tr_depth > depth||!depth);
   int32_t coeff_fourth = ((LCU_WIDTH>>(depth))*(LCU_WIDTH>>(depth)))+1;
+
+  int8_t cb_flag_u = !split ? cur_cu->coeff_u : cur_cu->coeff_top_u[depth];
+  int8_t cb_flag_v = !split ? cur_cu->coeff_v : cur_cu->coeff_top_v[depth];
   
   if (depth != 0 && depth != MAX_DEPTH + 1) {
     cabac.ctx = &g_trans_subdiv_model[5 - ((g_convert_to_bit[LCU_WIDTH] + 2) -
@@ -1639,19 +1642,17 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     uint8_t offset = 1<<(MAX_DEPTH-1-depth);
 
     // Non-zero chroma U Tcoeffs
-    int8_t cb_flag = !split ? cur_cu->coeff_u : cur_cu->coeff_top_u[depth];
     cabac.ctx = &g_qt_cbf_model_chroma[tr_depth];
 
-    if (tr_depth == 0  || cur_cu->coeff_top_u[depth-1]) {
-      CABAC_BIN(&cabac, cb_flag, "cbf_chroma_u");
+    if (tr_depth == 0  || parent_coeff_u) {
+      CABAC_BIN(&cabac, cb_flag_u, "cbf_chroma_u");
     }
 
     // Non-zero chroma V Tcoeffs
     // NOTE: Using the same ctx as before
-    cb_flag = !split ? cur_cu->coeff_v : cur_cu->coeff_top_v[depth];
 
-    if (tr_depth == 0  || cur_cu->coeff_top_v[depth-1]) {
-      CABAC_BIN(&cabac, cb_flag, "cbf_chroma_v");
+    if (tr_depth == 0  || parent_coeff_v) {
+      CABAC_BIN(&cabac, cb_flag_v, "cbf_chroma_v");
     }
   }
   
@@ -1660,16 +1661,10 @@ void encode_transform_coeff(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     cu_info *cu_a =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + y_cu * (encoder->in.width_in_lcu << MAX_DEPTH)];
     cu_info *cu_b =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
     cu_info *cu_c =  &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + offset + (y_cu + offset) * (encoder->in.width_in_lcu << MAX_DEPTH)];
-    encode_transform_coeff(encoder, x_cu, y_cu, depth + 1, tr_depth + 1);
-    cu_a->coeff_top_y[depth] = cur_cu->coeff_top_y[depth]; cu_a->coeff_top_u[depth] = cur_cu->coeff_top_u[depth];
-    cu_a->coeff_top_v[depth] = cur_cu->coeff_top_v[depth];
-    encode_transform_coeff(encoder, x_cu + offset, y_cu,  depth + 1, tr_depth + 1);
-    cu_b->coeff_top_y[depth] = cur_cu->coeff_top_y[depth]; cu_b->coeff_top_u[depth] = cur_cu->coeff_top_u[depth];
-    cu_b->coeff_top_v[depth] = cur_cu->coeff_top_v[depth];
-    encode_transform_coeff(encoder, x_cu, y_cu + offset,  depth + 1, tr_depth + 1);
-    cu_c->coeff_top_y[depth] = cur_cu->coeff_top_y[depth]; cu_c->coeff_top_u[depth] = cur_cu->coeff_top_u[depth];
-    cu_c->coeff_top_v[depth] = cur_cu->coeff_top_v[depth];
-    encode_transform_coeff(encoder, x_cu + offset, y_cu + offset,  depth + 1, tr_depth + 1);
+    encode_transform_coeff(encoder, x_cu, y_cu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(encoder, x_cu + offset, y_cu,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(encoder, x_cu, y_cu + offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(encoder, x_cu + offset, y_cu + offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
     return;
   }
 
diff --git a/src/encoder.h b/src/encoder.h
index f88046ad..b984f1f3 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -89,7 +89,7 @@ void encode_coeff_nxn(encoder_control *encoder, int16_t *coeff, uint8_t width,
 void encode_transform_tree(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
                            uint8_t depth);
 void encode_transform_coeff(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
-                            int8_t depth, int8_t tr_depth);
+                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
 
 extern int16_t g_lambda_cost[55];
 extern uint32_t* g_sig_last_scan[3][7];

From b20b583d9bedc516fb751cfc307de2393dee848f Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 22 Oct 2013 16:27:50 +0300
Subject: [PATCH 14/19] Moved all residual/coeff functionality to
 encode_block_residual()

---
 src/encoder.c | 252 +++++++++++++++++++++++++++++---------------------
 src/encoder.h |   2 +
 src/search.c  |   7 ++
 3 files changed, 157 insertions(+), 104 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index d95b603e..5c248de5 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -771,8 +771,7 @@ void encode_slice_header(encoder_control* encoder)
 void encode_slice_data(encoder_control* encoder)
 {
   uint16_t x_ctb, y_ctb;
-
-  scalinglist_process();
+  
   init_contexts(encoder,encoder->in.cur_pic->slicetype);
 
   // Loop through every LCU in the slice
@@ -1052,14 +1051,6 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     } // if !merge
 
 
-    // Inter reconstruction
-    inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
-                y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
-                encoder->in.cur_pic);
-    // Mark this block as "coded" (can be used for predictions..)
-    picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
-    encode_transform_tree(encoder,x_ctb, y_ctb, depth);
-
     // Only need to signal coded block flag if not skipped or merged
     // skip = no coded residual, merge = coded residual
     if (!cur_cu->merged) {
@@ -1081,35 +1072,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     int8_t mpm_preds = -1;
     int i;
     uint32_t flag;
-    pixel *base_y = &encoder->in.cur_pic->y_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))     + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
-    pixel *base_u = &encoder->in.cur_pic->u_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
-    pixel *base_v = &encoder->in.cur_pic->v_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
-    uint32_t width = LCU_WIDTH>>depth;
-
-    // INTRAPREDICTION VARIABLES
-    pixel pred_y[LCU_WIDTH * LCU_WIDTH];
-
-    pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))     + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
-    pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
-    pixel *recbase_v = &encoder->in.cur_pic->v_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
-
-    // SEARCH BEST INTRA MODE (AGAIN)
-    pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
-    pixel *rec_shift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
-    intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
-                                 (LCU_WIDTH >> (depth)) * 2 + 8, rec, 
-                                 (LCU_WIDTH >> (depth)) * 2 + 8, 0);
-    cur_cu->intra.mode = (int8_t)intra_prediction(encoder->in.cur_pic->y_data,
-                                                  encoder->in.width, 
-                                                  rec_shift, 
-                                                  (LCU_WIDTH >> (depth)) * 2 + 8,
-                                                  x_ctb * (LCU_WIDTH >> (MAX_DEPTH)), 
-                                                  y_ctb * (LCU_WIDTH >> (MAX_DEPTH)), 
-                                                  width, pred_y, width, 
-                                                  &cur_cu->intra.cost);
-    intra_pred_mode = cur_cu->intra.mode;
-    intra_set_block_mode(encoder->in.cur_pic, x_ctb, y_ctb, depth,
-                         intra_pred_mode);
+    uint32_t width = LCU_WIDTH>>depth;    
       
     #if ENABLE_PCM == 1
     // Code must start after variable initialization
@@ -1204,8 +1167,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     // END OF PREDINFO CODING
     
     // Coeff
-    // Transform tree
-    encode_transform_tree(encoder, x_ctb, y_ctb, depth);
+    // Transform tree    
     encode_transform_coeff(encoder, x_ctb, y_ctb, depth, 0, 0, 0);
     // end Transform tree
     // end Coeff
@@ -1311,7 +1273,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     pixel *pred_v    = &encoder->in.cur_pic->pred_v[x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
     int32_t pred_stride = encoder->in.width;
 
-    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH<<2];
+    coefficient coeff_y[LCU_WIDTH*LCU_WIDTH];
     coefficient coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
     coefficient *orig_coeff_y   = &encoder->in.cur_pic->coeff_y[x_cu * (LCU_WIDTH >> (MAX_DEPTH))     + (y_cu * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
@@ -1324,10 +1286,7 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
     int16_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2];
 
     // INTRA PREDICTION
-    // TODO: split to a function!
-    pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
-    pixel *rec_shift  = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
-    pixel *rec_shift_u = &rec[(LCU_WIDTH >> (depth + 1)) * 2 + 8 + 1];
+
 
     uint32_t ac_sum = 0;
     uint32_t ctx_idx;
@@ -1364,71 +1323,25 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu,int32_t y_cu,
         // TODO: support NxN
         dir_mode = cur_cu->intra.mode;
       }
-
       if (ctx_idx > 4 && ctx_idx < 7) { // if multiple scans supported for transform size
         scan_idx_chroma = abs((int32_t) dir_mode - 26) < 5 ? 1 : (abs((int32_t)dir_mode - 10) < 5 ? 2 : 0);
       }
+    } 
 
-      // Build reconstructed block to use in prediction with extrapolated borders
-      intra_build_reference_border(encoder->in.cur_pic, x_cu, y_cu,
-                                   (LCU_WIDTH >> (depth)) * 2 + 8, rec, (LCU_WIDTH >> (depth)) * 2 + 8, 0);
-      intra_recon(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8,
-                  x_cu * (LCU_WIDTH >> (MAX_DEPTH)), y_cu * (LCU_WIDTH >> (MAX_DEPTH)),
-                  width, pred_y, pred_stride, cur_cu->intra.mode, 0);
 
-      // Filter DC-prediction
-      if (cur_cu->intra.mode == 1 && width < 32) {
-        intra_dc_pred_filtering(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8, pred_y,
-                                pred_stride, LCU_WIDTH >> depth, LCU_WIDTH >> depth);
-      }
-      
-      // TODO : chroma intra prediction
-      if (cur_cu->intra.mode_chroma != 36
-          && cur_cu->intra.mode_chroma == cur_cu->intra.mode) {
-          cur_cu->intra.mode_chroma = 36;
-      }
-    
-      intra_build_reference_border(encoder->in.cur_pic, x_cu, y_cu,
-                                   (LCU_WIDTH >> (depth + 1)) * 2 + 8, rec,
-                                   (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                                   1);
-      intra_recon(rec_shift_u, 
-                  (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                  x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
-                  y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
-                  width >> 1,
-                  pred_u,
-                  pred_stride >> 1,
-                  cur_cu->intra.mode_chroma != 36 ? cur_cu->intra.mode_chroma : cur_cu->intra.mode,
-                  1);
-      intra_build_reference_border(encoder->in.cur_pic, x_cu, y_cu,
-                                   (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                                   rec, (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                                   2);
-      intra_recon(rec_shift_u, (LCU_WIDTH >> (depth + 1)) * 2 + 8,
-                  x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
-                  y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)),
-                  width >> 1,
-                  pred_v,
-                  pred_stride >> 1,
-                  cur_cu->intra.mode_chroma != 36 ? cur_cu->intra.mode_chroma : cur_cu->intra.mode,
-                  1);
-
-    // This affects reconstruction, do after that
-      picture_set_block_coded(encoder->in.cur_pic, x_cu, y_cu, depth, 1);
-    } else  { // Inter mode
-      for(y = 0; y < LCU_WIDTH>>depth; y++) {
-        for(x = 0; x < LCU_WIDTH>>depth; x++) {
-          pred_y[x+y*pred_stride]=recbase_y[x+y*base_stride];
-        }
-      }
-      for(y = 0; y < LCU_WIDTH>>(depth+1); y++) {
-        for(x = 0; x < LCU_WIDTH>>(depth+1); x++) {
-          pred_u[x+y*(pred_stride>>1)]=recbase_u[x+y*(base_stride>>1)];
-          pred_v[x+y*(pred_stride>>1)]=recbase_v[x+y*(base_stride>>1)];
-        }
+    // Copy Luma and Chroma to the pred-block
+    for(y = 0; y < LCU_WIDTH>>depth; y++) {
+      for(x = 0; x < LCU_WIDTH>>depth; x++) {
+        pred_y[x+y*pred_stride]=recbase_y[x+y*recbase_stride];
       }
     }
+    for(y = 0; y < LCU_WIDTH>>(depth+1); y++) {
+      for(x = 0; x < LCU_WIDTH>>(depth+1); x++) {
+        pred_u[x+y*(pred_stride>>1)]=recbase_u[x+y*(recbase_stride>>1)];
+        pred_v[x+y*(pred_stride>>1)]=recbase_v[x+y*(recbase_stride>>1)];
+      }
+    }
+    
     // INTRA PREDICTION ENDS HERE
 
     // Get residual by subtracting prediction
@@ -2060,3 +1973,134 @@ void encode_last_significant_xy(encoder_control *encoder,
 
   // end LastSignificantXY
 }
+
+
+/**
+ * \brief Reconstruct the inter/intra prediction of a block (recursing into split CUs) and produce its coded residual
+ */
+void encode_block_residual(encoder_control *encoder, 
+                           uint16_t x_ctb, uint16_t y_ctb, uint8_t depth)
+{
+  cu_info *cur_cu = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_ctb + y_ctb * (encoder->in.width_in_lcu << MAX_DEPTH)];
+  uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
+  uint8_t split_model = 0;
+
+  // Check for picture border: does this block extend past the input width/height?
+  uint8_t border_x = ((encoder->in.width) < (x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
+  uint8_t border_y = ((encoder->in.height) < (y_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
+  uint8_t border_split_x = ((encoder->in.width)  < ((x_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
+  uint8_t border_split_y = ((encoder->in.height) < ((y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
+  uint8_t border = border_x | border_y; /*!< are we in any border CU */
+
+  scalinglist_process();
+
+  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
+  if (depth != MAX_DEPTH) {
+    if (split_flag || border) {
+      // Split blocks and remember to change x and y block positions
+      uint8_t change = 1<<(MAX_DEPTH-1-depth);
+      encode_block_residual(encoder, x_ctb, y_ctb, depth + 1);
+
+      if (!border_x || border_split_x) { 
+        encode_block_residual(encoder, x_ctb + change, y_ctb, depth + 1);
+      }
+      if (!border_y || border_split_y) {
+        encode_block_residual(encoder, x_ctb, y_ctb + change, depth + 1);
+      }
+      if (!border || (border_split_x && border_split_y)) {
+        encode_block_residual(encoder, x_ctb + change, y_ctb + change, depth + 1);
+      }
+      return;
+    }
+  }
+
+  if (cur_cu->type == CU_INTRA) {
+    uint32_t width = LCU_WIDTH>>depth;
+
+    // INTRAPREDICTION VARIABLES
+    pixel pred_y[LCU_WIDTH * LCU_WIDTH];
+
+    pixel *recbase_y = &encoder->in.cur_pic->y_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))     + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH)))     * encoder->in.width];
+    pixel *recbase_u = &encoder->in.cur_pic->u_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    pixel *recbase_v = &encoder->in.cur_pic->v_recdata[x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * (encoder->in.width >> 1)];
+    int32_t rec_stride = encoder->in.width;
+
+    // SEARCH BEST INTRA MODE (AGAIN)
+    pixel rec[(LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)];
+    pixel *rec_shift  = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
+    pixel *rec_shift_u = &rec[(LCU_WIDTH >> (depth + 1)) * 2 + 8 + 1];
+
+    cur_cu->intra.mode_chroma = 36;
+    
+    intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
+                                 (LCU_WIDTH >> (depth)) * 2 + 8, rec,
+                                 (LCU_WIDTH >> (depth)) * 2 + 8, 0);
+    cur_cu->intra.mode = (int8_t)intra_prediction(encoder->in.cur_pic->y_data,
+                                                  encoder->in.width,
+                                                  rec_shift,
+                                                  (LCU_WIDTH >> (depth)) * 2 + 8,
+                                                  x_ctb * (LCU_WIDTH >> (MAX_DEPTH)),
+                                                  y_ctb * (LCU_WIDTH >> (MAX_DEPTH)),
+                                                  width, pred_y, width,
+                                                  &cur_cu->intra.cost);
+    intra_set_block_mode(encoder->in.cur_pic, x_ctb, y_ctb, depth,
+                         cur_cu->intra.mode);    
+    
+    // Build reconstructed block to use in prediction with extrapolated borders
+    intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
+                                  (LCU_WIDTH >> (depth)) * 2 + 8, rec, (LCU_WIDTH >> (depth)) * 2 + 8, 0);
+    intra_recon(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8,
+                x_ctb * (LCU_WIDTH >> (MAX_DEPTH)), y_ctb * (LCU_WIDTH >> (MAX_DEPTH)),
+                width, recbase_y, rec_stride, cur_cu->intra.mode, 0);
+
+    // Filter DC-prediction
+    if (cur_cu->intra.mode == 1 && width < 32) {
+      intra_dc_pred_filtering(rec_shift, (LCU_WIDTH >> (depth)) * 2 + 8, recbase_y,
+                              rec_stride, LCU_WIDTH >> depth, LCU_WIDTH >> depth);
+    }
+    
+    // TODO : chroma intra prediction
+    if (cur_cu->intra.mode_chroma != 36
+        && cur_cu->intra.mode_chroma == cur_cu->intra.mode) {
+        cur_cu->intra.mode_chroma = 36;
+    }
+    
+    intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
+                                  (LCU_WIDTH >> (depth + 1)) * 2 + 8, rec,
+                                  (LCU_WIDTH >> (depth + 1)) * 2 + 8,
+                                  1);
+                                  
+    intra_recon(rec_shift_u, 
+                (LCU_WIDTH >> (depth + 1)) * 2 + 8,
+                x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                width >> 1,
+                recbase_u,
+                rec_stride >> 1,
+                cur_cu->intra.mode_chroma != 36 ? cur_cu->intra.mode_chroma : cur_cu->intra.mode,
+                1);
+    intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
+                                  (LCU_WIDTH >> (depth + 1)) * 2 + 8,
+                                  rec, (LCU_WIDTH >> (depth + 1)) * 2 + 8,
+                                  2);
+    intra_recon(rec_shift_u, (LCU_WIDTH >> (depth + 1)) * 2 + 8,
+                x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)),
+                width >> 1,
+                recbase_v,
+                rec_stride >> 1,
+                cur_cu->intra.mode_chroma != 36 ? cur_cu->intra.mode_chroma : cur_cu->intra.mode,
+                1);
+
+  } else {
+    // Inter reconstruction
+    inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
+                y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
+                encoder->in.cur_pic);    
+  }
+
+  // Mark this block as "coded" (can be used for predictions..)
+  picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);    
+  encode_transform_tree(encoder,x_ctb, y_ctb, depth);
+
+}
diff --git a/src/encoder.h b/src/encoder.h
index b984f1f3..a44d35db 100644
--- a/src/encoder.h
+++ b/src/encoder.h
@@ -90,6 +90,8 @@ void encode_transform_tree(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
                            uint8_t depth);
 void encode_transform_coeff(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
                             int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
+void encode_block_residual(encoder_control *encoder, 
+                           uint16_t x_ctb, uint16_t y_ctb, uint8_t depth);
 
 extern int16_t g_lambda_cost[55];
 extern uint32_t* g_sig_last_scan[3][7];
diff --git a/src/search.c b/src/search.c
index e9aaac1f..2712b247 100644
--- a/src/search.c
+++ b/src/search.c
@@ -413,6 +413,8 @@ uint32_t search_best_mode(encoder_control *encoder,
   return best_cost;
 }
 
+
+
 /**
  * \brief
  */
@@ -441,9 +443,14 @@ void search_slice_data(encoder_control *encoder)
       if (RENDER_CU) {
         render_cu_file(encoder, encoder->in.cur_pic, depth, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, fp2);
       }
+
+      encode_block_residual(encoder, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, depth);
+
     }
   }
 
+
+
   if (RENDER_CU && fp) {
     close_cu_file(fp);
     fp = 0;

From bb9d8ee9dd0ac22509608155b89d660ab0d9beda Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 22 Oct 2013 16:53:18 +0300
Subject: [PATCH 15/19] Fixed motion vector difference calculation

---
 src/encoder.c | 53 ++++++++++++++++++++++++++-------------------------
 src/picture.h |  2 ++
 2 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 5c248de5..73081690 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -932,8 +932,6 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
       }
     } else {
       uint32_t ref_list_idx;
-      int16_t mv_cand[2][2];
-
       /*
       // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
       if(encoder->in.cur_pic->slicetype == SLICE_B)
@@ -982,30 +980,11 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
                   if (symbol == 0) break;
                 }
               }
-            }
-
-            // Get MV candidates
-            inter_get_mv_cand(encoder, x_ctb, y_ctb, depth, mv_cand);
-
-            // Select better candidate
-            cur_cu->inter.mv_ref = 0; // Default to candidate 0
-
-            // Only check when candidates are different
-            if (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1]) {
-              uint16_t cand_1_diff = abs(cur_cu->inter.mv[0] - mv_cand[0][0]) + abs(
-                                       cur_cu->inter.mv[1] - mv_cand[0][1]);
-              uint16_t cand_2_diff = abs(cur_cu->inter.mv[0] - mv_cand[1][0]) + abs(
-                                       cur_cu->inter.mv[1] - mv_cand[1][1]);
-
-              // Select candidate 1 if it's closer
-              if (cand_2_diff < cand_1_diff) {
-                cur_cu->inter.mv_ref = 1;
-              }
-            }
+            }            
 
             if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ encoder->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
-              const int32_t mvd_hor = cur_cu->inter.mv[0] - mv_cand[cur_cu->inter.mv_ref][0];
-              const int32_t mvd_ver = cur_cu->inter.mv[1] - mv_cand[cur_cu->inter.mv_ref][1];
+              const int32_t mvd_hor = cur_cu->inter.mvd[0];
+              const int32_t mvd_ver = cur_cu->inter.mvd[1];
               const int8_t hor_abs_gr0 = mvd_hor != 0;
               const int8_t ver_abs_gr0 = mvd_ver != 0;
               const uint32_t mvd_hor_abs = abs(mvd_hor);
@@ -2030,7 +2009,7 @@ void encode_block_residual(encoder_control *encoder,
     pixel *rec_shift  = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
     pixel *rec_shift_u = &rec[(LCU_WIDTH >> (depth + 1)) * 2 + 8 + 1];
 
-    cur_cu->intra.mode_chroma = 36;
+    cur_cu->intra.mode_chroma = 36; // TODO: Chroma intra prediction
     
     intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
                                  (LCU_WIDTH >> (depth)) * 2 + 8, rec,
@@ -2044,7 +2023,7 @@ void encode_block_residual(encoder_control *encoder,
                                                   width, pred_y, width,
                                                   &cur_cu->intra.cost);
     intra_set_block_mode(encoder->in.cur_pic, x_ctb, y_ctb, depth,
-                         cur_cu->intra.mode);    
+                         cur_cu->intra.mode);
     
     // Build reconstructed block to use in prediction with extrapolated borders
     intra_build_reference_border(encoder->in.cur_pic, x_ctb, y_ctb,
@@ -2093,6 +2072,28 @@ void encode_block_residual(encoder_control *encoder,
                 1);
 
   } else {
+    int16_t mv_cand[2][2];
+    // Get MV candidates
+    inter_get_mv_cand(encoder, x_ctb, y_ctb, depth, mv_cand);
+
+    // Select better candidate
+    cur_cu->inter.mv_ref = 0; // Default to candidate 0
+
+    // Only check when candidates are different
+    if (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1]) {
+      uint16_t cand_1_diff = abs(cur_cu->inter.mv[0] - mv_cand[0][0]) + abs(
+                                cur_cu->inter.mv[1] - mv_cand[0][1]);
+      uint16_t cand_2_diff = abs(cur_cu->inter.mv[0] - mv_cand[1][0]) + abs(
+                                cur_cu->inter.mv[1] - mv_cand[1][1]);
+
+      // Select candidate 1 if it's closer
+      if (cand_2_diff < cand_1_diff) {
+        cur_cu->inter.mv_ref = 1;
+      }
+    }
+    cur_cu->inter.mvd[0] = cur_cu->inter.mv[0] - mv_cand[cur_cu->inter.mv_ref][0];
+    cur_cu->inter.mvd[1] = cur_cu->inter.mv[1] - mv_cand[cur_cu->inter.mv_ref][1];
+
     // Inter reconstruction
     inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
                 y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
diff --git a/src/picture.h b/src/picture.h
index 2ba93b2f..87065ab6 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -43,7 +43,9 @@ typedef struct
 {
   int8_t mode;
   uint32_t cost;
+
   int16_t mv[2];
+  int16_t mvd[2];
   uint8_t mv_ref; // \brief Index of the encoder_control.ref array.
   uint8_t mv_dir; // \brief Probably describes if mv_ref is forward, backward or both. Might not be needed?
 } cu_info_inter;

From 8883fb27aa4678c888c8d2d447fecdb6812c8c76 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Tue, 22 Oct 2013 17:40:55 +0300
Subject: [PATCH 16/19] Implemented skip/merge mode checking, disabled for now
 because it's not working

---
 src/encoder.c | 86 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 36 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 73081690..e310e3d8 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -844,40 +844,57 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     }
   }
   
-  // Encode skip flag
+
+
+    // Encode skip flag
   if (encoder->in.cur_pic->slicetype != SLICE_I) {
-    int8_t ctx_skip = 0;
-    // uiCtxSkip = aboveskipped + leftskipped;
-    cabac.ctx = &g_cu_skip_flag_model[ctx_skip];
-    CABAC_BIN(&cabac, (cur_cu->type == CU_SKIP) ? 1 : 0, "SkipFlag");
-  }
+    int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
+    int ui;
+    int16_t unary_idx = 0; 
+    int8_t skipflag = 0;
+    int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
+    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);   
 
-  // IF SKIP
-  if (cur_cu->type == CU_SKIP) {
-    // Encode merge index
-    //TODO: calculate/fetch merge candidates
-    int16_t unary_idx = 0; //pcCU->getMergeIndex( uiAbsPartIdx );
-    int16_t num_cand = 0; //pcCU->getSlice()->getMaxNumMergeCand();
-    int32_t ui;
-
-    if (num_cand > 1) {
-      for (ui = 0; ui < num_cand - 1; ui++) {
-        int32_t symbol = (ui == unary_idx) ? 0 : 1;
-
-        if (ui == 0) {
-          cabac.ctx = &g_cu_merge_idx_ext_model;
-          CABAC_BIN(&cabac, symbol, "MergeIndex");
-        } else {
-          CABAC_BIN_EP(&cabac,symbol,"MergeIndex");
-        }
-
-        if (symbol == 0) {
+    if (!cur_cu->coeff_top_y[depth] && !cur_cu->coeff_top_u[depth] && !cur_cu->coeff_top_v[depth]) {
+      // Encode merge index       
+      for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
+        if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
+           merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
+          //cur_cu->skipped = 1;
           break;
         }
       }
     }
+    // Get left and top skipped flags and if they are present and true, increase model number
+    if (x_ctb > 0 && (&encoder->in.cur_pic->cu_array[MAX_DEPTH][x_ctb - 1 + y_ctb * (encoder->in.width_in_lcu << MAX_DEPTH)])->skipped) {
+      ctx_skip++;
+    }
 
-    return;
+    if (y_ctb > 0 && (&encoder->in.cur_pic->cu_array[MAX_DEPTH][x_ctb + (y_ctb - 1) * (encoder->in.width_in_lcu << MAX_DEPTH)])->skipped) {
+      ctx_skip++;
+    }
+
+    cabac.ctx = &g_cu_skip_flag_model[ctx_skip];
+    CABAC_BIN(&cabac, cur_cu->skipped, "SkipFlag");
+  
+    // IF SKIP  
+    if (cur_cu->skipped) {
+      if (num_cand > 1) {
+        for (ui = 0; ui < num_cand - 1; ui++) {
+          int32_t symbol = (ui != unary_idx);
+          if (ui == 0) {
+            cabac.ctx = &g_cu_merge_idx_ext_model;
+            CABAC_BIN(&cabac, symbol, "MergeIndex");
+          } else {
+            CABAC_BIN_EP(&cabac,symbol,"MergeIndex");
+          }
+          if (symbol == 0) {
+            break;
+          }
+        }
+      }
+      return;
+    }
   }
 
   // ENDIF SKIP
@@ -899,34 +916,31 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
   //end partsize
   if (cur_cu->type == CU_INTER) {
     // FOR each part
-    // Mergeflag
-    uint8_t merge_flag = 0;
+    // Mergeflag    
     int16_t unary_idx = 0;
     int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
     int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);    
     for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
       if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
          merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
-        //merge_flag = 1;
+        //cur_cu->merged = 1;
         break;
       }
-    }
+    }    
     cabac.ctx = &g_cu_merge_flag_ext_model;
-    CABAC_BIN(&cabac, merge_flag, "MergeFlag");
-
-    if (merge_flag) { //merge
+    CABAC_BIN(&cabac, cur_cu->merged, "MergeFlag");
+    num_cand = MRG_MAX_NUM_CANDS;
+    if (cur_cu->merged) { //merge
       if (num_cand > 1) {
         int32_t ui;
         for (ui = 0; ui < num_cand - 1; ui++) {
           int32_t symbol = (ui != unary_idx);
-
           if (ui == 0) {
                 cabac.ctx = &g_cu_merge_idx_ext_model;
                 CABAC_BIN(&cabac, symbol, "MergeIndex");
           } else {
                 CABAC_BIN_EP(&cabac,symbol,"MergeIndex");
           }
-
           if (symbol == 0) break;
         }
       }

From e1f0274b51822bba44fd6bcab8c31ba5be542848 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 23 Oct 2013 15:14:26 +0300
Subject: [PATCH 17/19] Merge mode working on blocks > 8x8

---
 src/encoder.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 src/inter.c   |  4 ++--
 src/picture.c | 23 +++++++++++++++++++++++
 src/picture.h |  4 +++-
 4 files changed, 64 insertions(+), 18 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index e310e3d8..7999d808 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -860,12 +860,12 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
       for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
         if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
            merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
-          //cur_cu->skipped = 1;
+          //picture_set_block_skipped(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
           break;
         }
       }
     }
-    // Get left and top skipped flags and if they are present and true, increase model number
+    // Get left and top skipped flags and if they are present and true, increase context number
     if (x_ctb > 0 && (&encoder->in.cur_pic->cu_array[MAX_DEPTH][x_ctb - 1 + y_ctb * (encoder->in.width_in_lcu << MAX_DEPTH)])->skipped) {
       ctx_skip++;
     }
@@ -917,16 +917,20 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
   if (cur_cu->type == CU_INTER) {
     // FOR each part
     // Mergeflag    
-    int16_t unary_idx = 0;
+    int16_t num_cand = 0;
+    /*
     int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
-    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);    
-    for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
-      if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
-         merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
-        //cur_cu->merged = 1;
-        break;
+    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);
+    if(cur_cu->coeff_top_y[depth] | cur_cu->coeff_top_u[depth] | cur_cu->coeff_top_v[depth]) {
+      for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
+        if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
+           merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
+          cur_cu->merged = 1;
+          break;
+        }
       }
-    }    
+    }
+    */
     cabac.ctx = &g_cu_merge_flag_ext_model;
     CABAC_BIN(&cabac, cur_cu->merged, "MergeFlag");
     num_cand = MRG_MAX_NUM_CANDS;
@@ -934,12 +938,12 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
       if (num_cand > 1) {
         int32_t ui;
         for (ui = 0; ui < num_cand - 1; ui++) {
-          int32_t symbol = (ui != unary_idx);
+          int32_t symbol = (ui != cur_cu->merge_idx);
           if (ui == 0) {
-                cabac.ctx = &g_cu_merge_idx_ext_model;
-                CABAC_BIN(&cabac, symbol, "MergeIndex");
+            cabac.ctx = &g_cu_merge_idx_ext_model;
+            CABAC_BIN(&cabac, symbol, "MergeIndex");
           } else {
-                CABAC_BIN_EP(&cabac,symbol,"MergeIndex");
+            CABAC_BIN_EP(&cabac,symbol,"MergeIndex");
           }
           if (symbol == 0) break;
         }
@@ -2087,6 +2091,18 @@ void encode_block_residual(encoder_control *encoder,
 
   } else {
     int16_t mv_cand[2][2];
+    
+    int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
+    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);    
+    for(cur_cu->merge_idx = 0; cur_cu->merge_idx < num_cand; cur_cu->merge_idx++) {
+      if(merge_cand[cur_cu->merge_idx][0] == cur_cu->inter.mv[0] &&
+          merge_cand[cur_cu->merge_idx][1] == cur_cu->inter.mv[1]) {
+        cur_cu->merged = 1;
+        break;
+      }
+    }
+    
+    
     // Get MV candidates
     inter_get_mv_cand(encoder, x_ctb, y_ctb, depth, mv_cand);
 
@@ -2111,11 +2127,16 @@ void encode_block_residual(encoder_control *encoder,
     // Inter reconstruction
     inter_recon(encoder->ref->pics[0], x_ctb * CU_MIN_SIZE_PIXELS,
                 y_ctb * CU_MIN_SIZE_PIXELS, LCU_WIDTH >> depth, cur_cu->inter.mv,
-                encoder->in.cur_pic);    
+                encoder->in.cur_pic);
   }
 
   // Mark this block as "coded" (can be used for predictions..)
   picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);    
   encode_transform_tree(encoder,x_ctb, y_ctb, depth);
 
+  if(cur_cu->merged &&!cur_cu->coeff_top_y[depth] && !cur_cu->coeff_top_u[depth] && !cur_cu->coeff_top_v[depth]) {
+    cur_cu->merged = 0;
+    //cur_cu->skipped = 1;
+  }
+
 }
diff --git a/src/inter.c b/src/inter.c
index 2e76b210..3f9846ed 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -412,9 +412,9 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
   }
 #endif
 
-  // Fill with (0,0)
+  // Fill with (0,0)  
+  //i = candidates;
   /*
-  i = candidates;
   while (candidates < MRG_MAX_NUM_CANDS) {
     mv_cand[candidates][0] = 0;
     mv_cand[candidates][1] = 0;
diff --git a/src/picture.c b/src/picture.c
index 3983d392..1cebe141 100644
--- a/src/picture.c
+++ b/src/picture.c
@@ -20,6 +20,29 @@
 #define PSNRMAX (255.0 * 255.0)
 
 
+/**
+ * \brief Set the skipped flag on every SCU covered by a block
+ * \param pic    picture to use
+ * \param x_scu  x SCU position (smallest CU)
+ * \param y_scu  y SCU position (smallest CU)
+ * \param depth  current CU depth
+ * \param skipped  value stored in the skipped flag of each covered SCU
+ */
+void picture_set_block_skipped(picture *pic, uint32_t x_scu, uint32_t y_scu,
+                                uint8_t depth, int8_t skipped)
+{
+  uint32_t x, y;
+  int width_in_scu = pic->width_in_lcu << MAX_DEPTH;
+  int block_scu_width = (LCU_WIDTH >> depth) / (LCU_WIDTH >> MAX_DEPTH);
+
+  for (y = y_scu; y < y_scu + block_scu_width; ++y) {
+    int cu_row = y * width_in_scu;
+    for (x = x_scu; x < x_scu + block_scu_width; ++x) {
+      pic->cu_array[MAX_DEPTH][cu_row + x].skipped = skipped;
+    }
+  }
+}
+
 /**
  * \brief Set block residual status
  * \param pic    picture to use
diff --git a/src/picture.h b/src/picture.h
index 87065ab6..662dbcaf 100644
--- a/src/picture.h
+++ b/src/picture.h
@@ -62,6 +62,7 @@ typedef struct
   int8_t coded;      //!< \brief flag to indicate this block is coded and reconstructed
   int8_t skipped;    //!< \brief flag to indicate this block is skipped
   int8_t merged;     //!< \brief flag to indicate this block is merged
+  int8_t merge_idx;  //!< \brief merge index
   int8_t coeff_y;    //!< \brief is there coded coeffs Y
   int8_t coeff_u;    //!< \brief is there coded coeffs U
   int8_t coeff_v;    //!< \brief is there coded coeffs V
@@ -127,7 +128,8 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
                                 uint8_t depth, int8_t residual);
 void picture_set_block_split(picture *pic, uint32_t x_scu, uint32_t y_scu,
                              uint8_t depth, int8_t split);
-
+void picture_set_block_skipped(picture *pic, uint32_t x_scu, uint32_t y_scu,
+                                uint8_t depth, int8_t skipped);
 picture_list * picture_list_init(int size);
 int picture_list_resize(picture_list *list, int size);
 int picture_list_destroy(picture_list *list);

From ded4c18bf6d4c788994af6fd014d9aea7bd51cae Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Wed, 23 Oct 2013 16:50:11 +0300
Subject: [PATCH 18/19] Fixed merge candidate duplicate removal and implemented
 skip mode selection

---
 src/encoder.c | 42 ++++++++++--------------------------------
 src/inter.c   | 32 ++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 46 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 7999d808..133d4ac3 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -850,21 +850,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
   if (encoder->in.cur_pic->slicetype != SLICE_I) {
     int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
     int ui;
-    int16_t unary_idx = 0; 
-    int8_t skipflag = 0;
-    int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
-    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);   
-
-    if (!cur_cu->coeff_top_y[depth] && !cur_cu->coeff_top_u[depth] && !cur_cu->coeff_top_v[depth]) {
-      // Encode merge index       
-      for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
-        if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
-           merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
-          //picture_set_block_skipped(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
-          break;
-        }
-      }
-    }
+    int16_t num_cand = MRG_MAX_NUM_CANDS;
     // Get left and top skipped flags and if they are present and true, increase context number
     if (x_ctb > 0 && (&encoder->in.cur_pic->cu_array[MAX_DEPTH][x_ctb - 1 + y_ctb * (encoder->in.width_in_lcu << MAX_DEPTH)])->skipped) {
       ctx_skip++;
@@ -881,7 +867,7 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     if (cur_cu->skipped) {
       if (num_cand > 1) {
         for (ui = 0; ui < num_cand - 1; ui++) {
-          int32_t symbol = (ui != unary_idx);
+          int32_t symbol = (ui != cur_cu->merge_idx);
           if (ui == 0) {
             cabac.ctx = &g_cu_merge_idx_ext_model;
             CABAC_BIN(&cabac, symbol, "MergeIndex");
@@ -918,19 +904,6 @@ void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
     // FOR each part
     // Mergeflag    
     int16_t num_cand = 0;
-    /*
-    int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
-    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);
-    if(cur_cu->coeff_top_y[depth] | cur_cu->coeff_top_u[depth] | cur_cu->coeff_top_v[depth]) {
-      for(unary_idx = 0; unary_idx < num_cand; unary_idx++) {
-        if(merge_cand[unary_idx][0] == cur_cu->inter.mv[0] &&
-           merge_cand[unary_idx][1] == cur_cu->inter.mv[1]) {
-          cur_cu->merged = 1;
-          break;
-        }
-      }
-    }
-    */
     cabac.ctx = &g_cu_merge_flag_ext_model;
     CABAC_BIN(&cabac, cur_cu->merged, "MergeFlag");
     num_cand = MRG_MAX_NUM_CANDS;
@@ -2092,8 +2065,11 @@ void encode_block_residual(encoder_control *encoder,
   } else {
     int16_t mv_cand[2][2];
     
+    // Search for merge mode candidate
     int16_t merge_cand[MRG_MAX_NUM_CANDS][2];
-    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);    
+    // Get list of candidates
+    int16_t num_cand = inter_get_merge_cand(encoder, x_ctb, y_ctb, depth, merge_cand);
+    // Check every candidate to find a match
     for(cur_cu->merge_idx = 0; cur_cu->merge_idx < num_cand; cur_cu->merge_idx++) {
       if(merge_cand[cur_cu->merge_idx][0] == cur_cu->inter.mv[0] &&
           merge_cand[cur_cu->merge_idx][1] == cur_cu->inter.mv[1]) {
@@ -2134,9 +2110,11 @@ void encode_block_residual(encoder_control *encoder,
   picture_set_block_coded(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);    
   encode_transform_tree(encoder,x_ctb, y_ctb, depth);
 
-  if(cur_cu->merged &&!cur_cu->coeff_top_y[depth] && !cur_cu->coeff_top_u[depth] && !cur_cu->coeff_top_v[depth]) {
+  // If merge is selected but there are no coefficients to code, signal the CU as skipped instead
+  if(cur_cu->merged && !cur_cu->coeff_top_y[depth] && !cur_cu->coeff_top_u[depth] && !cur_cu->coeff_top_v[depth]) {
     cur_cu->merged = 0;
-    //cur_cu->skipped = 1;
+    picture_set_block_skipped(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
+    cur_cu->skipped = 1;
   }
 
 }
diff --git a/src/inter.c b/src/inter.c
index 3f9846ed..d3a00a8c 100644
--- a/src/inter.c
+++ b/src/inter.c
@@ -359,9 +359,9 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
   b0 = b1 = b2 = a0 = a1 = NULL;
   inter_get_spatial_merge_candidates(encoder, x_cu, y_cu, depth, &b0, &b1, &b2, &a0, &a1);
 
-#define CHECK_DUPLICATE(X,Y) {duplicate = 0; for(i = 0; i < candidates; i++) { \
-                                               if(mv_cand[i][0] == (X) && mv_cand[i][1] == (Y)) { \
-                                               duplicate = 1; break; } }}
+#define CHECK_DUPLICATE(CU1,CU2) {duplicate = 0; if ((CU2) && (CU2)->type == CU_INTER && \
+                                                     (CU1)->inter.mv[0] == (CU2)->inter.mv[0] && \
+                                                     (CU1)->inter.mv[1] == (CU2)->inter.mv[1]) duplicate = 1; }
 
   if (a1 && a1->type == CU_INTER) {
       mv_cand[candidates][0] = a1->inter.mv[0];
@@ -370,7 +370,7 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
   }
 
   if (b1 && b1->type == CU_INTER) {
-    if(candidates) CHECK_DUPLICATE(b1->inter.mv[0],b1->inter.mv[1]);
+    if(candidates) CHECK_DUPLICATE(b1, a1);
     if(!duplicate) {
       mv_cand[candidates][0] = b1->inter.mv[0];
       mv_cand[candidates][1] = b1->inter.mv[1];
@@ -379,7 +379,7 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
   }
 
   if (b0 && b0->type == CU_INTER) {
-    if(candidates) CHECK_DUPLICATE(b0->inter.mv[0],b0->inter.mv[1]);
+    if(candidates) CHECK_DUPLICATE(b0,b1);
     if(!duplicate) {
       mv_cand[candidates][0] = b0->inter.mv[0];
       mv_cand[candidates][1] = b0->inter.mv[1];
@@ -388,7 +388,7 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
   }
 
   if (a0 && a0->type == CU_INTER) {
-    if(candidates) CHECK_DUPLICATE(a0->inter.mv[0],a0->inter.mv[1]);
+    if(candidates) CHECK_DUPLICATE(a0,a1);
     if(!duplicate) {
       mv_cand[candidates][0] = a0->inter.mv[0];
       mv_cand[candidates][1] = a0->inter.mv[1];
@@ -396,12 +396,17 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
     }
   }
 
-  if(b2 && b2->type == CU_INTER) {
-    if(candidates) CHECK_DUPLICATE(b2->inter.mv[0],b2->inter.mv[1]);
-    if(!duplicate) {
-      mv_cand[candidates][0] = b2->inter.mv[0];
-      mv_cand[candidates][1] = b2->inter.mv[1];
-      candidates++;
+  if (candidates != 4) {
+    if(b2 && b2->type == CU_INTER) {
+      CHECK_DUPLICATE(b2,a1);
+      if(!duplicate) {
+        CHECK_DUPLICATE(b2,b1);
+        if(!duplicate) {
+          mv_cand[candidates][0] = b2->inter.mv[0];
+          mv_cand[candidates][1] = b2->inter.mv[1];
+          candidates++;
+        }
+      }
     }
   }
 
@@ -412,8 +417,7 @@ uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_c
   }
 #endif
 
-  // Fill with (0,0)  
-  //i = candidates;
+  // Fill with (0,0)
   /*
   while (candidates < MRG_MAX_NUM_CANDS) {
     mv_cand[candidates][0] = 0;

From caa010a9729b8cb3e35c3f6287bb50c588c9b895 Mon Sep 17 00:00:00 2001
From: Marko Viitanen <fador@iki.fi>
Date: Mon, 28 Oct 2013 11:47:54 +0200
Subject: [PATCH 19/19] Changed scalinglist_process() to be done on frame level

---
 src/encoder.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index 133d4ac3..a9194a39 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -259,6 +259,7 @@ void init_encoder_input(encoder_input *input, FILE *inputfile,
 
 void encode_one_frame(encoder_control* encoder)
 {
+
   // output parameters before first frame
   if (encoder->frame == 0) {
     // Video Parameter Set (VPS)
@@ -289,8 +290,9 @@ void encode_one_frame(encoder_control* encoder)
     cabac_start(&cabac);
     encoder->in.cur_pic->slicetype = SLICE_I;
     encoder->in.cur_pic->type = NAL_IDR_W_RADL;
-    search_slice_data(encoder);
-
+    scalinglist_process();
+    search_slice_data(encoder);    
+    
     encode_slice_header(encoder);
     bitstream_align(encoder->stream);
     encode_slice_data(encoder);
@@ -304,8 +306,9 @@ void encode_one_frame(encoder_control* encoder)
     cabac_start(&cabac);
     encoder->in.cur_pic->slicetype = SLICE_P;
     encoder->in.cur_pic->type = NAL_TRAIL_R;
+    scalinglist_process();
     search_slice_data(encoder);
-
+    
     encode_slice_header(encoder);
     bitstream_align(encoder->stream);
     encode_slice_data(encoder);
@@ -1962,8 +1965,6 @@ void encode_block_residual(encoder_control *encoder,
   uint8_t border_split_y = ((encoder->in.height) < ((y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
   uint8_t border = border_x | border_y; /*!< are we in any border CU */
 
-  scalinglist_process();
-
   // When not in MAX_DEPTH, insert split flag and split the blocks if needed
   if (depth != MAX_DEPTH) {
     if (split_flag || border) {