Merge branch 'master' into sao

SAO needs to be coded before LCU data has been searched. Searching has already been moved to happen before encoding in the master branch. Conflicts: src/encoder.c src/picture.c src/picture.h
2024-11-27 19:24:06 +00:00 · 2013-10-31 12:41:39 +02:00 · 2013-10-31 12:41:39 +02:00 · b6c5c87fb7
parent 7bd0902727 caa010a972
commit b6c5c87fb7
12 changed files with 786 additions and 646 deletions
--- a/src/encmain.c
+++ b/src/encmain.c
@ -154,6 +154,16 @@ int main(int argc, char *argv[])

  init_encoder_input(&encoder->in, input, cfg->width, cfg->height);

+  // Init coeff data table
+  encoder->in.cur_pic->coeff_y = MALLOC(coefficient, cfg->width * cfg->height);
+  encoder->in.cur_pic->coeff_u = MALLOC(coefficient, (cfg->width * cfg->height) >> 2);
+  encoder->in.cur_pic->coeff_v = MALLOC(coefficient, (cfg->width * cfg->height) >> 2);
+
+  // Init predicted data table
+  encoder->in.cur_pic->pred_y = MALLOC(pixel, cfg->width * cfg->height);
+  encoder->in.cur_pic->pred_u = MALLOC(pixel, (cfg->width * cfg->height) >> 2);
+  encoder->in.cur_pic->pred_v = MALLOC(pixel, (cfg->width * cfg->height) >> 2);
+
  // Start coding cycle while data on input and not on the last frame
  while(!feof(input) && (!cfg->frames || encoder->frame < cfg->frames)) {
    int32_t diff;
@ -202,6 +212,15 @@ int main(int argc, char *argv[])
    // TODO: reuse memory from old reference
    encoder->in.cur_pic = picture_init(encoder->in.width, encoder->in.height, encoder->in.width_in_lcu, encoder->in.height_in_lcu);

+    // Copy pointer from the last cur_pic because we don't want to reallocate it
+    MOVE_POINTER(encoder->in.cur_pic->coeff_y,encoder->ref->pics[0]->coeff_y);
+    MOVE_POINTER(encoder->in.cur_pic->coeff_u,encoder->ref->pics[0]->coeff_u);
+    MOVE_POINTER(encoder->in.cur_pic->coeff_v,encoder->ref->pics[0]->coeff_v);
+    
+    MOVE_POINTER(encoder->in.cur_pic->pred_y,encoder->ref->pics[0]->pred_y);
+    MOVE_POINTER(encoder->in.cur_pic->pred_u,encoder->ref->pics[0]->pred_u);
+    MOVE_POINTER(encoder->in.cur_pic->pred_v,encoder->ref->pics[0]->pred_v);
+
    encoder->frame++;
  }
  // Coding finished
--- a/src/encoder.c
+++ b/src/encoder.c
--- a/src/encoder.h
+++ b/src/encoder.h
@ -67,39 +67,6 @@ typedef struct
  int8_t tc_offset_div2;   // \brief (deblocking)tc offset (div 2), range -6...6
 } encoder_control;

-typedef struct
-{
-  int8_t idx;
-  pixel *base;
-  pixel *base_u;
-  pixel *base_v;
-  
-  pixel *recbase;
-  pixel *recbase_u;
-  pixel *recbase_v;
-  
-  int16_t *pred;
-  int16_t *pred_u;
-  int16_t *pred_v;
-
-  int32_t base_stride;
-  int32_t recbase_stride;
-  int32_t pred_stride;
-  
-  // TODO: unify luma+chroma arrays
-  int16_t *coeff[3];
-  int8_t cb_top[3];
-  int8_t cb[4];
-  int8_t intra_pred_mode;
-  int8_t intra_pred_mode_chroma;
-  int32_t split[4];
-
-  int8_t block_type;
-
-  int32_t x_ctb,y_ctb;
-
-} transform_info;
-
 void init_tables(void);
 void init_encoder_control(encoder_control *control, bitstream *output);
 void init_encoder_input(encoder_input *input, FILE* inputfile,
@ -119,10 +86,12 @@ void encode_last_significant_xy(encoder_control *encoder, uint8_t lastpos_x,
                                uint8_t type, uint8_t scan);
 void encode_coeff_nxn(encoder_control *encoder, int16_t *coeff, uint8_t width,
                      uint8_t type, int8_t scan_mode);
-void encode_transform_tree(encoder_control *encoder, transform_info *ti,
+void encode_transform_tree(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
                           uint8_t depth);
-void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
-                            int8_t depth, int8_t tr_depth);
+void encode_transform_coeff(encoder_control *encoder, int32_t x_cu, int32_t y_cu,
+                            int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v);
+void encode_block_residual(encoder_control *encoder, 
+                           uint16_t x_ctb, uint16_t y_ctb, uint8_t depth);

 extern int16_t g_lambda_cost[55];
 extern uint32_t* g_sig_last_scan[3][7];
--- a/src/filter.c
+++ b/src/filter.c
@ -198,8 +198,8 @@ void filter_deblock_edge_luma(encoder_control *encoder,
        // Intra blocks have strength 2
        if(cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
          strength = 2;          
-          // Non-zero residual and transform boundary
-        } else if(cu_q->residual || cu_p->residual) {
+          // Non-zero residual/coeffs and transform boundary
+        } else if(cu_q->coeff_y || cu_p->coeff_y) {
          strength = 1;
          // Absolute motion vector diff between blocks >= 1 (Integer pixel)
        } else if((abs(cu_q->inter.mv[0] - cu_p->inter.mv[0]) >= 4) || (abs(cu_q->inter.mv[1] - cu_p->inter.mv[1]) >= 4)) {
--- a/src/global.h
+++ b/src/global.h
@ -123,5 +123,6 @@ typedef int16_t coefficient;
 #endif

 #define FREE_POINTER(pointer) { free(pointer); pointer = NULL; }
+#define MOVE_POINTER(dst_pointer,src_pointer) { dst_pointer = src_pointer; src_pointer = NULL; }

 #endif
--- a/src/inter.c
+++ b/src/inter.c
@ -227,18 +227,21 @@ void inter_recon(picture* ref,int32_t xpos, int32_t ypos,int32_t width, const in
 }

 /**
- * \brief Get MV prediction for current block
+ * \brief Get merge candidates for current block
 * \param encoder encoder control struct to use
 * \param x_cu block x position in SCU
 * \param y_cu block y position in SCU
 * \param depth current block depth
- * \param mv_pred[2][2] 2x motion vector prediction
+ * \param b0 candidate b0
+ * \param b1 candidate b1
+ * \param b2 candidate b2
+ * \param a0 candidate a0
+ * \param a1 candidate a1
 */
-void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2])
+void inter_get_spatial_merge_candidates(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, 
+                                        cu_info **b0, cu_info **b1,cu_info **b2,cu_info **a0,cu_info **a1)
 {
  uint8_t cur_block_in_scu = (LCU_WIDTH>>depth) / CU_MIN_SIZE_PIXELS; //!< the width of the current block on SCU
-  uint8_t candidates = 0;
-  
  /*
  Predictor block locations
  ____      _______
@ -248,37 +251,50 @@ void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int
   __|         |
  |A1|_________|
  |A0|
-  */
-  cu_info *b0, *b1, *b2, *a0, *a1;
-
-  b0 = b1 = b2 = a0 = a1 = NULL;
+  */ 

  // A0 and A1 availability testing
  if (x_cu != 0) {    
-    a1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-    if (!a1->coded) a1 = NULL;
+    *a1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+    if (!(*a1)->coded) *a1 = NULL;

    if (y_cu + cur_block_in_scu < encoder->in.height_in_lcu<<MAX_DEPTH) {
-      a0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-      if (!a0->coded) a0 = NULL;
+      *a0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu + cur_block_in_scu) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if (!(*a0)->coded) *a0 = NULL;
    }
  }

  // B0, B1 and B2 availability testing
  if (y_cu != 0) {
-
    if (x_cu + cur_block_in_scu < encoder->in.width_in_lcu<<MAX_DEPTH) {
-      b0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-      if (!b0->coded) b0 = NULL;
+      *b0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if (!(*b0)->coded) *b0 = NULL;
    }
-    b1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-    if (!b1->coded) b1 = NULL;
+    *b1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+    if (!(*b1)->coded) *b1 = NULL;

    if (x_cu != 0) {
-      b2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-      if(!b2->coded) b2 = NULL;
+      *b2 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if(!(*b2)->coded) *b2 = NULL;
    }
  }
+}
+
+/**
+ * \brief Get MV prediction for current block
+ * \param encoder encoder control struct to use
+ * \param x_cu block x position in SCU
+ * \param y_cu block y position in SCU
+ * \param depth current block depth
+ * \param mv_cand[2][2] 2x motion vector prediction
+ */
+void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2])
+{  
+  uint8_t candidates = 0;
+
+  cu_info *b0, *b1, *b2, *a0, *a1;
+  b0 = b1 = b2 = a0 = a1 = NULL;
+  inter_get_spatial_merge_candidates(encoder, x_cu, y_cu, depth, &b0, &b1, &b2, &a0, &a1);

  // Left predictors
  if (a0 && a0->type == CU_INTER) {
@ -312,15 +328,103 @@ void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int
  }

 #if ENABLE_TEMPORAL_MVP
-  if(candidates < 2) {
+  if(candidates < AMVP_MAX_NUM_CANDS) {
    //TODO: add temporal mv predictor
  }
 #endif

  // Fill with (0,0)
-  while (candidates < 2) {
+  while (candidates < AMVP_MAX_NUM_CANDS) {
    mv_cand[candidates][0] = 0;
    mv_cand[candidates][1] = 0;
    candidates++;
  }
 }
+
+/**
+ * \brief Get merge predictions for current block
+ * \param encoder encoder control struct to use
+ * \param x_cu block x position in SCU
+ * \param y_cu block y position in SCU
+ * \param depth current block depth
+ * \param mv_cand[MRG_MAX_NUM_CANDS][2] output array of merge candidate motion vectors; the return value is the number of entries filled
+ */
+uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][2])
+{  
+  uint8_t candidates = 0;
+  uint8_t i = 0;
+  int8_t duplicate = 0;
+
+  cu_info *b0, *b1, *b2, *a0, *a1;
+  b0 = b1 = b2 = a0 = a1 = NULL;
+  inter_get_spatial_merge_candidates(encoder, x_cu, y_cu, depth, &b0, &b1, &b2, &a0, &a1);
+
+#define CHECK_DUPLICATE(CU1,CU2) {duplicate = 0; if ((CU2) && (CU2)->type == CU_INTER && \
+                                                     (CU1)->inter.mv[0] == (CU2)->inter.mv[0] && \
+                                                     (CU1)->inter.mv[1] == (CU2)->inter.mv[1]) duplicate = 1; }
+
+  if (a1 && a1->type == CU_INTER) {
+      mv_cand[candidates][0] = a1->inter.mv[0];
+      mv_cand[candidates][1] = a1->inter.mv[1];
+      candidates++;
+  }
+
+  if (b1 && b1->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(b1, a1);
+    if(!duplicate) {
+      mv_cand[candidates][0] = b1->inter.mv[0];
+      mv_cand[candidates][1] = b1->inter.mv[1];
+      candidates++;
+    }
+  }
+
+  if (b0 && b0->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(b0,b1);
+    if(!duplicate) {
+      mv_cand[candidates][0] = b0->inter.mv[0];
+      mv_cand[candidates][1] = b0->inter.mv[1];
+      candidates++;
+    }
+  }
+
+  if (a0 && a0->type == CU_INTER) {
+    if(candidates) CHECK_DUPLICATE(a0,a1);
+    if(!duplicate) {
+      mv_cand[candidates][0] = a0->inter.mv[0];
+      mv_cand[candidates][1] = a0->inter.mv[1];
+      candidates++;
+    }
+  }
+
+  if (candidates != 4) {
+    if(b2 && b2->type == CU_INTER) {
+      CHECK_DUPLICATE(b2,a1);
+      if(!duplicate) {
+        CHECK_DUPLICATE(b2,b1);
+        if(!duplicate) {
+          mv_cand[candidates][0] = b2->inter.mv[0];
+          mv_cand[candidates][1] = b2->inter.mv[1];
+          candidates++;
+        }
+      }
+    }
+  }
+
+
+#if ENABLE_TEMPORAL_MVP
+  if(candidates < AMVP_MAX_NUM_CANDS) {
+    //TODO: add temporal mv predictor
+  }
+#endif
+
+  // Filling the remaining candidates with (0,0) is currently disabled
+  /*
+  while (candidates < MRG_MAX_NUM_CANDS) {
+    mv_cand[candidates][0] = 0;
+    mv_cand[candidates][1] = 0;
+    candidates++;
+  }
+  */
+  return candidates;
+}
+
--- a/src/inter.h
+++ b/src/inter.h
@ -21,6 +21,9 @@
 void inter_set_block(picture* pic,uint32_t x_cu, uint32_t y_cu, uint8_t depth, cu_info *cur_cu);
 void inter_recon(picture *ref,int32_t xpos, int32_t ypos,int32_t width, const int16_t mv[2], picture* dst);

+void inter_get_spatial_merge_candidates(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, 
+                                        cu_info **b0, cu_info **b1,cu_info **b2,cu_info **a0,cu_info **a1);
 void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2]);
+uint8_t inter_get_merge_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][2]);

 #endif
--- a/src/intra.c
+++ b/src/intra.c
@ -77,7 +77,7 @@ int8_t intra_get_block_mode(picture *pic, uint32_t x_cu, uint32_t y_cu, uint8_t
 * \param width block width
 * \returns DC prediction
 */
-int16_t intra_get_dc_pred(int16_t *pic, uint16_t picwidth, uint32_t xpos, uint32_t ypos, uint8_t width)
+int16_t intra_get_dc_pred(pixel *pic, uint16_t picwidth, uint32_t xpos, uint32_t ypos, uint8_t width)
 {
  int32_t i, sum = 0;

@ -155,11 +155,11 @@ int8_t intra_get_dir_luma_predictor(picture* pic, uint32_t x_cu, uint32_t y_cu,
 * \param preds output buffer for 3 predictions 
 * \returns (predictions are found)?1:0
 */
-void intra_filter(int16_t *ref, int32_t stride,int32_t width, int8_t mode)
+void intra_filter(pixel *ref, int32_t stride,int32_t width, int8_t mode)
 {
  #define FWIDTH (LCU_WIDTH*2+1)
-  int16_t filtered[FWIDTH * FWIDTH]; //!< temporary buffer for filtered samples
-  int16_t *filteredShift = &filtered[FWIDTH+1]; //!< pointer to temporary buffer with offset (1,1)
+  pixel filtered[FWIDTH * FWIDTH]; //!< temporary buffer for filtered samples
+  pixel *filteredShift = &filtered[FWIDTH+1]; //!< pointer to temporary buffer with offset (1,1)
  int x,y;

  if (!mode) {
@ -213,8 +213,8 @@ void intra_filter(int16_t *ref, int32_t stride,int32_t width, int8_t mode)

 This function derives the prediction samples for planar mode (intra coding).
 */
-int16_t intra_prediction(pixel *orig, int32_t origstride, int16_t *rec, int32_t recstride, uint32_t xpos,
-                         uint32_t ypos, uint32_t width, int16_t *dst, int32_t dststride, uint32_t *sad_out)
+int16_t intra_prediction(pixel *orig, int32_t origstride, pixel *rec, int32_t recstride, uint32_t xpos,
+                         uint32_t ypos, uint32_t width, pixel *dst, int32_t dststride, uint32_t *sad_out)
 {
  uint32_t best_sad = 0xffffffff;
  uint32_t sad = 0;
@ -225,11 +225,11 @@ int16_t intra_prediction(pixel *orig, int32_t origstride, int16_t *rec, int32_t

  // Temporary block arrays
  // TODO: alloc with alignment
-  int16_t pred[LCU_WIDTH * LCU_WIDTH + 1];  
-  int16_t orig_block[LCU_WIDTH * LCU_WIDTH + 1];  
-  int16_t rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
+  pixel pred[LCU_WIDTH * LCU_WIDTH + 1];  
+  pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1];  
+  pixel rec_filtered_temp[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8) + 1];
  
-  int16_t* rec_filtered = &rec_filtered_temp[recstride + 1]; //!< pointer to rec_filtered_temp with offset of (1,1)
+  pixel* rec_filtered = &rec_filtered_temp[recstride + 1]; //!< pointer to rec_filtered_temp with offset of (1,1)
  pixel *orig_shift = &orig[xpos + ypos*origstride];  //!< pointer to orig with offset of (1,1)
  int8_t filter = (width<32); // TODO: chroma support

@ -318,10 +318,10 @@ int16_t intra_prediction(pixel *orig, int32_t origstride, int16_t *rec, int32_t
 * \param chroma chroma-block flag

 */
-void intra_recon(int16_t* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,uint32_t width, int16_t* dst,int32_t dststride, int8_t mode, int8_t chroma)
+void intra_recon(pixel* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,uint32_t width, pixel* dst,int32_t dststride, int8_t mode, int8_t chroma)
 {
  int32_t x,y,i;
-  int16_t pred[LCU_WIDTH * LCU_WIDTH];
+  pixel pred[LCU_WIDTH * LCU_WIDTH];
  int8_t filter = !chroma&&(width<32);
  #define COPY_PRED_TO_DST() for(y = 0; y < (int32_t)width; y++)  { for(x = 0; x < (int32_t)width; x++) { dst[x+y*dststride] = pred[x+y*width]; } }

@ -362,12 +362,12 @@ void intra_recon(int16_t* rec,uint32_t recstride, uint32_t xpos, uint32_t ypos,u
 * \param chroma signaling if chroma is used, 0 = luma, 1 = U and 2 = V    
 *
 */
-void intra_build_reference_border(picture *pic, int32_t x_cu, int32_t y_cu,int16_t outwidth, int16_t *dst, int32_t dststride, int8_t chroma)
+void intra_build_reference_border(picture *pic, int32_t x_cu, int32_t y_cu,int16_t outwidth, pixel *dst, int32_t dststride, int8_t chroma)
 {
  int32_t left_column; //!< left column iterator
-  int16_t val;         //!< variable to store extrapolated value
+  pixel val;         //!< variable to store extrapolated value
  int32_t i;           //!< index iterator
-  int16_t dc_val       = 1<<(g_bitdepth-1); //!< default predictor value
+  pixel dc_val       = 1<<(g_bitdepth-1); //!< default predictor value
  int32_t top_row;     //!< top row iterator
  int32_t src_width    = (pic->width>>(chroma?1:0)); //!< source picture width
  int32_t src_height   = (pic->height>>(chroma?1:0));//!< source picture height
@ -443,7 +443,7 @@ const int32_t inv_ang_table[9] = {0, 4096, 1638, 910, 630, 482, 390, 315, 256};
 * \brief this functions constructs the angular intra prediction from border samples
 *
 */
-void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int32_t dst_stride, int32_t width,
+void intra_get_angular_pred(pixel* src, int32_t src_stride, pixel* dst, int32_t dst_stride, int32_t width,
                           int32_t height, int32_t dir_mode, int8_t left_avail,int8_t top_avail, int8_t filter)
 {
  int32_t k,l;
@ -460,10 +460,10 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3
  int32_t inv_angle       = inv_ang_table[abs_ang];

  // Do angular predictions
-  int16_t *ref_main;
-  int16_t *ref_side;
-  int16_t  ref_above[2 * LCU_WIDTH + 1];
-  int16_t  ref_left[2 * LCU_WIDTH + 1];
+  pixel *ref_main;
+  pixel *ref_side;
+  pixel  ref_above[2 * LCU_WIDTH + 1];
+  pixel  ref_left[2 * LCU_WIDTH + 1];

  abs_ang           = ang_table[abs_ang];
  intra_pred_angle  = sign_ang * abs_ang;
@ -522,7 +522,7 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3
        // Do linear filtering
        for (l = 0; l < blk_size; l++) {
          ref_main_index        = l + delta_int + 1;
-          dst[k * dst_stride + l] = (int16_t) ( (minus_delta_fract * ref_main[ref_main_index]
+          dst[k * dst_stride + l] = (pixel) ( (minus_delta_fract * ref_main[ref_main_index]
                                                 + delta_fract * ref_main[ref_main_index + 1] + 16) >> 5);
        }
      } else {
@ -536,7 +536,7 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3

  // Flip the block if this is the horizontal mode
  if (mode_hor) {
-    int16_t tmp;
+    pixel tmp;
    for (k=0;k<blk_size-1;k++) {
      for (l=k+1;l<blk_size;l++) {
        tmp                 = dst[k * dst_stride + l];
@ -551,7 +551,7 @@ void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* dst, int3



-void intra_dc_pred_filtering(int16_t *src, int32_t src_stride, int16_t *dst, int32_t dst_stride, int32_t width, int32_t height )
+void intra_dc_pred_filtering(pixel *src, int32_t src_stride, pixel *dst, int32_t dst_stride, int32_t width, int32_t height )
 {
  int32_t x, y, dst_stride2, src_stride2;

@ -580,9 +580,8 @@ void intra_dc_pred_filtering(int16_t *src, int32_t src_stride, int16_t *dst, int
 
  This function derives the prediction samples for planar mode (intra coding).
 */
-void intra_get_planar_pred(int16_t* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, int16_t* dst,int32_t dststride)
+void intra_get_planar_pred(pixel* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, pixel* dst,int32_t dststride)
 {
-  int16_t dc_val = 1<<(g_bitdepth-1);
  int32_t k, l, bottom_left, top_right;
  int32_t hor_pred;
  int32_t left_column[LCU_WIDTH+1], top_row[LCU_WIDTH+1], bottom_row[LCU_WIDTH+1], right_column[LCU_WIDTH+1];
--- a/src/intra.h
+++ b/src/intra.h
@ -21,19 +21,19 @@ void intra_set_block_mode(picture* pic,uint32_t x_ctb, uint32_t y_ctb, uint8_t d
 int8_t intra_get_block_mode(picture* pic, uint32_t x_ctb, uint32_t y_ctb, uint8_t depth);

 int8_t intra_get_dir_luma_predictor(picture* pic,uint32_t x_ctb, uint32_t y_ctb, uint8_t depth, int8_t* preds);
-void intra_dc_pred_filtering(int16_t* src, int32_t src_stride, int16_t* dst, int32_t dst_stride, int32_t width, int32_t height );
+void intra_dc_pred_filtering(pixel* src, int32_t src_stride, pixel* dst, int32_t dst_stride, int32_t width, int32_t height );

-void intra_build_reference_border(picture* pic, int32_t x_ctb, int32_t y_ctb, int16_t out_width, int16_t* dst, int32_t dst_stride, int8_t chroma);
-void intra_filter(int16_t* ref, int32_t stride, int32_t width, int8_t mode);
+void intra_build_reference_border(picture* pic, int32_t x_ctb, int32_t y_ctb, int16_t out_width, pixel* dst, int32_t dst_stride, int8_t chroma);
+void intra_filter(pixel* ref, int32_t stride, int32_t width, int8_t mode);

 /* Predictions */
-int16_t intra_prediction(pixel* orig, int32_t orig_stride, int16_t* rec, int32_t rec_stride,  uint32_t x_pos, uint32_t ypos, uint32_t width, int16_t* dst, int32_t dst_stride, uint32_t *sad);
+int16_t intra_prediction(pixel* orig, int32_t orig_stride, pixel* rec, int32_t rec_stride,  uint32_t x_pos, uint32_t ypos, uint32_t width, pixel* dst, int32_t dst_stride, uint32_t *sad);

-int16_t intra_get_dc_pred(int16_t* pic, uint16_t pic_width, uint32_t x_pos, uint32_t y_pos, uint8_t width);
-void intra_get_planar_pred(int16_t* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, int16_t* dst,int32_t dststride);
-void intra_get_angular_pred(int16_t* src, int32_t src_stride, int16_t* p_dst, int32_t dst_stride, int32_t width, int32_t height, int32_t dir_mode, int8_t left_avail,int8_t top_avail, int8_t filter);
+int16_t intra_get_dc_pred(pixel* pic, uint16_t pic_width, uint32_t x_pos, uint32_t y_pos, uint8_t width);
+void intra_get_planar_pred(pixel* src,int32_t srcstride, uint32_t xpos, uint32_t ypos,uint32_t width, pixel* dst,int32_t dststride);
+void intra_get_angular_pred(pixel* src, int32_t src_stride, pixel* p_dst, int32_t dst_stride, int32_t width, int32_t height, int32_t dir_mode, int8_t left_avail,int8_t top_avail, int8_t filter);

-void intra_recon(int16_t* rec, uint32_t rec_stride, uint32_t x_pos, uint32_t y_pos, uint32_t width, int16_t* dst, int32_t dst_stride, int8_t mode, int8_t chroma);
+void intra_recon(pixel* rec, uint32_t rec_stride, uint32_t x_pos, uint32_t y_pos, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma);


 #endif
--- a/src/picture.c
+++ b/src/picture.c
@ -21,15 +21,15 @@


 /**
- * \brief Set block residual status
+ * \brief Set block skipped
 * \param pic    picture to use
 * \param x_scu  x SCU position (smallest CU)
 * \param y_scu  y SCU position (smallest CU)
 * \param depth  current CU depth
- * \param residual  residual status
+ * \param skipped skipped flag
 */
-void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
-                                uint8_t depth, int8_t residual)
+void picture_set_block_skipped(picture *pic, uint32_t x_scu, uint32_t y_scu,
+                                uint8_t depth, int8_t skipped)
 {
  uint32_t x, y;
  int width_in_scu = pic->width_in_lcu << MAX_DEPTH;
@ -38,7 +38,30 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
  for (y = y_scu; y < y_scu + block_scu_width; ++y) {
    int cu_row = y * width_in_scu;
    for (x = x_scu; x < x_scu + block_scu_width; ++x) {
-      pic->cu_array[MAX_DEPTH][cu_row + x].residual = residual;
+      pic->cu_array[MAX_DEPTH][cu_row + x].skipped = skipped;
+    }
+  }
+}
+
+/**
+ * \brief Set block residual status
+ * \param pic    picture to use
+ * \param x_scu  x SCU position (smallest CU)
+ * \param y_scu  y SCU position (smallest CU)
+ * \param depth  current CU depth
+ * \param coeff_y  value stored in the coeff_y flag of every SCU covered by the block
+ */
+void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
+                                uint8_t depth, int8_t coeff_y)
+{
+  uint32_t x, y;
+  int width_in_scu = pic->width_in_lcu << MAX_DEPTH;
+  int block_scu_width = (LCU_WIDTH >> depth) / (LCU_WIDTH >> MAX_DEPTH);
+
+  for (y = y_scu; y < y_scu + block_scu_width; ++y) {
+    int cu_row = y * width_in_scu;
+    for (x = x_scu; x < x_scu + block_scu_width; ++x) {
+      pic->cu_array[MAX_DEPTH][cu_row + x].coeff_y = coeff_y;
    }
  }
 }
@ -275,6 +298,9 @@ picture *picture_init(int32_t width, int32_t height,
    memset(pic->cu_array[i], 0, sizeof(cu_info) * cu_array_size);
  }

+  pic->coeff_y = NULL; pic->coeff_u = NULL; pic->coeff_v = NULL;
+  pic->pred_y = NULL; pic->pred_u = NULL; pic->pred_v = NULL;
+
  pic->slice_sao_luma_flag = 1;
  pic->slice_sao_chroma_flag = 1;

@ -309,6 +335,14 @@ int picture_destroy(picture *pic)
  free(pic->cu_array);
  pic->cu_array = NULL;

+  FREE_POINTER(pic->coeff_y);
+  FREE_POINTER(pic->coeff_u);
+  FREE_POINTER(pic->coeff_v);
+
+  FREE_POINTER(pic->pred_y);
+  FREE_POINTER(pic->pred_u);
+  FREE_POINTER(pic->pred_v);
+
  return 1;
 }

@ -336,7 +370,7 @@ double image_psnr(pixel *frame1, pixel *frame2, int32_t x, int32_t y)
 /**
 * \brief  Calculate SATD between two 8x8 blocks inside bigger arrays.
 */
-unsigned satd_16bit_8x8_general(int16_t *piOrg, int32_t iStrideOrg, int16_t *piCur, int32_t iStrideCur)
+unsigned satd_16bit_8x8_general(pixel *piOrg, int32_t iStrideOrg, pixel *piCur, int32_t iStrideCur)
 {
  int32_t k, i, j, jj, sad=0;
  int32_t diff[64], m1[8][8], m2[8][8], m3[8][8];
@ -443,14 +477,13 @@ unsigned satd_16bit_8x8_general(int16_t *piOrg, int32_t iStrideOrg, int16_t *piC
      } \
    } \
    return sum; \
-  }
+    }

 // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64
-SATD_NXN(8, int16_t, 16bit)
-SATD_NXN(16, int16_t, 16bit)
-SATD_NXN(32, int16_t, 16bit)
-SATD_NXN(64, int16_t, 16bit)
-
+SATD_NXN(8, pixel, 16bit)
+SATD_NXN(16, pixel, 16bit)
+SATD_NXN(32, pixel, 16bit)
+SATD_NXN(64, pixel, 16bit)

 // Function macro for defining SAD calculating functions 
 // for fixed size blocks.
@ -472,11 +505,11 @@ SATD_NXN(64, int16_t, 16bit)
 // These macros define sad_16bit_nxn functions for n = 4, 8, 16, 32, 64
 // with function signatures of cost_16bit_nxn_func.
 // They are used through get_sad_16bit_nxn_func.
-SAD_NXN(4, int16_t, 16bit)
-SAD_NXN(8, int16_t, 16bit)
-SAD_NXN(16, int16_t, 16bit)
-SAD_NXN(32, int16_t, 16bit)
-SAD_NXN(64, int16_t, 16bit)
+SAD_NXN(4, pixel, 16bit)
+SAD_NXN(8, pixel, 16bit)
+SAD_NXN(16, pixel, 16bit)
+SAD_NXN(32, pixel, 16bit)
+SAD_NXN(64, pixel, 16bit)

 /**
 * \brief  Get a function that calculates SATD for NxN block.
@ -498,9 +531,9 @@ cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n)
    return &satd_16bit_64x64;
  default:
    return NULL;
+    }
  }
-}
-
+  
 /**
 * \brief  Get a function that calculates SAD for NxN block.
 * 
@ -509,7 +542,7 @@ cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n)
 * \returns  Pointer to cost_16bit_nxn_func.
 */
 cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n)
-{
+  {
  switch (n) {
  case 4:
    return &sad_16bit_4x4;
@ -523,7 +556,7 @@ cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n)
    return &sad_16bit_64x64;
  default:
    return NULL;
-  }
+  }  
 }

 /**
@ -535,11 +568,11 @@ cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n)
 * 
 * \returns       Sum of Absolute Transformed Differences (SATD)
 */
-unsigned satd_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
+unsigned satd_nxn_16bit(pixel *block1, pixel *block2, unsigned n)
 {
  cost_16bit_nxn_func sad_func = get_satd_16bit_nxn_func(n);
  return sad_func(block1, block2);
-}
+  }

 /**
 * \brief Calculate SAD for NxN block of size N.
@ -550,7 +583,7 @@ unsigned satd_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
 * 
 * \returns       Sum of Absolute Differences
 */
-unsigned sad_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
+unsigned sad_nxn_16bit(pixel *block1, pixel *block2, unsigned n)
 {
  cost_16bit_nxn_func sad_func = get_sad_16bit_nxn_func(n);
  if (sad_func) {
@ -561,10 +594,10 @@ unsigned sad_nxn_16bit(int16_t *block1, int16_t *block2, unsigned n)
    for (row = 0; row < n; row += n) {
      for (x = 0; x < n; ++x) {
        sum += abs(block1[row + x] - block2[row + x]);
-      }
-    }
-    return sum;
  }
+    }
+  return sum;
+}
 }

 /**
--- a/src/picture.h
+++ b/src/picture.h
@ -32,6 +32,7 @@ enum { REF_PIC_LIST_0 = 0, REF_PIC_LIST_1 = 1, REF_PIC_LIST_X = 100 };
 typedef struct
 {
  int8_t mode;
+  int8_t mode_chroma;
  uint32_t cost;
 } cu_info_intra;

@ -42,7 +43,9 @@ typedef struct
 {
  int8_t mode;
  uint32_t cost;
+
  int16_t mv[2];
+  int16_t mvd[2];
  uint8_t mv_ref; // \brief Index of the encoder_control.ref array.
  uint8_t mv_dir; // \brief Probably describes if mv_ref is forward, backward or both. Might not be needed?
 } cu_info_inter;
@ -52,10 +55,21 @@ typedef struct
 */
 typedef struct
 {  
-  int8_t type;
-  int8_t depth;
-  int8_t coded;
-  int8_t residual;
+  int8_t type;       //!< \brief block type, CU_INTER / CU_INTRA
+  int8_t depth;      //!< \brief depth / size of this block
+  int8_t part_size;  //!< \brief Currently only 2Nx2N, TODO: AMP/SMP/NxN parts
+  int8_t tr_depth;   //!< \brief transform depth
+  int8_t coded;      //!< \brief flag to indicate this block is coded and reconstructed
+  int8_t skipped;    //!< \brief flag to indicate this block is skipped
+  int8_t merged;     //!< \brief flag to indicate this block is merged
+  int8_t merge_idx;  //!< \brief merge index
+  int8_t coeff_y;    //!< \brief is there coded coeffs Y
+  int8_t coeff_u;    //!< \brief is there coded coeffs U
+  int8_t coeff_v;    //!< \brief is there coded coeffs V
+
+  int8_t coeff_top_y[MAX_DEPTH+1];  //!< \brief is there coded coeffs Y in top level
+  int8_t coeff_top_u[MAX_DEPTH+1];  //!< \brief is there coded coeffs U in top level
+  int8_t coeff_top_v[MAX_DEPTH+1];  //!< \brief is there coded coeffs V in top level
  cu_info_intra intra;
  cu_info_inter inter;
 } cu_info;
@ -65,20 +79,28 @@ typedef struct
 */
 typedef struct
 {
-  pixel* y_data;        // \brief Pointer to luma pixel array.
-  pixel* u_data;        // \brief Pointer to chroma U pixel array.
-  pixel* v_data;        // \brief Pointer to chroma V pixel array.
+  pixel* y_data;        //!< \brief Pointer to luma pixel array.
+  pixel* u_data;        //!< \brief Pointer to chroma U pixel array.
+  pixel* v_data;        //!< \brief Pointer to chroma V pixel array.

-  pixel* y_recdata;     // \brief Pointer to reconstructed Y-data.
-  pixel* u_recdata;     // \brief Pointer to reconstructed U-data.
-  pixel* v_recdata;     // \brief Pointer to reconstructed V-data.
+  pixel* y_recdata;     //!< \brief Pointer to reconstructed Y-data.
+  pixel* u_recdata;     //!< \brief Pointer to reconstructed U-data.
+  pixel* v_recdata;     //!< \brief Pointer to reconstructed V-data.

-  int32_t width;          // \brief Luma pixel array width.
-  int32_t height;         // \brief Luma pixel array height.
-  int32_t height_in_lcu;  // \brief Picture width in number of LCU's.
-  int32_t width_in_lcu;   // \brief Picture height in number of LCU's.
-  uint8_t referenced;     // \brief Whether this picture is referenced.
-  cu_info** cu_array;           // \brief Info for each CU at each depth.
+  pixel* pred_y;        //!< \brief Pointer to predicted Y
+  pixel* pred_u;        //!< \brief Pointer to predicted U
+  pixel* pred_v;        //!< \brief Pointer to predicted V
+
+  coefficient* coeff_y;   //!< \brief coefficient pointer Y
+  coefficient* coeff_u;   //!< \brief coefficient pointer U
+  coefficient* coeff_v;   //!< \brief coefficient pointer V
+
+  int32_t width;          //!< \brief Luma pixel array width.
+  int32_t height;         //!< \brief Luma pixel array height.
+  int32_t height_in_lcu;  //!< \brief Picture height in number of LCU's.
+  int32_t width_in_lcu;   //!< \brief Picture width in number of LCU's.
+  uint8_t referenced;     //!< \brief Whether this picture is referenced.
+  cu_info** cu_array;     //!< \brief Info for each CU at each depth.
  uint8_t type;
  uint8_t slicetype;
  uint8_t slice_sao_luma_flag;
@ -90,8 +112,8 @@ typedef struct
 */
 typedef struct
 {
-  picture** pics;          // \brief Pointer to array of picture pointers.
-  unsigned int size;       // \brief Array size.
+  picture** pics;          //!< \brief Pointer to array of picture pointers.
+  unsigned int size;       //!< \brief Array size.
  unsigned int used_size;
 } picture_list;

@ -108,6 +130,8 @@ void picture_set_block_residual(picture *pic, uint32_t x_scu, uint32_t y_scu,
                                uint8_t depth, int8_t residual);
 void picture_set_block_split(picture *pic, uint32_t x_scu, uint32_t y_scu,
                             uint8_t depth, int8_t split);
+void picture_set_block_skipped(picture *pic, uint32_t x_scu, uint32_t y_scu,
+                                uint8_t depth, int8_t skipped);
 void picture_blit_pixels(const pixel* orig, pixel *dst,
                         unsigned width, unsigned height,
                         unsigned orig_stride, unsigned dst_stride);
@ -118,13 +142,14 @@ int picture_list_destroy(picture_list *list);
 int picture_list_add(picture_list *list, picture *pic);
 int picture_list_rem(picture_list *list, int n, int8_t destroy);

-typedef unsigned (*cost_16bit_nxn_func)(int16_t *block1, int16_t *block2);
+typedef unsigned (*cost_16bit_nxn_func)(pixel *block1, pixel *block2);
+

 cost_16bit_nxn_func get_satd_16bit_nxn_func(unsigned n);
 cost_16bit_nxn_func get_sad_16bit_nxn_func(unsigned n);

-unsigned satd_16bit_nxn(int16_t *block1, int16_t *block2, unsigned n);
-unsigned sad_16bit_nxn(int16_t *block1, int16_t *block2, unsigned n);
+unsigned satd_16bit_nxn(pixel *block1, pixel *block2, unsigned n);
+unsigned sad_16bit_nxn(pixel *block1, pixel *block2, unsigned n);

 unsigned calc_sad(const picture *pic, const picture *ref, 
                  int pic_x, int pic_y, int ref_x, int ref_y, 
--- a/src/search.c
+++ b/src/search.c
@ -276,13 +276,13 @@ unsigned search_mv_full(unsigned depth,
 * \brief
 */
 void search_buildReferenceBorder(picture *pic, int32_t x_ctb, int32_t y_ctb,
-                                 int16_t outwidth, int16_t *dst, 
+                                 int16_t outwidth, pixel *dst, 
                                 int32_t dststride, int8_t chroma)
 {
  int32_t left_col; // left column iterator
-  int16_t val;      // variable to store extrapolated value
+  pixel val;      // variable to store extrapolated value
  int32_t i;        // index iterator
-  int16_t dc_val = 1 << (g_bitdepth - 1); // default predictor value
+  pixel dc_val = 1 << (g_bitdepth - 1); // default predictor value
  int32_t top_row;  // top row iterator
  int32_t src_width = (pic->width >> (chroma ? 1 : 0));   // source picture width
  int32_t src_height = (pic->height >> (chroma ? 1 : 0)); // source picture height
@ -425,12 +425,9 @@ void search_tree(encoder_control *encoder,
    uint32_t width = LCU_WIDTH >> depth;

    // INTRAPREDICTION
-    int16_t pred[LCU_WIDTH * LCU_WIDTH + 1];
-    int16_t rec[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8)];
-    int16_t *recShift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];
-
-    //int16_t *pred = (int16_t*)malloc(LCU_WIDTH*LCU_WIDTH*sizeof(int16_t));
-    //int16_t *rec = (int16_t*)malloc((LCU_WIDTH*2+8)*(LCU_WIDTH*2+8)*sizeof(int16_t));
+    pixel pred[LCU_WIDTH * LCU_WIDTH + 1];
+    pixel rec[(LCU_WIDTH * 2 + 8) * (LCU_WIDTH * 2 + 8)];
+    pixel *recShift = &rec[(LCU_WIDTH >> (depth)) * 2 + 8 + 1];

    // Build reconstructed block to use in prediction with extrapolated borders
    search_buildReferenceBorder(encoder->in.cur_pic, x_ctb, y_ctb,
@ -494,6 +491,8 @@ uint32_t search_best_mode(encoder_control *encoder,
  }
 }

+
+
 /**
 * \brief
 */
@ -522,9 +521,14 @@ void search_slice_data(encoder_control *encoder)
      if (RENDER_CU) {
        render_cu_file(encoder, encoder->in.cur_pic, depth, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, fp2);
      }
+
+      encode_block_residual(encoder, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, depth);
+
    }
  }

+
+
  if (RENDER_CU && fp) {
    close_cu_file(fp);
    fp = 0;