Merge branch 'master' into inter_residual

Conflicts: src/encoder.c
2024-11-27 19:24:06 +00:00 · 2013-10-08 14:59:23 +03:00 · 2013-10-08 14:59:23 +03:00 · ef0bea32e1
parent 192b077dba ac0db59408
commit ef0bea32e1
12 changed files with 1342 additions and 1210 deletions
--- a/src/debug.c
+++ b/src/debug.c
@ -19,7 +19,9 @@
 */
 FILE * open_cu_file(char *filename) {
  FILE *fp = fopen(filename, "w");
+  // fopen() returns NULL on failure; writing the header to a NULL stream is
+  // undefined behaviour, so hand the failure back to the caller instead.
+  if (fp == NULL) {
+    return NULL;
+  }
-  fprintf(fp, "<html><head><link rel='stylesheet' type='text/css' href='cu_style.css' /></head><body>");
+  fprintf(fp, "<?xml version='1.0' encoding='UTF-8' ?>\r\n"
+          "<html xmlns='http://www.w3.org/1999/xhtml' xml:lang='en'>\r\n"
+          "<head><link rel='stylesheet' type='text/css' href='cu_style.css' /></head><body>");
  return fp;
 }

@ -31,23 +33,74 @@ void close_cu_file(FILE *fp) {
  fclose(fp);
 }

+/**
+ * \brief Convert one YUV pixel to RGB for the debug HTML output.
+ * \param yuv  Input pixel as { Y, U, V }.
+ * \param rgb  Output pixel as { R, G, B }, each component clipped to 0..255.
+ *
+ * NOTE(review): these look like the BT.601 studio-range coefficients, which
+ * are normally applied to (Y - 16); no luma offset is subtracted here.
+ * Confirm whether that is intentional.
+ */
+void yuv2rgb(unsigned char yuv[3], unsigned char rgb[3])
+{
+  int y = yuv[0];
+  int u = yuv[1];
+  int v = yuv[2];
+
+  // The luma gain has to be the same for all three channels, otherwise
+  // neutral grey picks up a colour cast.
+  int r = 1.164 * y + 1.596 * (v - 128);
+  int g = 1.164 * y - 0.392 * (u - 128) - 0.813 * (v - 128);
+  int b = 1.164 * y + 2.017 * (u - 128);
+
+  rgb[0] = CLIP(0, 255, r);
+  rgb[1] = CLIP(0, 255, g);
+  rgb[2] = CLIP(0, 255, b);
+}
+
 /**
 * Print information about the Coding Unit (CU) into the FILE* provided by open_cu_file.
 */
-unsigned render_cu_file(encoder_control *encoder, unsigned depth, uint16_t xCtb, uint16_t yCtb, FILE *fp)
+unsigned render_cu_file(encoder_control *encoder, picture *pic, 
+                        unsigned depth, uint16_t xCtb, uint16_t yCtb, FILE *fp)
 {
-  cu_info *cu = &encoder->in.cur_pic->cu_array[depth][xCtb + yCtb * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+  cu_info *cu = &pic->cu_array[depth][xCtb + yCtb * (pic->width_in_lcu<<MAX_DEPTH)];
+  cu_info *final_cu = &pic->cu_array[MAX_DEPTH][xCtb + yCtb * (pic->width_in_lcu<<MAX_DEPTH)];
  unsigned lambda_cost = (4 * g_lambda_cost[encoder->QP]) << 4;
  unsigned sum = 0;
  unsigned best_cost = -1;
  char type = cu->type == CU_INTRA ? 'I' : 'P';
+  unsigned x = xCtb * CU_MIN_SIZE_PIXELS;
+  unsigned y = yCtb * CU_MIN_SIZE_PIXELS;
+  unsigned luma = y * pic->width + x;
+  unsigned chroma = (y >> 1) * (pic->width >> 1) + (x >> 1);
+  unsigned char yuv[3] = { 0, 0, 0 };
+  unsigned char rgb[3] = { 0, 0, 0 };
+
+  if (x >= pic->width || y >= pic->height) {
+    // Don't output anything for CUs completely outside the borders.
+    return 0;
+  }
+
+  if (encoder->ref->used_size > 0) {
+    const picture *ref_pic = encoder->ref->pics[0];
+    yuv[0] = ref_pic->y_recdata[luma];
+    yuv[1] = ref_pic->u_recdata[chroma];
+    yuv[2] = ref_pic->v_recdata[chroma];
+    yuv2rgb(yuv, rgb);
+  }
+
+  // Enclose everything in a table with the assumption that this function is
+  // called from left to right and from top to bottom.
+  if (depth == 0) {
+    if (yCtb == 0 && xCtb == 0) {
+      fprintf(fp, "<table><tr><td>");
+    } else if (xCtb == 0) {
+      fprintf(fp, "</td></tr><tr><td>");
+    } else if (xCtb == NO_SCU_IN_LCU(pic->width_in_lcu)
+               && yCtb == NO_SCU_IN_LCU(pic->height_in_lcu)) {
+      fprintf(fp, "</td></tr></table>");
+    } else {
+      fprintf(fp, "</td><td>");
+    }
+  }

  fprintf(fp, 
-    "\n<table class=d%u><tr><td colspan=2>"
-    "%u (%u, %u), %d, %c, "
+    "\n<table class='d%u' bgcolor='#%02x%02x%02x'><tr><td colspan='2'>"
+    "%u (%u, %u), %c, "
    "c=%u, mv=(%d, %d)</td></tr>\n", 
-    depth,
-    depth, xCtb, yCtb, cu->split, (cu->type == CU_INTRA ? 'I' : 'P'),
+    depth, rgb[0], rgb[1], rgb[2],
+    depth, xCtb, yCtb, (cu->type == CU_INTRA ? 'I' : 'P'),
    cu->inter.cost, cu->inter.mv[0], cu->inter.mv[1]);


@ -57,18 +110,18 @@ unsigned render_cu_file(encoder_control *encoder, unsigned depth, uint16_t xCtb,
    uint8_t change = 1<<(MAX_DEPTH-1-depth);

    fprintf(fp, "<tr><td>");
-    sum += render_cu_file(encoder, depth + 1, xCtb, yCtb, fp);
+    sum += render_cu_file(encoder, pic, depth + 1, xCtb, yCtb, fp);
    fprintf(fp, "</td><td>");
-    sum += render_cu_file(encoder, depth + 1, xCtb + change, yCtb, fp);
+    sum += render_cu_file(encoder, pic, depth + 1, xCtb + change, yCtb, fp);
    fprintf(fp, "</td></tr>");

    fprintf(fp, "<tr><td>");
-    sum += render_cu_file(encoder, depth + 1, xCtb, yCtb + change, fp);
+    sum += render_cu_file(encoder, pic, depth + 1, xCtb, yCtb + change, fp);
    fprintf(fp, "</td><td>");
-    sum += render_cu_file(encoder, depth + 1, xCtb + change, yCtb + change, fp);
+    sum += render_cu_file(encoder, pic, depth + 1, xCtb + change, yCtb + change, fp);
    fprintf(fp, "</td></tr>");

-    fprintf(fp, "<tr><td colspan=2>sum=%u, sum+lambda=%u</td></tr>",
+    fprintf(fp, "<tr><td colspan='2'>sum=%u, sum+lambda=%u</td></tr>",
      sum, sum + lambda_cost);
    if (sum + lambda_cost < cu->inter.cost) {
      best_cost = sum + lambda_cost;
@ -79,6 +132,16 @@ unsigned render_cu_file(encoder_control *encoder, unsigned depth, uint16_t xCtb,
    best_cost = cu->inter.cost;
  }

+  if (depth == 0) {
+    fprintf(fp, 
+      "<tr><td colspan='2'>"
+      "best depth=%u, %c, "
+      "c=%u, mv=(%d, %d)</td></tr>\n"
+      "</td></tr>", 
+      final_cu->depth, (final_cu->type == CU_INTRA ? 'I' : 'P'),
+      final_cu->inter.cost, final_cu->inter.mv[0], final_cu->inter.mv[1]);
+  }
+
  fprintf(fp, "</table>");
  return best_cost;
 }
--- a/src/debug.h
+++ b/src/debug.h
@ -21,6 +21,6 @@

 FILE * open_cu_file(char *filename);
 void close_cu_file(FILE *fp);
-unsigned render_cu_file(encoder_control *encoder, unsigned depth, uint16_t x_cu, uint16_t y_cu, FILE *fp);
+unsigned render_cu_file(encoder_control *encoder, picture *pic, unsigned depth, uint16_t x_cu, uint16_t y_cu, FILE *fp);

 #endif
--- a/src/encmain.c
+++ b/src/encmain.c
@ -146,7 +146,7 @@ int main(int argc, char *argv[])
  encoder->QP       = 32;
  encoder->in.video_format = FORMAT_420;
  // deblocking filter
-  encoder->deblock_enable  = 0;
+  encoder->deblock_enable  = 1;
  encoder->beta_offset_div2  = 0;
  encoder->tc_offset_div2    = 0;
  // SAO
--- a/src/encoder.c
+++ b/src/encoder.c
--- a/src/encoder.h
+++ b/src/encoder.h
@ -33,7 +33,7 @@ enum { FORMAT_400 = 0, FORMAT_420, FORMAT_422, FORMAT_444 };
 /* Input info struct */
 typedef struct
 {
-  FILE* file;
+  FILE *file;
  int32_t width;  /*!< \brief input picture width */
  int32_t height; /*!< \brief input picture height */
  int32_t real_width;  /*!< \brief real input picture width */
@ -52,19 +52,19 @@ typedef struct
  config *cfg;
  encoder_input in;
  encoder_me me;
-  bitstream* stream;
+  bitstream *stream;
  FILE *output;
  picture_list *ref;
  int8_t ref_list;
  int8_t ref_idx_num[2];
-  int8_t QP;             /*!< \brief Quantization parameter */
+  int8_t QP;             // \brief Quantization parameter
  int8_t bitdepth;

  /* Filtering */
-  int8_t deblock_enable; /*!< \brief Flag to enable deblocking filter */
-  int8_t sao_enable;     /*!< \brief Flag to enable sample adaptive offset filter */
-  int8_t beta_offset_div2; /*!< \brief (deblocking) beta offset (div 2), range -6...6 */
-  int8_t tc_offset_div2;   /*!< \brief (deblocking)tc offset (div 2), range -6...6 */
+  int8_t deblock_enable; // \brief Flag to enable deblocking filter
+  int8_t sao_enable;     // \brief Flag to enable sample adaptive offset filter
+  int8_t beta_offset_div2; // \brief (deblocking) beta offset (div 2), range -6...6
+  int8_t tc_offset_div2;   // \brief (deblocking)tc offset (div 2), range -6...6
 } encoder_control;

 typedef struct
@ -86,7 +86,7 @@ typedef struct
  int32_t recbase_stride;
  int32_t pred_stride;
  
-  /* TODO: unify luma+chroma arrays */
+  // TODO: unify luma+chroma arrays
  int16_t *coeff[3];
  int8_t cb_top[3];
  int8_t cb[4];
@ -101,51 +101,64 @@ typedef struct
 } transform_info;

 void init_tables(void);
-void init_encoder_control(encoder_control* control,bitstream* output);
-void init_encoder_input(encoder_input* input,FILE* inputfile, int32_t width, int32_t height);
-void encode_one_frame(encoder_control* encoder);
-void read_one_frame(FILE* file, encoder_control* encoder);
+void init_encoder_control(encoder_control *control, bitstream *output);
+void init_encoder_input(encoder_input *input, FILE* inputfile,
+                        int32_t width, int32_t height);
+void encode_one_frame(encoder_control *encoder);
+void read_one_frame(FILE *file, encoder_control *encoder);

-
-void encode_seq_parameter_set(encoder_control* encoder);
-void encode_pic_parameter_set(encoder_control* encoder);
-void encode_vid_parameter_set(encoder_control* encoder);
-void encode_slice_data(encoder_control* encoder);
-void encode_slice_header(encoder_control* encoder);
-void encode_coding_tree(encoder_control* encoder,uint16_t x_ctb,uint16_t y_ctb, uint8_t depth);
-void encode_lastSignificantXY(encoder_control* encoder,uint8_t lastpos_x, uint8_t lastpos_y, uint8_t width, uint8_t height, uint8_t type, uint8_t scan);
-void encode_CoeffNxN(encoder_control* encoder,int16_t* coeff, uint8_t width, uint8_t type, int8_t scan_mode);
-void encode_transform_tree(encoder_control* encoder,transform_info* ti,uint8_t depth);
-void encode_transform_coeff(encoder_control* encoder,transform_info* ti,int8_t depth, int8_t tr_depth);
+void encode_seq_parameter_set(encoder_control *encoder);
+void encode_pic_parameter_set(encoder_control *encoder);
+void encode_vid_parameter_set(encoder_control *encoder);
+void encode_slice_data(encoder_control *encoder);
+void encode_slice_header(encoder_control *encoder);
+void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb,
+                        uint16_t y_ctb, uint8_t depth);
+void encode_last_significant_xy(encoder_control *encoder, uint8_t lastpos_x,
+                                uint8_t lastpos_y, uint8_t width, uint8_t height,
+                                uint8_t type, uint8_t scan);
+void encode_coeff_nxn(encoder_control *encoder, int16_t *coeff, uint8_t width,
+                      uint8_t type, int8_t scan_mode);
+void encode_transform_tree(encoder_control *encoder, transform_info *ti,
+                           uint8_t depth);
+void encode_transform_coeff(encoder_control *encoder, transform_info *ti,
+                            int8_t depth, int8_t tr_depth);

 extern int16_t g_lambda_cost[55];
 extern uint32_t* g_sig_last_scan[3][7];
-int8_t g_convert_to_bit[LCU_WIDTH+1];
+int8_t g_convert_to_bit[LCU_WIDTH + 1];
 static int8_t g_bitdepth     = 8;
 static int8_t g_bit_increment = 0;

-#define MAX_NUM_SPU_W ((1<<(MAX_DEPTH))/4)
-static uint32_t g_z_scan_to_raster [ MAX_NUM_SPU_W*MAX_NUM_SPU_W ] = { 0, };
-static uint32_t g_raster_to_z_scan [ MAX_NUM_SPU_W*MAX_NUM_SPU_W ] = { 0, };
-static const uint8_t g_group_idx[ 32 ]    = {0,1,2,3,4,4,5,5,6,6,6,6,7,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9};
-static const uint8_t g_min_in_group[ 10 ]  = {0,1,2,3,4,6,8,12,16,24};
-static uint32_t g_sig_last_scan_32x32[ 64 ] = 
-{  0, 8, 1,16, 9, 2,24,17,
-  10, 3,32,25,18,11, 4,40,
-  33,26,19,12, 5,48,41,34,
-  27,20,13, 6,56,49,42,35,
-  28,21,14, 7,57,50,43,36,
-  29,22,15,58,51,44,37,30,
-  23,59,52,45,38,31,60,53,
-  46,39,61,54,47,62,55,63 };
+#define MAX_NUM_SPU_W ((1 << (MAX_DEPTH)) / 4)
+static uint32_t g_z_scan_to_raster[MAX_NUM_SPU_W * MAX_NUM_SPU_W] = { 0, };
+static uint32_t g_raster_to_z_scan[MAX_NUM_SPU_W * MAX_NUM_SPU_W] = { 0, };

-static const uint32_t g_sig_last_scan_8x8[ 3 ][ 4 ] =
+static const uint8_t g_group_idx[32] = {
+  0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
+  6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
+  8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
+  9, 9 };
+
+static const uint8_t g_min_in_group[10] = {
+  0, 1, 2, 3, 4, 6, 8, 12, 16, 24 };
+
+static uint32_t g_sig_last_scan_32x32[64] = {
+  0,  8,  1,  16, 9,  2,  24, 17, 10, 3,
+  32, 25, 18, 11, 4,  40, 33, 26, 19, 12,
+  5,  48, 41, 34, 27, 20, 13, 6,  56, 49,
+  42, 35, 28, 21, 14, 7,  57, 50, 43, 36,
+  29, 22, 15, 58, 51, 44, 37, 30, 23, 59,
+  52, 45, 38, 31, 60, 53, 46, 39, 61, 54,
+  47, 62, 55, 63 };
+
+static const uint32_t g_sig_last_scan_8x8[3][4] =
 { {0, 2, 1, 3},
  {0, 1, 2, 3},
  {0, 2, 1, 3}
 };

-// 
+
 //4 8 16 32 64 128
 //0 1  2  3  4   5
 static const uint8_t g_to_bits[129] =
@ -156,20 +169,21 @@ static const uint8_t g_to_bits[129] =
  0,0,0,0,0,0,0,2,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,
-  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
 };
 #define TOBITS(len) g_to_bits[len]


-#define C1FLAG_NUMBER               8 /*!< maximum number of largerThan1 flag coded in one chunk */
-#define C2FLAG_NUMBER               1 /*!< maximum number of largerThan2 flag coded in one chunk */
+#define C1FLAG_NUMBER 8 // maximum number of largerThan1 flag coded in one chunk
+#define C2FLAG_NUMBER 1 // maximum number of largerThan2 flag coded in one chunk

 enum COEFF_SCAN_TYPE
 {
-  SCAN_DIAG = 0,         /*!< up-right diagonal scan */
-  SCAN_HOR,              /*!< horizontal first scan  */
-  SCAN_VER               /*!< vertical first scan    */
+  SCAN_DIAG = 0, // up-right diagonal scan
+  SCAN_HOR,      // horizontal first scan
+  SCAN_VER       // vertical first scan
 };


-#endif
+#endif
--- a/src/filter.c
+++ b/src/filter.c
@ -157,13 +157,16 @@ void filter_deblock_edge_luma(encoder_control *encoder,
  int32_t stride = encoder->in.cur_pic->width;
  int32_t offset = stride;
  int32_t beta_offset_div2 = encoder->beta_offset_div2;
-  int32_t tc_offset_div2   = encoder->tc_offset_div2;
-  int8_t strength = 2; // Filter strength
+  int32_t tc_offset_div2   = encoder->tc_offset_div2;  
  // TODO: support 10+bits
  uint8_t *orig_src = &encoder->in.cur_pic->y_recdata[xpos + ypos*stride];
  uint8_t *src = orig_src;
  int32_t step = 1;
+  cu_info *cu_q = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(xpos>>MIN_SIZE) + (ypos>>MIN_SIZE) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+  cu_info *cu_p = 0;
+  int8_t strength = 0;
  
+
  if(dir == EDGE_VER) {
    offset = 1;
    step = stride;
@ -171,23 +174,31 @@ void filter_deblock_edge_luma(encoder_control *encoder,
  
  {
    int32_t qp              = encoder->QP;
-    int32_t bitdepth_scale  = 1 << (g_bitdepth - 8);
-    int32_t tc_index        = CLIP(0, 51 + 2, (int32_t)(qp + 2*(strength - 1) + (tc_offset_div2 << 1)));
+    int32_t bitdepth_scale  = 1 << (g_bitdepth - 8);    
    int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
-    int32_t tc              = g_tc_table_8x8[tc_index] * bitdepth_scale;
    int32_t beta            = g_beta_table_8x8[b_index] * bitdepth_scale;
    int32_t side_threshold  = (beta + (beta >>1 )) >> 3;
-    int32_t thr_cut         = tc * 10;
    uint32_t blocks_in_part = (LCU_WIDTH >> depth) / 4;
    uint32_t block_idx;
-
+    int32_t tc_index,tc,thr_cut;
    // TODO: add CU based QP calculation 

    // For each 4-pixel part in the edge
-    for (block_idx = 0; block_idx < blocks_in_part; ++block_idx)
-    {
+    for (block_idx = 0; block_idx < blocks_in_part; ++block_idx) {      
      int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d;
-
+      if((block_idx & 1) == 0)
+      {
+        // CU in the side we are filtering, update every 8-pixels
+        cu_p = &encoder->in.cur_pic->cu_array[MAX_DEPTH][((xpos>>MIN_SIZE)-(dir == EDGE_VER)+(dir == EDGE_HOR?block_idx/2:0)) +
+                                                         ((ypos>>MIN_SIZE)-(dir == EDGE_HOR)+(dir == EDGE_VER?block_idx/2:0)) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+        // Filter strength
+        strength = ((cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) ? 2 : 
+                   (((abs(cu_q->inter.mv[0] - cu_p->inter.mv[0]) >= 4) || (abs(cu_q->inter.mv[1] - cu_p->inter.mv[1]) >= 4)) ? 1 : 0));
+        tc_index        = CLIP(0, 51 + 2, (int32_t)(qp + 2*(strength - 1) + (tc_offset_div2 << 1)));
+        tc              = g_tc_table_8x8[tc_index] * bitdepth_scale;
+        thr_cut         = tc * 10;
+      }
+      if(!strength) continue;
      // Check conditions for filtering
      // TODO: Get rid of these inline defines.
      #define calc_DP(s,o) abs( (int16_t)s[-o*3] - (int16_t)2*s[-o*2] + (int16_t)s[-o] )
@ -242,9 +253,13 @@ void filter_deblock_edge_chroma(encoder_control *encoder,
  // Init offset and step to EDGE_HOR
  int32_t offset = stride;
  int32_t step = 1;
+  cu_info *cu_q = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x>>(MIN_SIZE-1)) + (y>>(MIN_SIZE-1)) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+  cu_info *cu_p = &encoder->in.cur_pic->cu_array[MAX_DEPTH][((x>>(MIN_SIZE-1))-(dir == EDGE_VER)) +
+                                                            ((y>>(MIN_SIZE-1))-(dir == EDGE_HOR)) * (encoder->in.width_in_lcu << MAX_DEPTH)];
+  int8_t strength = (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) ? 2 : 0; // Filter strength

  // We cannot filter edges not on 8x8 grid
-  if(depth == MAX_DEPTH && (( (y & 0x7) && dir == EDGE_HOR ) || ( (x & 0x7) && dir == EDGE_VER ) ) )
+  if(strength != 2 || (depth == MAX_DEPTH && (( (y & 0x7) && dir == EDGE_HOR ) || ( (x & 0x7) && dir == EDGE_VER ) ) ))
  {
    return;
  }
@ -259,7 +274,7 @@ void filter_deblock_edge_chroma(encoder_control *encoder,
  {
    int32_t QP             = g_chroma_scale[encoder->QP];
    int32_t bitdepth_scale = 1 << (g_bitdepth-8);
-    int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2 + (tc_offset_div2 << 1)));    
+    int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));    
    int32_t Tc             = g_tc_table_8x8[TC_index]*bitdepth_scale;
    uint32_t blocks_in_part= (LCU_WIDTH>>(depth+1)) / 4;
    uint32_t blk_idx;
@ -386,7 +401,9 @@ void filter_inter_halfpel_chroma(int16_t *src, int16_t src_stride, int width, in
  int32_t shift1 = g_bitdepth-8;
  int32_t shift2 = 6;
  int32_t shift3 = 14-g_bitdepth;
-  int32_t offset = 1<<(shift2-1); //!< offset for rounding purposes
+  int32_t offset = 1 << (shift2 - 1); //!< offset for rounding purposes
+  int32_t offset3 = 1 << (shift3 - 1);
+  int32_t offset23 = 1 << (shift2 + shift3 - 1);

  // Loop source pixels and generate four filtered half-pel pixels on each round
  for (y = 0; y < height; y++) {
@ -406,12 +423,12 @@ void filter_inter_halfpel_chroma(int16_t *src, int16_t src_stride, int width, in

      // ae0,0 - We need this only when hor_flag and for ee0,0
      if (hor_flag) {
-        ae_temp = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1) + offset; // ae0,0
+        ae_temp = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1); // ae0,0
      }
      // ea0,0 - needed only when ver_flag
      if(ver_flag) {
        dst[dst_pos + 1*dst_stride] = (((-4*src[src_pos - src_stride] + 36*src[src_pos] + 36*src[src_pos + src_stride] 
-                                        - 4*src[src_pos + 2*src_stride]  ) >> shift1) + offset) >> shift3; // ea0,0
+                                        - 4*src[src_pos + 2*src_stride]  ) >> shift1) + (1<<(shift3-1))) >> shift3; // ea0,0
      }

      // When both flags, we use _only_ this pixel (but still need ae0,0 for it)
@ -419,17 +436,17 @@ void filter_inter_halfpel_chroma(int16_t *src, int16_t src_stride, int width, in
        // Calculate temporary values..
        //TODO: optimization, store these values
        src_pos -= src_stride;  //0,-1
-        ae_temp1 = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1) + offset; // ae0,-1
-        src_pos += src_stride;  //0,1
-        ae_temp2 = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1) + offset; // ae0,1
+        ae_temp1 = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1); // ae0,-1
+        src_pos += 2*src_stride;  //0,1
+        ae_temp2 = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1); // ae0,1
        src_pos += src_stride;  //0,2
-        ae_temp3 = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1) + offset; // ae0,2
+        ae_temp3 = ((-4*src[src_pos - 1] + 36*src[src_pos] + 36*src[src_pos + 1] - 4*src[src_pos + 2] ) >> shift1); // ae0,2

-        dst[dst_pos + 1*dst_stride + 1] = ((( -4*ae_temp1 + 36*dst[dst_pos + 1] + 36*ae_temp2 - 4*ae_temp3 ) >> shift2) + offset) >> shift3; // ee0,0
+        dst[dst_pos + 1*dst_stride + 1] = (((-4*ae_temp1 + 36*ae_temp + 36*ae_temp2 - 4*ae_temp3 ) + offset23) >> shift2) >> shift3; // ee0,0
      }

      if(hor_flag) {
-        dst[dst_pos + 1] = ae_temp >> shift3;
+        dst[dst_pos + 1] = (ae_temp + offset3) >> shift3;
      }
    }
  }
--- a/src/global.h
+++ b/src/global.h
@ -34,7 +34,7 @@
 /* CONFIG VARIABLES */
 #define LCU_WIDTH 64 /*!< Largest Coding Unit (IT'S 64x64, DO NOT TOUCH!) */

-#define MAX_INTER_SEARCH_DEPTH 2
+#define MAX_INTER_SEARCH_DEPTH 3
 #define MIN_INTER_SEARCH_DEPTH 0

 #define MAX_INTRA_SEARCH_DEPTH 3 /*!< Max search depth -> min block size (3 == 8x8) */
--- a/src/inter.c
+++ b/src/inter.c
@ -58,9 +58,10 @@ void inter_set_block(picture* pic, uint32_t x_cu, uint32_t y_cu, uint8_t depth,
 * \param dst destination picture
 * \returns Void
 */
-void inter_recon(picture* ref,int32_t xpos, int32_t ypos,int32_t width, int16_t mv[2], picture *dst)
+void inter_recon(picture* ref,int32_t xpos, int32_t ypos,int32_t width, const int16_t mv_param[2], picture *dst)
 {
  int x,y,coord_x,coord_y;
+  int16_t mv[2] = { mv_param[0], mv_param[1] };

  int32_t dst_width_c = dst->width>>1; //!< Destination picture width in chroma pixels
  int32_t ref_width_c = ref->width>>1; //!< Reference picture width in chroma pixels
@ -84,8 +85,8 @@ void inter_recon(picture* ref,int32_t xpos, int32_t ypos,int32_t width, int16_t
  int16_t halfpel_v[LCU_WIDTH * LCU_WIDTH]; //!< interpolated 2W x 2H block (v)

  // TODO: Fractional pixel support
-  mv[0] = mv[0]>>2;
-  mv[1] = mv[1]>>2;
+  mv[0] >>= 2;
+  mv[1] >>= 2;

  // Chroma half-pel
  // get half-pel interpolated block and push it to output
@ -265,8 +266,11 @@ void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int

  // B0, B1 and B2 availability testing
  if (y_cu != 0) {
-    b0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
-    if (!b0->coded) b0 = NULL;
+
+    if (x_cu + cur_block_in_scu < encoder->in.width_in_lcu<<MAX_DEPTH) {
+      b0 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
+      if (!b0->coded) b0 = NULL;
+    }
    b1 = &encoder->in.cur_pic->cu_array[MAX_DEPTH][x_cu + cur_block_in_scu - 1 + (y_cu - 1) * (encoder->in.width_in_lcu<<MAX_DEPTH)];
    if (!b1->coded) b1 = NULL;

--- a/src/inter.h
+++ b/src/inter.h
@ -19,7 +19,7 @@


 void inter_set_block(picture* pic,uint32_t x_cu, uint32_t y_cu, uint8_t depth, cu_info *cur_cu);
-void inter_recon(picture *ref,int32_t xpos, int32_t ypos,int32_t width, int16_t mv[2], picture* dst);
+void inter_recon(picture *ref,int32_t xpos, int32_t ypos,int32_t width, const int16_t mv[2], picture* dst);

 void inter_get_mv_cand(encoder_control *encoder, int32_t x_cu, int32_t y_cu, int8_t depth, int16_t mv_cand[2][2]);

--- a/src/picture.c
+++ b/src/picture.c
@ -20,29 +20,6 @@
 #define PSNRMAX (255.0 * 255.0)


-/**
- * \brief Set block splitflag
- * \param pic    picture to use
- * \param x_scu  x SCU position (smallest CU)
- * \param y_scu  y SCU position (smallest CU)
- * \param depth  current CU depth
- * \param mode   mode to set
- */
-void picture_set_block_split(picture *pic, uint32_t x_scu, uint32_t y_scu,
-                             uint8_t depth, int8_t split)
-{
-  uint32_t x, y;
-  int width_in_scu = pic->width_in_lcu << MAX_DEPTH;
-  int block_scu_width = (LCU_WIDTH >> depth) / (LCU_WIDTH >> MAX_DEPTH);
-
-  for (y = y_scu; y < y_scu + block_scu_width; ++y) {
-    int cu_row = y * width_in_scu;
-    for (x = x_scu; x < x_scu + block_scu_width; ++x) {
-      pic->cu_array[depth][cu_row + x].split = split;
-    }
-  }
-}
-
 /**
 * \brief Set block coded status
 * \param pic    picture to use
--- a/src/picture.h
+++ b/src/picture.h
@ -57,7 +57,6 @@ typedef struct
  int8_t coded;
  cu_info_intra intra;
  cu_info_inter inter;
-  int8_t split;
 } cu_info;

 /**
--- a/src/search.c
+++ b/src/search.c
@ -26,26 +26,76 @@

 // Temporarily for debugging.
 #define USE_INTRA_IN_P 0
+//#define RENDER_CU encoder->frame==2
 #define RENDER_CU 0
 #define USE_FULL_SEARCH 0
+#define USE_CHROMA_IN_MV_SEARCH 0

-#define IN_FRAME(x, y, width, height, block) ((x) >= 0 && (y) >= 0 && (x) + (block) <= (width) && (y) + (block) <= (height))
+#define IN_FRAME(x, y, width, height, block_width, block_height) \
+  ((x) >= 0 && (y) >= 0 \
+  && (x) + (block_width) <= (width) \
+  && (y) + (block_height) <= (height))


-unsigned get_sad(int x, int y, int width, int height, int block, uint8_t *pic_data, uint8_t *ref_data)
+/**
+ * \brief  Get Sum of Absolute Differences (SAD) between two blocks in two
+ *         different frames.
+ * \param pic  First frame.
+ * \param ref  Second frame.
+ * \param pic_x  X coordinate of the first block.
+ * \param pic_y  Y coordinate of the first block.
+ * \param ref_x  X coordinate of the second block.
+ * \param ref_y  Y coordinate of the second block.
+ * \param block_width  Width of the blocks.
+ * \param block_height  Height of the blocks.
+ * \returns  SAD of the two blocks plus one, or 0 if the block at
+ *           (ref_x, ref_y) does not fit inside the frame.
+ *
+ * Only the second block is bounds checked, and the dimensions of pic are
+ * used as the frame size and row width when indexing both frames.
+ */
+unsigned get_block_sad(picture *pic, picture *ref,
+                       int pic_x, int pic_y, int ref_x, int ref_y,
+                       int block_width, int block_height)
 {
-  if (!IN_FRAME(x, y, width, height, block)) return 0; // This means invalid, for now.
+  uint8_t *pic_data, *ref_data;
+  int width = pic->width;
+  int height = pic->height;

-  return 1 + sad(pic_data, &ref_data[y * width + x], block, block, width);
+  unsigned result = 1; // Start from 1 so result is never 0.
+
+  // 0 means invalid, for now.
+  if (!IN_FRAME(ref_x, ref_y, width, height, block_width, block_height)) return 0;
+
+  pic_data = &pic->y_data[pic_y * width + pic_x];
+  ref_data = &ref->y_data[ref_y * width + ref_x];
+  result += sad(pic_data, ref_data, block_width, block_height, width);
+
+  #if USE_CHROMA_IN_MV_SEARCH
+  // Halve everything because chroma is half the resolution in both
+  // directions, so every coordinate and dimension shifts by one bit.
+  width >>= 1;
+  pic_x >>= 1;
+  pic_y >>= 1;
+  ref_x >>= 1;
+  ref_y >>= 1;
+  block_width >>= 1;
+  block_height >>= 1;
+
+  pic_data = &pic->u_data[pic_y * width + pic_x];
+  ref_data = &ref->u_data[ref_y * width + ref_x];
+  result += sad(pic_data, ref_data, block_width, block_height, width);
+
+  pic_data = &pic->v_data[pic_y * width + pic_x];
+  ref_data = &ref->v_data[ref_y * width + ref_x];
+  result += sad(pic_data, ref_data, block_width, block_height, width);
+  #endif
+
+  return result;
 }

-void search_mv(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
+void search_mv(picture *pic, picture *ref,
               cu_info *cur_cu,  int orig_x, int orig_y, int x, int y, 
               unsigned depth)
 {
  int block_width = CU_WIDTH_FROM_DEPTH(depth);

-  unsigned cost = get_sad(orig_x + x, orig_y + y, pic->width, pic->height, block_width, pic_data, ref_data);
+  // Get cost for the predicted motion vector.
+  unsigned cost = get_block_sad(pic, ref, orig_x, orig_y, orig_x + x, orig_y + y,
+                                block_width, block_width);
  unsigned best_cost = -1;
  unsigned step = 8;

@ -55,15 +105,15 @@ void search_mv(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
    cur_cu->inter.mv[1] = y;
  }

-  // If initial vector is farther away than the step, try the (0, 0) vector
-  // in addition to the initial vector.
-  if (abs(x) > step || abs(y) > step) {
-    cost = get_sad(orig_x, orig_y, pic->width, pic->height, block_width, pic_data, ref_data);
+  // If the initial vector is non-zero, also try the (0, 0) vector just in case.
+  if (x != 0 || y != 0) {
+    cost = get_block_sad(pic, ref, orig_x, orig_y, orig_x, orig_y,
+                         block_width, block_width);

    if (cost > 0 && cost < best_cost) {
      best_cost = cost;
-      cur_cu->inter.mv[0] = x;
-      cur_cu->inter.mv[1] = y - step;
+      cur_cu->inter.mv[0] = 0;
+      cur_cu->inter.mv[1] = 0;
    }
  }

@ -75,16 +125,20 @@ void search_mv(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
    // due to quantization. It's value is just a guess based on the first
    // blocks of the BQMall sequence, which don't move.
    // TODO: Quantization factor probably affects what the constant should be.
+    /*
    if (best_cost <= block_width * block_width * 1.8) {
      break;
    }
+    */

    // Change center of search to the current best point.
    x = cur_cu->inter.mv[0];
    y = cur_cu->inter.mv[1];

    // above
-    cost = get_sad(orig_x + x, orig_y + y - step, pic->width, pic->height, block_width, pic_data, ref_data);
+    cost = get_block_sad(pic, ref, orig_x, orig_y,
+                         orig_x + x, orig_y + y - step,
+                         block_width, block_width);
    if (cost > 0 && cost < best_cost) {
      best_cost = cost;
      cur_cu->inter.mv[0] = x;
@ -92,7 +146,9 @@ void search_mv(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
    }

    // left
-    cost = get_sad(orig_x + x - step, orig_y + y, pic->width, pic->height, block_width, pic_data, ref_data);
+    cost = get_block_sad(pic, ref, orig_x, orig_y,
+                         orig_x + x - step, orig_y + y,
+                         block_width, block_width);
    if (cost > 0 && cost < best_cost) {
      best_cost = cost;
      cur_cu->inter.mv[0] = x - step;
@ -100,7 +156,9 @@ void search_mv(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
    }

    // right
-    cost = get_sad(orig_x + x + step, orig_y + y, pic->width, pic->height, block_width, pic_data, ref_data);
+    cost = get_block_sad(pic, ref, orig_x, orig_y,
+                         orig_x + x + step, orig_y + y,
+                         block_width, block_width);
    if (cost > 0 && cost < best_cost) {
      best_cost = cost;
      cur_cu->inter.mv[0] = x + step;
@ -108,7 +166,9 @@ void search_mv(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
    }

    // below
-    cost = get_sad(orig_x + x, orig_y + y + step, pic->width, pic->height, block_width, pic_data, ref_data);
+    cost = get_block_sad(pic, ref, orig_x, orig_y,
+                         orig_x + x, orig_y + y + step,
+                         block_width, block_width);
    if (cost > 0 && cost < best_cost) {
      best_cost = cost;
      cur_cu->inter.mv[0] = x;
@ -157,13 +217,13 @@ void search_mv_full(picture *pic, uint8_t *pic_data, uint8_t *ref_data,
  step /= 2;
  if (step > 0) {
    search_mv_full(pic, pic_data, ref_data, cur_cu, step, orig_x, orig_y,
-                         x, y - step, depth);
+                   x, y - step, depth);
    search_mv_full(pic, pic_data, ref_data, cur_cu, step, orig_x, orig_y,
-                         x - step, y, depth);
+                   x - step, y, depth);
    search_mv_full(pic, pic_data, ref_data, cur_cu, step, orig_x, orig_y,
-                         x + step, y, depth);
+                   x + step, y, depth);
    search_mv_full(pic, pic_data, ref_data, cur_cu, step, orig_x, orig_y,
-                         x, y + step, depth);
+                   x, y + step, depth);
  }
 }

@ -268,7 +328,6 @@ void search_tree(encoder_control *encoder,
    if (border) {
      // Split blocks and remember to change x and y block positions
      uint8_t change = 1 << (MAX_DEPTH - 1 - depth);
-      SET_SPLITDATA(cur_cu, 1);
      search_tree(encoder, x_ctb, y_ctb, depth + 1);
      if (!border_x || border_split_x) {
        search_tree(encoder, x_ctb + change, y_ctb, depth + 1);
@ -305,8 +364,8 @@ void search_tree(encoder_control *encoder,
      int start_y = 0;
      // Convert from sub-pixel accuracy.
      if (ref_cu->type == CU_INTER) {
-        int start_x = ref_cu->inter.mv[0] >> 2;
-        int start_y = ref_cu->inter.mv[1] >> 2;
+        start_x = ref_cu->inter.mv[0] >> 2;
+        start_y = ref_cu->inter.mv[1] >> 2;
      }

      if (USE_FULL_SEARCH) {
@ -314,7 +373,7 @@ void search_tree(encoder_control *encoder,
                       cur_cu, 8, x, y,
                       start_x, start_y, depth);
      } else {
-        search_mv(cur_pic, cur_data, ref_pic->y_data, 
+        search_mv(cur_pic, ref_pic, 
                  cur_cu, x, y, 
                  start_x, start_y, depth);
      }
@ -385,26 +444,24 @@ uint32_t search_best_mode(encoder_control *encoder,
    cost += search_best_mode(encoder, x_ctb + change, y_ctb + change, depth + 1);

    // We split if the cost is better (0 cost -> not checked)
-    if (cost != 0 
+    if ( (encoder->in.cur_pic->slicetype == SLICE_I && depth < MIN_INTRA_SEARCH_DEPTH) ||
+        (cost != 0 
        && (best_intra_cost != 0 && cost + lambdaCost < best_intra_cost)
        && (best_inter_cost != 0
            && cost + lambdaCost < best_inter_cost
-            && encoder->in.cur_pic->slicetype != SLICE_I))
+            && encoder->in.cur_pic->slicetype != SLICE_I)))
    {
      // Set split to 1
-      picture_set_block_split(encoder->in.cur_pic, x_ctb, y_ctb, depth, 1);
      best_cost = cost + lambdaCost;
    } else if (best_inter_cost != 0 // Else, check if inter cost is smaller or the same as intra 
        && (best_inter_cost <= best_intra_cost || best_intra_cost == 0)
        && encoder->in.cur_pic->slicetype != SLICE_I)
    {
      // Set split to 0 and mode to inter.mode
-      picture_set_block_split(encoder->in.cur_pic, x_ctb, y_ctb, depth, 0);
      inter_set_block(encoder->in.cur_pic, x_ctb, y_ctb, depth, cur_cu);
      best_cost = best_inter_cost;
    } else { // Else, dont split and recursively set block mode
      // Set split to 0 and mode to intra.mode
-      picture_set_block_split(encoder->in.cur_pic, x_ctb, y_ctb, depth, 0);
      intra_set_block_mode(encoder->in.cur_pic, x_ctb, y_ctb, depth,
          cur_cu->intra.mode);
      best_cost = best_intra_cost;
@ -414,12 +471,10 @@ uint32_t search_best_mode(encoder_control *encoder,
             && encoder->in.cur_pic->slicetype != SLICE_I)
  {
    // Set split to 0 and mode to inter.mode
-    picture_set_block_split(encoder->in.cur_pic, x_ctb, y_ctb, depth, 0);
    inter_set_block(encoder->in.cur_pic, x_ctb, y_ctb, depth, cur_cu);
    best_cost = best_inter_cost;
  } else {
    // Set split to 0 and mode to intra.mode
-    picture_set_block_split(encoder->in.cur_pic, x_ctb, y_ctb, depth, 0);
    intra_set_block_mode(encoder->in.cur_pic, x_ctb, y_ctb, depth,
        cur_cu->intra.mode);
    best_cost = best_intra_cost;
@ -436,7 +491,7 @@ void search_slice_data(encoder_control *encoder)
  int16_t x_lcu, y_lcu;
  FILE *fp = 0, *fp2 = 0;

-  if (RENDER_CU && encoder->frame == 1) {
+  if (RENDER_CU) {
    fp = open_cu_file("cu_search.html");
    fp2 = open_cu_file("cu_best.html");
  }
@ -447,14 +502,14 @@ void search_slice_data(encoder_control *encoder)
      uint8_t depth = 0;
      // Recursive function for looping through all the sub-blocks
      search_tree(encoder, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, depth);
-      if (RENDER_CU && encoder->frame == 1) {
-        render_cu_file(encoder, depth, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, fp);
+      if (RENDER_CU) {
+        render_cu_file(encoder, encoder->in.cur_pic, depth, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, fp);
      }

      // Decide actual coding modes
      search_best_mode(encoder, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, depth);
-      if (RENDER_CU && encoder->frame == 1) {
-        render_cu_file(encoder, depth, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, fp2);
+      if (RENDER_CU) {
+        render_cu_file(encoder, encoder->in.cur_pic, depth, x_lcu << MAX_DEPTH, y_lcu << MAX_DEPTH, fp2);
      }
    }
  }