Merge branch 't-20140612'

2024-11-27 19:24:06 +00:00 · 2014-06-12 15:51:16 +03:00 · 2014-06-12 15:51:16 +03:00 · 83f07647e0
parent 3b7d532675 68ad323e84
commit 83f07647e0
15 changed files with 325 additions and 188 deletions
--- a/.gitignore
+++ b/.gitignore
@ -11,3 +11,7 @@
 *.exe
 *.o
 *.d
+
+*.log
+.kdev4
+src/kvazaar
--- a/src/checkpoint.h
+++ b/src/checkpoint.h
@ -41,7 +41,7 @@ extern int g_ckpt_record; //Do we record?
  } \
 } while (0)
    
-#define CHECKPOINTS_FINALIZE() do {fclose(g_ckpt_file); g_ckpt_file = NULL;} while (0)
+//Close the checkpoint file if one is open, then reset the handle to NULL.
+//The NULL guard makes this safe to invoke when g_ckpt_file was never opened.
+#define CHECKPOINTS_FINALIZE() do {if (g_ckpt_file) fclose(g_ckpt_file); g_ckpt_file = NULL;} while (0)

 #define CHECKPOINT_MARK(str, ...) do { \
  if (g_ckpt_file) { \
--- a/src/cu.h
+++ b/src/cu.h
@ -94,6 +94,23 @@ typedef struct
  cu_info_inter inter;
 } cu_info;

+//Emit a single CHECKPOINT record describing one coding unit.
+//  prefix_str: must be a string literal, it is pasted in front of the format string.
+//  cu:         accessed with '.', so pass the struct itself, not a pointer
+//              (presumably a cu_info; confirm against callers).
+//The record lists the general fields, the four intra[] entries and the inter fields.
+//NOTE: 'cu' is evaluated once per printed field, so it must not have side effects.
+//NOTE(review): cost/bitcost are printed with %u, which assumes they are unsigned int; confirm.
+//The expansion has no trailing semicolon; the caller supplies it.
+#define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \
+  "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \
+  "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \
+  "intra[1].cost=%u intra[1].bitcost=%u intra[1].mode=%d intra[1].mode_chroma=%d intra[1].tr_skip=%d " \
+  "intra[2].cost=%u intra[2].bitcost=%u intra[2].mode=%d intra[2].mode_chroma=%d intra[2].tr_skip=%d " \
+  "intra[3].cost=%u intra[3].bitcost=%u intra[3].mode=%d intra[3].mode_chroma=%d intra[3].tr_skip=%d " \
+  "inter.cost=%u inter.bitcost=%u inter.mv[0]=%d inter.mv[1]=%d inter.mvd[0]=%d inter.mvd[1]=%d " \
+  "inter.mv_cand=%d inter.mv_ref=%d inter.mv_dir=%d inter.mode=%d" \
+  , (cu).type, (cu).depth, (cu).part_size, (cu).tr_depth, (cu).coded, \
+  (cu).skipped, (cu).merged, (cu).merge_idx, (cu).cbf.y, (cu).cbf.u, (cu).cbf.v, \
+  (cu).intra[0].cost, (cu).intra[0].bitcost, (cu).intra[0].mode, (cu).intra[0].mode_chroma, (cu).intra[0].tr_skip, \
+  (cu).intra[1].cost, (cu).intra[1].bitcost, (cu).intra[1].mode, (cu).intra[1].mode_chroma, (cu).intra[1].tr_skip, \
+  (cu).intra[2].cost, (cu).intra[2].bitcost, (cu).intra[2].mode, (cu).intra[2].mode_chroma, (cu).intra[2].tr_skip, \
+  (cu).intra[3].cost, (cu).intra[3].bitcost, (cu).intra[3].mode, (cu).intra[3].mode_chroma, (cu).intra[3].tr_skip, \
+  (cu).inter.cost, (cu).inter.bitcost, (cu).inter.mv[0], (cu).inter.mv[1], (cu).inter.mvd[0], (cu).inter.mvd[1], \
+  (cu).inter.mv_cand, (cu).inter.mv_ref, (cu).inter.mv_dir, (cu).inter.mode)
+
 #define SUB_SCU_BIT_MASK (64 - 1)
 #define SUB_SCU(xy) (xy & SUB_SCU_BIT_MASK)
 #define LCU_CU_WIDTH 8
@ -143,6 +160,90 @@ typedef struct {
  cu_info cu[9*9+1];
 } lcu_t;

+//Emit one CHECKPOINT_CU record for every entry of (lcu).cu, i.e. cu[0]..cu[81].
+//The 82 explicit calls match the array size declared in lcu_t (cu_info cu[9*9+1]);
+//keep both in sync if that size ever changes.
+//  prefix_str: must be a string literal, " cu[N]" is pasted after it.
+//  lcu:        accessed with '.', so pass the struct itself, not a pointer.
+//NOTE: 'lcu' is evaluated many times, so it must not have side effects.
+#define CHECKPOINT_LCU(prefix_str, lcu) do { \
+  CHECKPOINT_CU(prefix_str " cu[0]", (lcu).cu[0]); \
+  CHECKPOINT_CU(prefix_str " cu[1]", (lcu).cu[1]); \
+  CHECKPOINT_CU(prefix_str " cu[2]", (lcu).cu[2]); \
+  CHECKPOINT_CU(prefix_str " cu[3]", (lcu).cu[3]); \
+  CHECKPOINT_CU(prefix_str " cu[4]", (lcu).cu[4]); \
+  CHECKPOINT_CU(prefix_str " cu[5]", (lcu).cu[5]); \
+  CHECKPOINT_CU(prefix_str " cu[6]", (lcu).cu[6]); \
+  CHECKPOINT_CU(prefix_str " cu[7]", (lcu).cu[7]); \
+  CHECKPOINT_CU(prefix_str " cu[8]", (lcu).cu[8]); \
+  CHECKPOINT_CU(prefix_str " cu[9]", (lcu).cu[9]); \
+  CHECKPOINT_CU(prefix_str " cu[10]", (lcu).cu[10]); \
+  CHECKPOINT_CU(prefix_str " cu[11]", (lcu).cu[11]); \
+  CHECKPOINT_CU(prefix_str " cu[12]", (lcu).cu[12]); \
+  CHECKPOINT_CU(prefix_str " cu[13]", (lcu).cu[13]); \
+  CHECKPOINT_CU(prefix_str " cu[14]", (lcu).cu[14]); \
+  CHECKPOINT_CU(prefix_str " cu[15]", (lcu).cu[15]); \
+  CHECKPOINT_CU(prefix_str " cu[16]", (lcu).cu[16]); \
+  CHECKPOINT_CU(prefix_str " cu[17]", (lcu).cu[17]); \
+  CHECKPOINT_CU(prefix_str " cu[18]", (lcu).cu[18]); \
+  CHECKPOINT_CU(prefix_str " cu[19]", (lcu).cu[19]); \
+  CHECKPOINT_CU(prefix_str " cu[20]", (lcu).cu[20]); \
+  CHECKPOINT_CU(prefix_str " cu[21]", (lcu).cu[21]); \
+  CHECKPOINT_CU(prefix_str " cu[22]", (lcu).cu[22]); \
+  CHECKPOINT_CU(prefix_str " cu[23]", (lcu).cu[23]); \
+  CHECKPOINT_CU(prefix_str " cu[24]", (lcu).cu[24]); \
+  CHECKPOINT_CU(prefix_str " cu[25]", (lcu).cu[25]); \
+  CHECKPOINT_CU(prefix_str " cu[26]", (lcu).cu[26]); \
+  CHECKPOINT_CU(prefix_str " cu[27]", (lcu).cu[27]); \
+  CHECKPOINT_CU(prefix_str " cu[28]", (lcu).cu[28]); \
+  CHECKPOINT_CU(prefix_str " cu[29]", (lcu).cu[29]); \
+  CHECKPOINT_CU(prefix_str " cu[30]", (lcu).cu[30]); \
+  CHECKPOINT_CU(prefix_str " cu[31]", (lcu).cu[31]); \
+  CHECKPOINT_CU(prefix_str " cu[32]", (lcu).cu[32]); \
+  CHECKPOINT_CU(prefix_str " cu[33]", (lcu).cu[33]); \
+  CHECKPOINT_CU(prefix_str " cu[34]", (lcu).cu[34]); \
+  CHECKPOINT_CU(prefix_str " cu[35]", (lcu).cu[35]); \
+  CHECKPOINT_CU(prefix_str " cu[36]", (lcu).cu[36]); \
+  CHECKPOINT_CU(prefix_str " cu[37]", (lcu).cu[37]); \
+  CHECKPOINT_CU(prefix_str " cu[38]", (lcu).cu[38]); \
+  CHECKPOINT_CU(prefix_str " cu[39]", (lcu).cu[39]); \
+  CHECKPOINT_CU(prefix_str " cu[40]", (lcu).cu[40]); \
+  CHECKPOINT_CU(prefix_str " cu[41]", (lcu).cu[41]); \
+  CHECKPOINT_CU(prefix_str " cu[42]", (lcu).cu[42]); \
+  CHECKPOINT_CU(prefix_str " cu[43]", (lcu).cu[43]); \
+  CHECKPOINT_CU(prefix_str " cu[44]", (lcu).cu[44]); \
+  CHECKPOINT_CU(prefix_str " cu[45]", (lcu).cu[45]); \
+  CHECKPOINT_CU(prefix_str " cu[46]", (lcu).cu[46]); \
+  CHECKPOINT_CU(prefix_str " cu[47]", (lcu).cu[47]); \
+  CHECKPOINT_CU(prefix_str " cu[48]", (lcu).cu[48]); \
+  CHECKPOINT_CU(prefix_str " cu[49]", (lcu).cu[49]); \
+  CHECKPOINT_CU(prefix_str " cu[50]", (lcu).cu[50]); \
+  CHECKPOINT_CU(prefix_str " cu[51]", (lcu).cu[51]); \
+  CHECKPOINT_CU(prefix_str " cu[52]", (lcu).cu[52]); \
+  CHECKPOINT_CU(prefix_str " cu[53]", (lcu).cu[53]); \
+  CHECKPOINT_CU(prefix_str " cu[54]", (lcu).cu[54]); \
+  CHECKPOINT_CU(prefix_str " cu[55]", (lcu).cu[55]); \
+  CHECKPOINT_CU(prefix_str " cu[56]", (lcu).cu[56]); \
+  CHECKPOINT_CU(prefix_str " cu[57]", (lcu).cu[57]); \
+  CHECKPOINT_CU(prefix_str " cu[58]", (lcu).cu[58]); \
+  CHECKPOINT_CU(prefix_str " cu[59]", (lcu).cu[59]); \
+  CHECKPOINT_CU(prefix_str " cu[60]", (lcu).cu[60]); \
+  CHECKPOINT_CU(prefix_str " cu[61]", (lcu).cu[61]); \
+  CHECKPOINT_CU(prefix_str " cu[62]", (lcu).cu[62]); \
+  CHECKPOINT_CU(prefix_str " cu[63]", (lcu).cu[63]); \
+  CHECKPOINT_CU(prefix_str " cu[64]", (lcu).cu[64]); \
+  CHECKPOINT_CU(prefix_str " cu[65]", (lcu).cu[65]); \
+  CHECKPOINT_CU(prefix_str " cu[66]", (lcu).cu[66]); \
+  CHECKPOINT_CU(prefix_str " cu[67]", (lcu).cu[67]); \
+  CHECKPOINT_CU(prefix_str " cu[68]", (lcu).cu[68]); \
+  CHECKPOINT_CU(prefix_str " cu[69]", (lcu).cu[69]); \
+  CHECKPOINT_CU(prefix_str " cu[70]", (lcu).cu[70]); \
+  CHECKPOINT_CU(prefix_str " cu[71]", (lcu).cu[71]); \
+  CHECKPOINT_CU(prefix_str " cu[72]", (lcu).cu[72]); \
+  CHECKPOINT_CU(prefix_str " cu[73]", (lcu).cu[73]); \
+  CHECKPOINT_CU(prefix_str " cu[74]", (lcu).cu[74]); \
+  CHECKPOINT_CU(prefix_str " cu[75]", (lcu).cu[75]); \
+  CHECKPOINT_CU(prefix_str " cu[76]", (lcu).cu[76]); \
+  CHECKPOINT_CU(prefix_str " cu[77]", (lcu).cu[77]); \
+  CHECKPOINT_CU(prefix_str " cu[78]", (lcu).cu[78]); \
+  CHECKPOINT_CU(prefix_str " cu[79]", (lcu).cu[79]); \
+  CHECKPOINT_CU(prefix_str " cu[80]", (lcu).cu[80]); \
+  CHECKPOINT_CU(prefix_str " cu[81]", (lcu).cu[81]); \
+} while(0)


 void coefficients_blit(const coefficient *orig, coefficient *dst,
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -21,6 +21,7 @@

 #include <string.h>

+#include "checkpoint.h"
 #include "encoderstate.h"
 #include "nal.h"

@ -665,6 +666,7 @@ static void add_checksum(encoder_state * const encoder_state)
    checksum_val = (checksum[i][0] << 24) + (checksum[i][1] << 16) +
                   (checksum[i][2] << 8) + (checksum[i][3]);
    WRITE_U(stream, checksum_val, 32, "picture_checksum");
+    CHECKPOINT("checksum[%d] = %u", i, checksum_val);
  }

  bitstream_align(stream);
@ -734,9 +736,7 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state)
    PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=write_bitstream_checksum,frame=%d,type=%c", main_state->global->frame, main_state->type);
  }
  
-  //FIXME: Why is this needed?
-//  assert(main_state->tile->frame->poc == main_state->global->poc);
-  main_state->tile->frame->poc = main_state->global->poc;
+  assert(main_state->tile->frame->poc == main_state->global->poc);
 }

 void encoder_state_write_bitstream_leaf(encoder_state * const encoder_state) {
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@ -46,6 +46,15 @@ static int encoder_state_config_tile_init(encoder_state * const encoder_state,
  
  const encoder_control * const encoder = encoder_state->encoder_control;
  encoder_state->tile->frame = videoframe_alloc(width, height, 0);
+  
+  //The frame is only checked for NULL further down, so it must not be
+  //dereferenced here when videoframe_alloc() failed.
+  if (encoder_state->tile->frame) {
+    if (encoder_state->type == ENCODER_STATE_TYPE_MAIN) {
+      //The main state allocates the source and reconstruction images itself.
+      //NOTE(review): the image_alloc() results are not checked here.
+      encoder_state->tile->frame->source = image_alloc(encoder_state->tile->frame->width, encoder_state->tile->frame->height, 0);
+      encoder_state->tile->frame->rec = image_alloc(encoder_state->tile->frame->width, encoder_state->tile->frame->height, 0);
+    } else {
+      //Other states do not keep their own copy of the image.
+      encoder_state->tile->frame->source = NULL;
+      encoder_state->tile->frame->rec = NULL;
+    }
+  }

  if (!encoder_state->tile->frame) {
    printf("Error allocating videoframe!\r\n");
@ -95,6 +104,14 @@ static void encoder_state_config_tile_finalize(encoder_state * const encoder_sta
  yuv_t_free(encoder_state->tile->hor_buf_search);
  yuv_t_free(encoder_state->tile->ver_buf_search);
  
+  if (encoder_state->type == ENCODER_STATE_TYPE_MAIN) {
+    //If not a parent, then we can avoid keeping a copy of the image
+    image_free(encoder_state->tile->frame->source);
+    image_free(encoder_state->tile->frame->rec);
+  } else {
+    assert(!encoder_state->tile->frame->source);
+    assert(!encoder_state->tile->frame->rec);
+  }
  videoframe_free(encoder_state->tile->frame);
  encoder_state->tile->frame = NULL;
  
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -84,58 +84,6 @@ int encoder_state_match_children_of_previous_frame(encoder_state * const encoder
  return 1;
 }

-static void encoder_state_blit_pixels(const encoder_state * const target_enc, pixel * const target, const encoder_state * const source_enc, const pixel * const source, const int is_y_channel) {
-  const int source_offset_x = source_enc->tile->lcu_offset_x * LCU_WIDTH;
-  const int source_offset_y = source_enc->tile->lcu_offset_y * LCU_WIDTH;
-  
-  const int target_offset_x = target_enc->tile->lcu_offset_x * LCU_WIDTH;
-  const int target_offset_y = target_enc->tile->lcu_offset_y * LCU_WIDTH;
-  
-  int source_stride = source_enc->tile->frame->width;
-  int target_stride = target_enc->tile->frame->width;
-  
-  int width;
-  int height;
-  
-  int source_offset;
-  int target_offset;
-  
-  //Do nothing if the source and the destination is the same!
-  if (source_enc->tile == target_enc->tile) return;
-
-  if (is_y_channel) {
-    target_offset = source_offset_x + source_offset_y * target_enc->tile->frame->width;
-    source_offset = target_offset_x + target_offset_y * source_enc->tile->frame->width;
-  } else {
-    target_offset = source_offset_x/2 + source_offset_y/2 * target_enc->tile->frame->width/2;
-    source_offset = target_offset_x/2 + target_offset_y/2 * source_enc->tile->frame->width/2;
-  }
-  
-  if (target_enc->children) {
-    //Use information from the source
-    width = MIN(source_enc->tile->frame->width_in_lcu * LCU_WIDTH, target_enc->tile->frame->width - source_offset_x);
-    height = MIN(source_enc->tile->frame->height_in_lcu * LCU_WIDTH, target_enc->tile->frame->height - source_offset_y);
-  } else {
-    //Use information from the target
-    width = MIN(target_enc->tile->frame->width_in_lcu * LCU_WIDTH, source_enc->tile->frame->width - target_offset_x);
-    height = MIN(target_enc->tile->frame->height_in_lcu * LCU_WIDTH, source_enc->tile->frame->height - target_offset_y);
-  }
-  
-  if (!is_y_channel) {
-    width /= 2;
-    height /= 2;
-    
-    source_stride /= 2;
-    target_stride /= 2;
-  }
-  
-  //picture_blit_pixels(source + source_offset, target + target_offset, width, height, source_enc->cur_pic->width, target_enc->cur_pic->width);
-  pixels_blit(source + source_offset, target + target_offset, width, height, source_stride, target_stride);
-}
-
-
-
-
 static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, const lcu_order_element * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) {
  videoframe* const frame = encoder_state->tile->frame;
  
@ -145,15 +93,15 @@ static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, c
    const int by = lcu->position.y;
    
    //Copy the bottom row of this LCU to the horizontal buffer
-    pixels_blit(&frame->rec->y[rdpy * frame->width + rdpx],
+    pixels_blit(&frame->rec->y[rdpy * frame->rec->stride + rdpx],
                        &hor_buf->y[lcu->position_px.x + by * frame->width],
-                        lcu->size.x, 1, frame->width, frame->width);
-    pixels_blit(&frame->rec->u[(rdpy/2) * frame->width/2 + (rdpx/2)],
+                        lcu->size.x, 1, frame->rec->stride, frame->width);
+    pixels_blit(&frame->rec->u[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
                        &hor_buf->u[lcu->position_px.x / 2 + by * frame->width / 2],
-                        lcu->size.x / 2, 1, frame->width / 2, frame->width / 2);
-    pixels_blit(&frame->rec->v[(rdpy/2) * frame->width/2 + (rdpx/2)],
+                        lcu->size.x / 2, 1, frame->rec->stride / 2, frame->width / 2);
+    pixels_blit(&frame->rec->v[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
                        &hor_buf->v[lcu->position_px.x / 2 + by * frame->width / 2],
-                        lcu->size.x / 2, 1, frame->width / 2, frame->width / 2);
+                        lcu->size.x / 2, 1, frame->rec->stride / 2, frame->width / 2);
  }
  
  if (ver_buf) {
@ -163,15 +111,15 @@ static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, c
    
    
    //Copy the right row of this LCU to the vertical buffer.
-    pixels_blit(&frame->rec->y[rdpy * frame->width + rdpx],
+    pixels_blit(&frame->rec->y[rdpy * frame->rec->stride + rdpx],
                        &ver_buf->y[lcu->position_px.y + bx * frame->height],
-                        1, lcu->size.y, frame->width, 1);
-    pixels_blit(&frame->rec->u[(rdpy/2) * frame->width/2 + (rdpx/2)],
+                        1, lcu->size.y, frame->rec->stride, 1);
+    pixels_blit(&frame->rec->u[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
                        &ver_buf->u[lcu->position_px.y / 2 + bx * frame->height / 2],
-                        1, lcu->size.y / 2, frame->width / 2, 1);
-    pixels_blit(&frame->rec->v[(rdpy/2) * frame->width/2 + (rdpx/2)],
+                        1, lcu->size.y / 2, frame->rec->stride / 2, 1);
+    pixels_blit(&frame->rec->v[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)],
                        &ver_buf->v[lcu->position_px.y / 2 + bx * frame->height / 2],
-                        1, lcu->size.y / 2, frame->width / 2, 1);
+                        1, lcu->size.y / 2, frame->rec->stride / 2, 1);
  }
  
 }
@ -300,6 +248,12 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
    // Merge only if both luma and chroma can be merged
    sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag;
    sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag;
+    
+    assert(sao_luma->eo_class < SAO_NUM_EO);
+    assert(sao_chroma->eo_class < SAO_NUM_EO);
+    
+    CHECKPOINT_SAO_INFO("sao_luma", *sao_luma);
+    CHECKPOINT_SAO_INFO("sao_chroma", *sao_chroma);
  }
  
  
@ -503,9 +457,15 @@ static void encoder_state_encode(encoder_state * const main_state) {
      encoder_state *sub_state = &(main_state->children[i]);
      
      if (sub_state->tile != main_state->tile) {
-        encoder_state_blit_pixels(sub_state, sub_state->tile->frame->source->y, main_state, main_state->tile->frame->source->y, 1);
-        encoder_state_blit_pixels(sub_state, sub_state->tile->frame->source->u, main_state, main_state->tile->frame->source->u, 0);
-        encoder_state_blit_pixels(sub_state, sub_state->tile->frame->source->v, main_state, main_state->tile->frame->source->v, 0);
+        const int offset_x = sub_state->tile->lcu_offset_x * LCU_WIDTH;
+        const int offset_y = sub_state->tile->lcu_offset_y * LCU_WIDTH;
+        const int width = MIN(sub_state->tile->frame->width_in_lcu * LCU_WIDTH, main_state->tile->frame->width - offset_x);
+        const int height = MIN(sub_state->tile->frame->height_in_lcu * LCU_WIDTH, main_state->tile->frame->height - offset_y);
+        
+        assert(!sub_state->tile->frame->source);
+        assert(!sub_state->tile->frame->rec);
+        sub_state->tile->frame->source = image_make_subimage(main_state->tile->frame->source, offset_x, offset_y, width, height);
+        sub_state->tile->frame->rec = image_make_subimage(main_state->tile->frame->rec, offset_x, offset_y, width, height);
      }
      
      //To be the last split, we require that every child is a chain
@ -586,9 +546,10 @@ static void encoder_state_encode(encoder_state * const main_state) {
    for (i=0; main_state->children[i].encoder_control; ++i) {
      encoder_state *sub_state = &(main_state->children[i]);
      if (sub_state->tile != main_state->tile) {
-        encoder_state_blit_pixels(main_state, main_state->tile->frame->rec->y, sub_state, sub_state->tile->frame->rec->y, 1);
-        encoder_state_blit_pixels(main_state, main_state->tile->frame->rec->u, sub_state, sub_state->tile->frame->rec->u, 0);
-        encoder_state_blit_pixels(main_state, main_state->tile->frame->rec->v, sub_state, sub_state->tile->frame->rec->v, 0);
+        image_free(sub_state->tile->frame->source);
+        image_free(sub_state->tile->frame->rec);
+        sub_state->tile->frame->source = NULL;
+        sub_state->tile->frame->rec = NULL;
      }
    }
  } else {
--- a/src/filter.c
+++ b/src/filter.c
@ -182,7 +182,7 @@ void filter_deblock_edge_luma(encoder_state * const encoder_state,
  }

  {
-    int32_t stride = frame->width;
+    int32_t stride = frame->rec->stride;
    int32_t offset = stride;
    int32_t beta_offset_div2 = encoder->beta_offset_div2;
    int32_t tc_offset_div2   = encoder->tc_offset_div2;
@ -313,7 +313,7 @@ void filter_deblock_edge_chroma(encoder_state * const encoder_state,

  // For each subpart
  {
-    int32_t stride = frame->width >> 1;
+    int32_t stride = frame->rec->stride >> 1;
    int32_t tc_offset_div2 = encoder->tc_offset_div2;
    // TODO: support 10+bits
    pixel *src_u = &frame->rec->u[x + y*stride];
--- a/src/global.h
+++ b/src/global.h
@ -117,7 +117,7 @@ typedef int16_t coefficient;

 #define LOG2_LCU_WIDTH 6
 // CU_TO_PIXEL = y * lcu_width * pic_width + x * lcu_width
-#define CU_TO_PIXEL(x, y, depth, width) (((y) << (LOG2_LCU_WIDTH - (depth))) * (width) \
+#define CU_TO_PIXEL(x, y, depth, stride) (((y) << (LOG2_LCU_WIDTH - (depth))) * (stride) \
                                         + ((x) << (LOG2_LCU_WIDTH - (depth))))
 //#define SIGN3(x) ((x) > 0) ? +1 : ((x) == 0 ? 0 : -1)
 #define SIGN3(x) (((x) > 0) - ((x) < 0))
--- a/src/image.c
+++ b/src/image.c
@ -31,6 +31,7 @@
 #include <math.h>
 #include <assert.h>

+#include "checkpoint.h"
 #include "sao.h"

 /**
@ -80,8 +81,12 @@ int image_free(image * const im)
  assert(im->base_image == im || im->refcount == 0);
  
  int32_t new_refcount = ATOMIC_DEC(&(im->base_image->refcount));
+  //If we're freeing a subimage, then we must free the pointer
+  //Base image may be stored in image_list, and should not be freed
+  //FIXME I don't find this very clean...
+  if (new_refcount > 0 && im->base_image != im) free(im);
  if (new_refcount > 0) return 1;
-  FREE_POINTER(im->fulldata);
+  FREE_POINTER(im->base_image->fulldata);
  
  //Just to make the program crash when using those values after the free
  im->y = im->u = im->v = im->data[COLOR_Y] = im->data[COLOR_U] = im->data[COLOR_V] = NULL;
@ -156,7 +161,7 @@ void yuv_t_free(yuv_t * yuv)
 * \returns Sum of Absolute Differences
 */
 static unsigned cor_sad(const pixel *pic_data, const pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_width)
+                        int block_width, int block_height, unsigned pic_stride)
 {
  pixel ref = *ref_data;
  int x, y;
@ -164,7 +169,7 @@ static unsigned cor_sad(const pixel *pic_data, const pixel *ref_data,

  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_width + x] - ref);
+      sad += abs(pic_data[y * pic_stride + x] - ref);
    }
  }

@ -183,14 +188,14 @@ static unsigned cor_sad(const pixel *pic_data, const pixel *ref_data,
 * \returns Sum of Absolute Differences
 */
 static unsigned ver_sad(const pixel *pic_data, const pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_width)
+                        int block_width, int block_height, unsigned pic_stride)
 {
  int x, y;
  unsigned sad = 0;

  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_width + x] - ref_data[x]);
+      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
    }
  }

@ -209,14 +214,14 @@ static unsigned ver_sad(const pixel *pic_data, const pixel *ref_data,
 * \returns Sum of Absolute Differences
 */
 static unsigned hor_sad(const pixel *pic_data, const pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_width, unsigned ref_width)
+                        int block_width, int block_height, unsigned pic_stride, unsigned ref_stride)
 {
  int x, y;
  unsigned sad = 0;

  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_width + x] - ref_data[y * ref_width]);
+      sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]);
    }
  }

@ -265,8 +270,8 @@ static unsigned image_interpolated_sad(const image *pic, const image *ref,
  // movement vector is pointing to. That point might be outside the buffer,
  // but that is ok because we project the movement vector to the buffer
  // before dereferencing the pointer.
-  pic_data = &pic->y[pic_y * pic->width + pic_x];
-  ref_data = &ref->y[ref_y * ref->width + ref_x];
+  pic_data = &pic->y[pic_y * pic->stride + pic_x];
+  ref_data = &ref->y[ref_y * ref->stride + ref_x];

  // The handling of movement vectors that point outside the picture is done
  // in the following way.
@ -278,86 +283,86 @@ static unsigned image_interpolated_sad(const image *pic, const image *ref,
  //   being compared is correct.
  if (top && left) {
    result += cor_sad(pic_data,
-                      &ref_data[top * ref->width + left],
-                      left, top, pic->width);
+                      &ref_data[top * ref->stride + left],
+                      left, top, pic->stride);
    result += ver_sad(&pic_data[left],
-                      &ref_data[top * ref->width + left],
-                      block_width - left, top, pic->width);
-    result += hor_sad(&pic_data[top * pic->width],
-                      &ref_data[top * ref->width + left],
-                      left, block_height - top, pic->width, ref->width);
-    result += reg_sad(&pic_data[top * pic->width + left],
-                      &ref_data[top * ref->width + left],
-                      block_width - left, block_height - top, pic->width, ref->width);
+                      &ref_data[top * ref->stride + left],
+                      block_width - left, top, pic->stride);
+    result += hor_sad(&pic_data[top * pic->stride],
+                      &ref_data[top * ref->stride + left],
+                      left, block_height - top, pic->stride, ref->stride);
+    result += reg_sad(&pic_data[top * pic->stride + left],
+                      &ref_data[top * ref->stride + left],
+                      block_width - left, block_height - top, pic->stride, ref->stride);
  } else if (top && right) {
    result += ver_sad(pic_data,
-                      &ref_data[top * ref->width],
-                      block_width - right, top, pic->width);
+                      &ref_data[top * ref->stride],
+                      block_width - right, top, pic->stride);
    result += cor_sad(&pic_data[block_width - right],
-                      &ref_data[top * ref->width + (block_width - right - 1)],
-                      right, top, pic->width);
-    result += reg_sad(&pic_data[top * pic->width],
-                      &ref_data[top * ref->width],
-                      block_width - right, block_height - top, pic->width, ref->width);
-    result += hor_sad(&pic_data[top * pic->width + (block_width - right)],
-                      &ref_data[top * ref->width + (block_width - right - 1)],
-                      right, block_height - top, pic->width, ref->width);
+                      &ref_data[top * ref->stride + (block_width - right - 1)],
+                      right, top, pic->stride);
+    result += reg_sad(&pic_data[top * pic->stride],
+                      &ref_data[top * ref->stride],
+                      block_width - right, block_height - top, pic->stride, ref->stride);
+    result += hor_sad(&pic_data[top * pic->stride + (block_width - right)],
+                      &ref_data[top * ref->stride + (block_width - right - 1)],
+                      right, block_height - top, pic->stride, ref->stride);
  } else if (bottom && left) {
    result += hor_sad(pic_data,
                      &ref_data[left],
-                      left, block_height - bottom, pic->width, ref->width);
+                      left, block_height - bottom, pic->stride, ref->stride);
    result += reg_sad(&pic_data[left],
                      &ref_data[left],
-                      block_width - left, block_height - bottom, pic->width, ref->width);
-    result += cor_sad(&pic_data[(block_height - bottom) * pic->width],
-                      &ref_data[(block_height - bottom - 1) * ref->width + left],
-                      left, bottom, pic->width);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->width + left],
-                      &ref_data[(block_height - bottom - 1) * ref->width + left],
-                      block_width - left, bottom, pic->width);
+                      block_width - left, block_height - bottom, pic->stride, ref->stride);
+    result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
+                      &ref_data[(block_height - bottom - 1) * ref->stride + left],
+                      left, bottom, pic->stride);
+    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
+                      &ref_data[(block_height - bottom - 1) * ref->stride + left],
+                      block_width - left, bottom, pic->stride);
  } else if (bottom && right) {
    result += reg_sad(pic_data,
                      ref_data,
-                      block_width - right, block_height - bottom, pic->width, ref->width);
+                      block_width - right, block_height - bottom, pic->stride, ref->stride);
    result += hor_sad(&pic_data[block_width - right],
                      &ref_data[block_width - right - 1],
-                      right, block_height - bottom, pic->width, ref->width);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->width],
-                      &ref_data[(block_height - bottom - 1) * ref->width],
-                      block_width - right, bottom, pic->width);
-    result += cor_sad(&pic_data[(block_height - bottom) * pic->width + block_width - right],
-                      &ref_data[(block_height - bottom - 1) * ref->width + block_width - right - 1],
-                      right, bottom, pic->width);
+                      right, block_height - bottom, pic->stride, ref->stride);
+    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
+                      &ref_data[(block_height - bottom - 1) * ref->stride],
+                      block_width - right, bottom, pic->stride);
+    result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
+                      &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
+                      right, bottom, pic->stride);
  } else if (top) {
    result += ver_sad(pic_data,
-                      &ref_data[top * ref->width],
-                      block_width, top, pic->width);
-    result += reg_sad(&pic_data[top * pic->width],
-                      &ref_data[top * ref->width],
-                      block_width, block_height - top, pic->width, ref->width);
+                      &ref_data[top * ref->stride],
+                      block_width, top, pic->stride);
+    result += reg_sad(&pic_data[top * pic->stride],
+                      &ref_data[top * ref->stride],
+                      block_width, block_height - top, pic->stride, ref->stride);
  } else if (bottom) {
    result += reg_sad(pic_data,
                      ref_data,
-                      block_width, block_height - bottom, pic->width, ref->width);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->width],
-                      &ref_data[(block_height - bottom - 1) * ref->width],
-                      block_width, bottom, pic->width);
+                      block_width, block_height - bottom, pic->stride, ref->stride);
+    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
+                      &ref_data[(block_height - bottom - 1) * ref->stride],
+                      block_width, bottom, pic->stride);
  } else if (left) {
    result += hor_sad(pic_data,
                      &ref_data[left],
-                      left, block_height, pic->width, ref->width);
+                      left, block_height, pic->stride, ref->stride);
    result += reg_sad(&pic_data[left],
                      &ref_data[left],
-                      block_width - left, block_height, pic->width, ref->width);
+                      block_width - left, block_height, pic->stride, ref->stride);
  } else if (right) {
    result += reg_sad(pic_data,
                      ref_data,
-                      block_width - right, block_height, pic->width, ref->width);
+                      block_width - right, block_height, pic->stride, ref->stride);
    result += hor_sad(&pic_data[block_width - right],
                      &ref_data[block_width - right - 1],
-                      right, block_height, pic->width, ref->width);
+                      right, block_height, pic->stride, ref->stride);
  } else {
-    result += reg_sad(pic_data, ref_data, block_width, block_height, pic->width, ref->width);
+    result += reg_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride);
  }

  return result;
@ -373,9 +378,9 @@ unsigned image_calc_sad(const image *pic, const image *ref, int pic_x, int pic_y
  {
    // Reference block is completely inside the frame, so just calculate the
    // SAD directly. This is the most common case, which is why it's first.
-    const pixel *pic_data = &pic->y[pic_y * pic->width + pic_x];
-    const pixel *ref_data = &ref->y[ref_y * ref->width + ref_x];
-    return reg_sad(pic_data, ref_data, block_width, block_height, pic->width, ref->width);
+    const pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
+    const pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x];
+    return reg_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride);
  } else {
    // Call a routine that knows how to interpolate pixels outside the frame.
    return image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height);
@ -703,6 +708,25 @@ void pixels_blit(const pixel * const orig, pixel * const dst,
  assert(width <= orig_stride);
  assert(width <= dst_stride);

+#ifdef CHECKPOINTS
+  for (y = 0; y < height; ++y) { // Dump every row of the source block as hex into the checkpoint log.
+    char buffer[3*width + 1]; // "XX " (3 chars) per pixel plus the terminating NUL
+    int p;
+    for (p = 0; p < width; ++p) {
+      sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride + p]); // index column p, not just the first pixel of the row
+    }
+    buffer[3*width] = 0; // keeps the string terminated even when width == 0
+    CHECKPOINT("pixels_blit: %04d: %s", y, buffer);
+  }
+#endif //CHECKPOINTS
+
+  if (orig == dst) {
+    //If we have the same array, then we should have the same stride
+    assert(orig_stride == dst_stride);
+    return;
+  }
+  assert(orig != dst || orig_stride == dst_stride);
+
  for (y = 0; y < height; ++y) {
    memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(pixel));
  }
--- a/src/image.h
+++ b/src/image.h
@ -65,6 +65,7 @@ typedef struct {

 image *image_alloc(const int32_t width, const int32_t height, const int32_t poc);
 int image_free(image * im);
+image *image_make_subimage(image * const orig_image, const unsigned int x_offset, const unsigned int y_offset, const unsigned int width, const unsigned int height);

 yuv_t * yuv_t_alloc(int luma_size);
 void yuv_t_free(yuv_t * yuv);
--- a/src/sao.c
+++ b/src/sao.c
@ -557,13 +557,13 @@ void sao_reconstruct(const encoder_control * const encoder, videoframe * frame,
  pixel buf_rec[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
  pixel new_rec[LCU_WIDTH * LCU_WIDTH];
  // Calling CU_TO_PIXEL with depth 1 is the same as using block size of 32.
-  pixel *lcu_rec = &recdata[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, pic_stride)];
+  pixel *lcu_rec = &recdata[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, frame->rec->stride>>is_chroma)];
  const pixel *old_lcu_rec = &old_rec[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, pic_stride)];

  vector2d ofs;
  vector2d tl = { 1, 1 };
  vector2d br = { 1, 1 };
-  vector2d block = { LCU_WIDTH, LCU_WIDTH };
+  vector2d block;

  if (sao->type == SAO_TYPE_NONE) {
    return;
@ -585,6 +585,9 @@ void sao_reconstruct(const encoder_control * const encoder, videoframe * frame,
  assert(ofs.x + tl.x + block.x + br.x <= frame->width);
  assert(ofs.y + tl.y + block.y + br.y <= frame->height);
  
+  CHECKPOINT("ofs.x=%d ofs.y=%d tl.x=%d tl.y=%d block.x=%d block.y=%d br.x=%d br.y=%d", 
+             ofs.x, ofs.y, tl.x, tl.y, block.x, block.y, br.x, br.y);
+  
  // Data to tmp buffer.
  pixels_blit(&old_lcu_rec[ofs.y * pic_stride + ofs.x],
                      buf_rec,
@ -600,8 +603,8 @@ void sao_reconstruct(const encoder_control * const encoder, videoframe * frame,

  // Copy reconstructed block from tmp buffer to rec image.
  pixels_blit(&new_rec[(tl.y + ofs.y) * lcu_stride + (tl.x + ofs.x)],
-                      &lcu_rec[(tl.y + ofs.y) * pic_stride + (tl.x + ofs.x)],
-                      block.x, block.y, lcu_stride, pic_stride);
+                      &lcu_rec[(tl.y + ofs.y) * (frame->rec->stride >> is_chroma) + (tl.x + ofs.x)],
+                      block.x, block.y, lcu_stride, frame->rec->stride >> is_chroma);
 }


@ -737,6 +740,14 @@ static void sao_search_best_mode(const encoder_state * const encoder_state, cons
 {
  sao_info edge_sao;
  sao_info band_sao;
+  
+  //Avoid "random" uninitialized value
+  edge_sao.band_position = 0;
+  edge_sao.eo_class = SAO_EO0;
+  band_sao.offsets[0] = 0;
+  band_sao.eo_class = SAO_EO0;
+  //memset(&edge_sao, 0, sizeof(sao_info));
+  //memset(&band_sao, 0, sizeof(sao_info));

  sao_search_edge_sao(encoder_state, data, recdata, block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left);
  sao_search_band_sao(encoder_state, data, recdata, block_width, block_height, buf_cnt, &band_sao, sao_top, sao_left);
@ -814,12 +825,12 @@ static void sao_search_best_mode(const encoder_state * const encoder_state, cons

  // Copy data to temporary buffers and init orig and rec lists to point to those buffers.
  for (color_i = COLOR_U; color_i <= COLOR_V; ++color_i) {
-    pixel *data = &frame->source->data[color_i][CU_TO_PIXEL(x_ctb, y_ctb, 1, frame->width / 2)];
-    pixel *recdata = &frame->rec->data[color_i][CU_TO_PIXEL(x_ctb, y_ctb, 1, frame->width / 2)];
+    pixel *data = &frame->source->data[color_i][CU_TO_PIXEL(x_ctb, y_ctb, 1, frame->source->stride / 2)];
+    pixel *recdata = &frame->rec->data[color_i][CU_TO_PIXEL(x_ctb, y_ctb, 1, frame->rec->stride / 2)];
    pixels_blit(data, orig[color_i - 1], block_width, block_height,
-                        frame->width / 2, block_width);
+                        frame->source->stride / 2, block_width);
    pixels_blit(recdata, rec[color_i - 1], block_width, block_height,
-                        frame->width / 2, block_width);
+                        frame->rec->stride / 2, block_width);
    orig_list[color_i - 1] = &orig[color_i - 1][0];
    rec_list[color_i - 1] = &rec[color_i - 1][0];
  }
@ -834,8 +845,8 @@ void sao_search_luma(const encoder_state * const encoder_state, const videoframe
  pixel rec[LCU_LUMA_SIZE];
  const pixel * orig_list[1] = { NULL };
  const pixel * rec_list[1] = { NULL };
-  pixel *data = &frame->source->y[CU_TO_PIXEL(x_ctb, y_ctb, 0, frame->width)];
-  pixel *recdata = &frame->rec->y[CU_TO_PIXEL(x_ctb, y_ctb, 0, frame->width)];
+  pixel *data = &frame->source->y[CU_TO_PIXEL(x_ctb, y_ctb, 0, frame->source->stride)];
+  pixel *recdata = &frame->rec->y[CU_TO_PIXEL(x_ctb, y_ctb, 0, frame->rec->stride)];
  int block_width = LCU_WIDTH;
  int block_height = LCU_WIDTH;

@ -850,8 +861,8 @@ void sao_search_luma(const encoder_state * const encoder_state, const videoframe
  sao->type = SAO_TYPE_EDGE;

  // Fill temporary buffers with picture data.
-  pixels_blit(data, orig, block_width, block_height, frame->width, block_width);
-  pixels_blit(recdata, rec, block_width, block_height, frame->width, block_width);
+  pixels_blit(data, orig, block_width, block_height, frame->source->stride, block_width);
+  pixels_blit(recdata, rec, block_width, block_height, frame->rec->stride, block_width);

  orig_list[0] = orig;
  rec_list[0] = rec;
@ -866,12 +877,13 @@ void sao_reconstruct_frame(encoder_state * const encoder_state)
  // These are needed because SAO needs the pre-SAO pixels form left and
  // top LCUs. Single pixel wide buffers, like what search_lcu takes, would
  // be enough though.
-  pixel *new_y_data = MALLOC(pixel, frame->width * frame->height);
-  pixel *new_u_data = MALLOC(pixel, (frame->width * frame->height) >> 2);
-  pixel *new_v_data = MALLOC(pixel, (frame->width * frame->height) >> 2);
-  memcpy(new_y_data, frame->rec->y, sizeof(pixel) * frame->width * frame->height);
-  memcpy(new_u_data, frame->rec->u, sizeof(pixel) * (frame->width * frame->height) >> 2);
-  memcpy(new_v_data, frame->rec->v, sizeof(pixel) * (frame->width * frame->height) >> 2);
+  pixel *new_y_data = MALLOC(pixel, frame->rec->width * frame->rec->height);
+  pixel *new_u_data = MALLOC(pixel, (frame->rec->width * frame->rec->height) >> 2);
+  pixel *new_v_data = MALLOC(pixel, (frame->rec->width * frame->rec->height) >> 2);
+  
+  pixels_blit(frame->rec->y, new_y_data, frame->rec->width, frame->rec->height, frame->rec->stride, frame->rec->width);
+  pixels_blit(frame->rec->u, new_u_data, frame->rec->width/2, frame->rec->height/2, frame->rec->stride/2, frame->rec->width/2);
+  pixels_blit(frame->rec->v, new_v_data, frame->rec->width/2, frame->rec->height/2, frame->rec->stride/2, frame->rec->width/2);

  for (lcu.y = 0; lcu.y < frame->height_in_lcu; lcu.y++) {
    for (lcu.x = 0; lcu.x < frame->width_in_lcu; lcu.x++) {
--- a/src/sao.h
+++ b/src/sao.h
@ -24,6 +24,7 @@
 * \brief Coding Unit (CU) and picture data related functions.
 */

+#include "checkpoint.h"
 #include "global.h"
 #include "videoframe.h"
 #include "encoder.h"
@ -45,6 +46,13 @@ typedef struct sao_info_struct {
  int offsets[NUM_SAO_EDGE_CATEGORIES];
 } sao_info;

+#define CHECKPOINT_SAO_INFO(prefix_str, sao) CHECKPOINT(prefix_str " type=%d eo_class=%d ddistortion=%d " \
+  "merge_left_flag=%d merge_up_flag=%d band_position=%d " \
+  "offsets[0]=%d offsets[1]=%d offsets[2]=%d offsets[3]=%d offsets[4]=%d", \
+  (sao).type, (sao).eo_class, (sao).ddistortion, \
+  (sao).merge_left_flag, (sao).merge_up_flag, (sao).band_position, \
+  (sao).offsets[0], (sao).offsets[1], (sao).offsets[2], (sao).offsets[3], (sao).offsets[4])
+

 void init_sao_info(sao_info *sao);
 void sao_search_chroma(const encoder_state * encoder_state, const videoframe *frame, unsigned x_ctb, unsigned y_ctb, sao_info *sao, sao_info *sao_top, sao_info *sao_left);
--- a/src/search.c
+++ b/src/search.c
@ -1205,22 +1205,20 @@ static void init_lcu_t(const encoder_state * const encoder_state, const int x, c
  // Copy LCU pixels.
  {
    const videoframe * const frame = encoder_state->tile->frame;
-    int pic_width = frame->width;
-    int x_max = MIN(x + LCU_WIDTH, pic_width) - x;
+    int x_max = MIN(x + LCU_WIDTH, frame->width) - x;
    int y_max = MIN(y + LCU_WIDTH, frame->height) - y;

    int x_c = x / 2;
    int y_c = y / 2;
-    int pic_width_c = pic_width / 2;
    int x_max_c = x_max / 2;
    int y_max_c = y_max / 2;

-    pixels_blit(&frame->source->y[x + y * pic_width], lcu->ref.y,
-                        x_max, y_max, pic_width, LCU_WIDTH);
-    pixels_blit(&frame->source->u[x_c + y_c * pic_width_c], lcu->ref.u,
-                        x_max_c, y_max_c, pic_width_c, LCU_WIDTH / 2);
-    pixels_blit(&frame->source->v[x_c + y_c * pic_width_c], lcu->ref.v,
-                        x_max_c, y_max_c, pic_width_c, LCU_WIDTH / 2);
+    pixels_blit(&frame->source->y[x + y * frame->source->stride], lcu->ref.y,
+                        x_max, y_max, frame->source->stride, LCU_WIDTH);
+    pixels_blit(&frame->source->u[x_c + y_c * (frame->source->stride / 2)], lcu->ref.u, // row offset uses the same stride/2 pitch that is passed below
+                        x_max_c, y_max_c, frame->source->stride / 2, LCU_WIDTH / 2);
+    pixels_blit(&frame->source->v[x_c + y_c * (frame->source->stride / 2)], lcu->ref.v, // row offset uses the same stride/2 pitch that is passed below
+                        x_max_c, y_max_c, frame->source->stride / 2, LCU_WIDTH / 2);
  }
 }

@ -1259,15 +1257,15 @@ static void copy_lcu_to_cu_data(const encoder_state * const encoder_state, int x
    const int luma_index = x_px + y_px * pic_width;
    const int chroma_index = (x_px / 2) + (y_px / 2) * (pic_width / 2);

-    pixels_blit(lcu->rec.y, &pic->rec->y[luma_index],
-                        x_max, y_max, LCU_WIDTH, pic_width);
+    pixels_blit(lcu->rec.y, &pic->rec->y[x_px + y_px * pic->rec->stride],
+                        x_max, y_max, LCU_WIDTH, pic->rec->stride);
    coefficients_blit(lcu->coeff.y, &pic->coeff_y[luma_index],
                        x_max, y_max, LCU_WIDTH, pic_width);

-    pixels_blit(lcu->rec.u, &pic->rec->u[chroma_index],
-                        x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2);
-    pixels_blit(lcu->rec.v, &pic->rec->v[chroma_index],
-                        x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2);
+    pixels_blit(lcu->rec.u, &pic->rec->u[(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2)],
+                        x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2);
+    pixels_blit(lcu->rec.v, &pic->rec->v[(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2)],
+                        x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2);
    coefficients_blit(lcu->coeff.u, &pic->coeff_u[chroma_index],
                        x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2);
    coefficients_blit(lcu->coeff.v, &pic->coeff_v[chroma_index],
--- a/src/threadqueue.c
+++ b/src/threadqueue.c
@ -1,6 +1,7 @@

 #include <assert.h>
 #include <pthread.h>
+#include <errno.h> //ETIMEDOUT
 #include <stdlib.h>

 #ifdef _DEBUG
@ -41,7 +42,6 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {

  for(;;) {
    int i = 0;
-    int signal_count = 0;
    threadqueue_job * job_to_run = NULL;
    
    PTHREAD_LOCK(&threadqueue->lock);
@ -87,6 +87,7 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
    
    //Ok we got a job (and we have a lock on it)
    if (job_to_run) {
+      int queue_waiting_dependency_decr, queue_waiting_execution_incr;
      threadqueue_job * const job = job_to_run;

      assert(job->state == THREADQUEUE_JOB_STATE_QUEUED || (job == next_job && job->state == THREADQUEUE_JOB_STATE_RUNNING));
@ -120,9 +121,10 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
      
      job->state = THREADQUEUE_JOB_STATE_DONE;
      
-      signal_count = 0;
      next_job = NULL;
      
+      queue_waiting_dependency_decr = 0;
+      queue_waiting_execution_incr = 0;
      //Decrease counter of dependencies
      for (i = 0; i < job->rdepends_count; ++i) {
        threadqueue_job * const depjob = job->rdepends[i];
@ -138,11 +140,9 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
            next_job = depjob;
            depjob->state = THREADQUEUE_JOB_STATE_RUNNING;
          } else {
-            ++signal_count;
-            ++threadqueue->queue_waiting_execution;
+            ++queue_waiting_execution_incr;
          }
-          assert(threadqueue->queue_waiting_dependency > 0);
-          --threadqueue->queue_waiting_dependency;
+          ++queue_waiting_dependency_decr;
        }
        
        PTHREAD_UNLOCK(&depjob->lock);
@ -152,9 +152,11 @@ static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) {
      
      //Signal the queue that we've done a job
      PTHREAD_LOCK(&threadqueue->lock);
-      for (i = 0; i < signal_count; ++i) {
-        PTHREAD_COND_SIGNAL(&threadqueue->cond);
-      }
+      assert(threadqueue->queue_waiting_dependency >= queue_waiting_dependency_decr);
+      threadqueue->queue_waiting_dependency -= queue_waiting_dependency_decr;
+      threadqueue->queue_waiting_execution += queue_waiting_execution_incr;
+      PTHREAD_COND_BROADCAST(&threadqueue->cond);
+
      //PTHREAD_COND_BROADCAST(&threadqueue->cond);
      //Don't log this one
      //PTHREAD_COND_SIGNAL(&threadqueue->cb_cond);
@ -364,6 +366,9 @@ int threadqueue_finalize(threadqueue_queue * const threadqueue) {
 int threadqueue_flush(threadqueue_queue * const threadqueue) {
  int notdone = 1;
  int i;
+  struct timespec time_to_wait;
+  time_to_wait.tv_sec = 0;
+  time_to_wait.tv_nsec = 100000;
  
  //Lock the queue
  PTHREAD_LOCK(&threadqueue->lock);
@ -383,9 +388,23 @@ int threadqueue_flush(threadqueue_queue * const threadqueue) {
    }

    if (notdone > 0) {
+      int ret;
      PTHREAD_COND_BROADCAST(&(threadqueue->cond));
      SLEEP();
-      PTHREAD_COND_WAIT(&threadqueue->cb_cond, &threadqueue->lock);
+      // pthread_cond_timedwait() takes an absolute deadline, not a relative timeout:
+      // wake up at most 100 us from now to re-check the jobs (NOTE(review): assumes clock_gettime is available on all targets).
+      clock_gettime(CLOCK_REALTIME, &time_to_wait);
+      time_to_wait.tv_nsec += 100000;
+      if (time_to_wait.tv_nsec >= 1000000000L) {
+        time_to_wait.tv_sec += 1;
+        time_to_wait.tv_nsec -= 1000000000L;
+      }
+      ret = pthread_cond_timedwait(&threadqueue->cb_cond, &threadqueue->lock, &time_to_wait);
+      if (ret != 0 && ret != ETIMEDOUT) {
+        fprintf(stderr, "pthread_cond_timedwait failed!\n");
+        assert(0);
+        return 0;
+      }
    }
  } while (notdone > 0);
  
--- a/src/videoframe.c
+++ b/src/videoframe.c
@ -49,8 +49,8 @@ videoframe *videoframe_alloc(const int32_t width, const int32_t height, const in
  if (frame->height_in_lcu * LCU_WIDTH < frame->height) frame->height_in_lcu++;
  
  //Allocate images
-  frame->source = image_alloc(frame->width, frame->height, poc);
-  frame->rec = image_alloc(frame->width, frame->height, poc);
+  //frame->source = image_alloc(frame->width, frame->height, poc);
+  //frame->rec = image_alloc(frame->width, frame->height, poc);

  {
    // Allocate height_in_scu x width_in_scu x sizeof(CU_info)
@ -76,8 +76,8 @@ videoframe *videoframe_alloc(const int32_t width, const int32_t height, const in
 */
 int videoframe_free(videoframe * const frame)
 {
-  image_free(frame->source);
-  image_free(frame->rec);
+  //image_free(frame->source);
+  //image_free(frame->rec);

  FREE_POINTER(frame->cu_array);

@ -94,8 +94,8 @@ int videoframe_free(videoframe * const frame)
 }

 void videoframe_set_poc(videoframe * const frame, const int32_t poc) {
-  frame->source->poc = poc;
-  frame->rec->poc = poc;
+  if (frame->source) frame->source->poc = poc;
+  if (frame->rec) frame->rec->poc = poc;
  frame->poc = poc;
 }