Merge branch 'bipred'

Conflicts: README.md src/config.c src/config.h src/encmain.c
2024-11-27 11:24:05 +00:00 · 2015-04-23 14:45:44 +03:00 · 2015-04-23 14:45:44 +03:00 · fd060cf2c6
parent 13924a2057 79dc7e7270
commit fd060cf2c6
15 changed files with 1084 additions and 266 deletions
--- a/README.md
+++ b/README.md
@ -58,6 +58,8 @@ http://github.com/ultravideo/kvazaar/wiki/List-of-suggested-topics for a list of
              --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.
                                         0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4
              --no-info              : Don't add information about the encoder to settings.
+              --gop <int>            : Length of Group of Pictures, must be 8 or 0 [0]
+              --bipred               : Enable bi-prediction search

      Video Usability Information:
              --sar <width:height>   : Specify Sample Aspect Ratio
--- a/src/cabac.h
+++ b/src/cabac.h
@ -54,6 +54,7 @@ typedef struct
    cabac_ctx_t split_flag_model[3]; //!< \brief split flag context models
    cabac_ctx_t intra_mode_model;    //!< \brief intra mode context models
    cabac_ctx_t chroma_pred_model[2];
+    cabac_ctx_t inter_dir[5];
    cabac_ctx_t trans_subdiv_model[3]; //!< \brief intra mode context models
    cabac_ctx_t qt_cbf_model_luma[4];
    cabac_ctx_t qt_cbf_model_chroma[4];
--- a/src/config.c
+++ b/src/config.c
@ -88,6 +88,8 @@ int config_init(config_t *cfg)
  cfg->cqmfile         = NULL;
  cfg->ref_frames      = DEFAULT_REF_PIC_COUNT;
  cfg->seek            = 0;
+  cfg->gop_len         = 0;
+  cfg->bipred          = 0;

  cfg->tiles_width_count         = 0;
  cfg->tiles_height_count         = 0;
@ -346,8 +348,13 @@ static int config_parse(config_t *cfg, const char *name, const char *value)
    cfg->frames = atoi(value);
  else if OPT("qp")
    cfg->qp = atoi(value);
-  else if OPT("period")
+  else if OPT("period") {
    cfg->intra_period = atoi(value);
+    // The intra period must be a multiple of the GOP length; gop_len is 0 unless a GOP has already been configured.
+    if (cfg->gop_len && cfg->intra_period && cfg->intra_period%cfg->gop_len != 0) {
+      fprintf(stderr, "Input error: Intra period (%d) not equal to goplen (%d)\n", cfg->intra_period, cfg->gop_len);
+      return 0;
+    }
+  }
  else if OPT("vps-period")
    cfg->vps_period = atoi(value);
  else if OPT("ref") {
@ -509,6 +516,56 @@ static int config_parse(config_t *cfg, const char *name, const char *value)
  }
  else if OPT("info")
    cfg->add_encoder_info = atobool(value);
+  else if OPT("gop") {
+    // ToDo: Defining the whole GOp structure via parameters
+
+    // Check for intra period, must be equal to goplen
+    if (atoi(value) && cfg->intra_period && cfg->intra_period%atoi(value) != 0) {
+      fprintf(stderr, "Input error: Intra period (%d) not equal to goplen (%d)\n", cfg->intra_period, atoi(value));
+      return 0;
+    }
+
+    if(atoi(value) == 8) {
+      // GOP
+      cfg->gop_len = 8;
+      cfg->gop[0].poc_offset = 8; cfg->gop[0].qp_offset = 1; cfg->gop[0].layer = 1; cfg->gop[0].qp_factor = 0.442;  cfg->gop[0].is_ref = 1;
+      cfg->gop[0].ref_pos_count = 0;
+      cfg->gop[0].ref_neg_count = 3; cfg->gop[0].ref_neg[0] = 8; cfg->gop[0].ref_neg[1] = 12; cfg->gop[0].ref_neg[2] = 16;
+
+      cfg->gop[1].poc_offset = 4; cfg->gop[1].qp_offset = 2; cfg->gop[1].layer = 2; cfg->gop[1].qp_factor = 0.3536; cfg->gop[1].is_ref = 1;
+      cfg->gop[1].ref_neg_count = 2; cfg->gop[1].ref_neg[0] = 4; cfg->gop[1].ref_neg[1] = 8;
+      cfg->gop[1].ref_pos_count = 1; cfg->gop[1].ref_pos[0] = 4;
+
+      cfg->gop[2].poc_offset = 2; cfg->gop[2].qp_offset = 3; cfg->gop[2].layer = 3; cfg->gop[2].qp_factor = 0.3536; cfg->gop[2].is_ref = 1;
+      cfg->gop[2].ref_neg_count = 2; cfg->gop[2].ref_neg[0] = 2; cfg->gop[2].ref_neg[1] = 6;
+      cfg->gop[2].ref_pos_count = 2; cfg->gop[2].ref_pos[0] = 2; cfg->gop[2].ref_pos[1] = 6;
+
+      cfg->gop[3].poc_offset = 1; cfg->gop[3].qp_offset = 4; cfg->gop[3].layer = 4; cfg->gop[3].qp_factor = 0.68;   cfg->gop[3].is_ref = 0;
+      cfg->gop[3].ref_neg_count = 1; cfg->gop[3].ref_neg[0] = 1;
+      cfg->gop[3].ref_pos_count = 3; cfg->gop[3].ref_pos[0] = 1; cfg->gop[3].ref_pos[1] = 3; cfg->gop[3].ref_pos[2] = 7;
+
+      cfg->gop[4].poc_offset = 3; cfg->gop[4].qp_offset = 4; cfg->gop[4].layer = 4; cfg->gop[4].qp_factor = 0.68;   cfg->gop[4].is_ref = 0;
+      cfg->gop[4].ref_neg_count = 2; cfg->gop[4].ref_neg[0] = 1; cfg->gop[4].ref_neg[1] = 3;
+      cfg->gop[4].ref_pos_count = 2; cfg->gop[4].ref_pos[0] = 1; cfg->gop[4].ref_pos[1] = 5;
+
+      cfg->gop[5].poc_offset = 6; cfg->gop[5].qp_offset = 3; cfg->gop[5].layer = 3; cfg->gop[5].qp_factor = 0.3536; cfg->gop[5].is_ref = 1;
+      cfg->gop[5].ref_neg_count = 2; cfg->gop[5].ref_neg[0] = 2; cfg->gop[5].ref_neg[1] = 6;
+      cfg->gop[5].ref_pos_count = 1; cfg->gop[5].ref_pos[0] = 2;
+
+      cfg->gop[6].poc_offset = 5; cfg->gop[6].qp_offset = 4; cfg->gop[6].layer = 4; cfg->gop[6].qp_factor = 0.68;   cfg->gop[6].is_ref = 0;
+      cfg->gop[6].ref_neg_count = 2;  cfg->gop[6].ref_neg[0] = 1; cfg->gop[6].ref_neg[1] = 5;
+      cfg->gop[6].ref_pos_count = 2; cfg->gop[6].ref_pos[0] = 1; cfg->gop[6].ref_pos[1] = 3;
+
+      cfg->gop[7].poc_offset = 7; cfg->gop[7].qp_offset = 4; cfg->gop[7].layer = 4; cfg->gop[7].qp_factor = 0.68;   cfg->gop[7].is_ref = 0;
+      cfg->gop[7].ref_neg_count = 3; cfg->gop[7].ref_neg[0] = 1; cfg->gop[7].ref_neg[1] = 3; cfg->gop[7].ref_neg[2] = 7;
+      cfg->gop[7].ref_pos_count = 1; cfg->gop[7].ref_pos[0] = 1;
+    } else if(atoi(value)) {
+      fprintf(stderr, "Input error: goplen must be 8\n");
+      return 0;
+    }
+  }
+  else if OPT("bipred")
+    cfg->bipred = atobool(value);
  else
    return 0;
 #undef OPT
@ -571,6 +628,8 @@ int config_read(config_t *cfg,int argc, char *argv[])
    { "pu-depth-inter",     required_argument, NULL, 0 },
    { "pu-depth-intra",     required_argument, NULL, 0 },
    { "no-info",                  no_argument, NULL, 0 },
+    { "gop",                required_argument, NULL, 0 },
+    { "bipred",                   no_argument, NULL, 0 },
    {0, 0, 0, 0}
  };

--- a/src/config.h
+++ b/src/config.h
@ -28,6 +28,18 @@
 #include "global.h"


+/*!
+    \brief Settings for a single picture position inside a GOP
+*/
+typedef struct {
+  double qp_factor;    /*!< \brief QP factor used for the lambda calculation of this picture */
+  int8_t qp_offset;    /*!< \brief QP offset added to the configured base QP */
+  int8_t poc_offset;   /*!< \brief POC offset of this picture from the start of its GOP */
+  int8_t layer;        /*!< \brief Current layer */
+  int8_t is_ref;       /*!< \brief Flag if this picture is used as a reference */
+  int8_t ref_pos_count;/*!< \brief Number of entries used in ref_pos */
+  int8_t ref_pos[16];  /*!< \brief POC distances to references after this picture (current POC + value) */
+  int8_t ref_neg_count;/*!< \brief Number of entries used in ref_neg */
+  int8_t ref_neg[16];  /*!< \brief POC distances to references before this picture (current POC - value) */
+} gop_config_t;
+
 /*!
    \brief Struct which contains all configuration data
 */
@ -52,6 +64,7 @@ typedef struct
  int32_t tr_depth_intra; /*!< \brief Maximum transform depth for intra. */
  int8_t  ime_algorithm;  /*!< \brief Integer motion estimation algorithm. */
  int32_t fme_level;      /*!< \brief Fractional pixel motion estimation level (0: disabled, 1: enabled). */
+  int32_t bipred;         /*!< \brief Bi-prediction (0: disabled, 1: enabled). */
  int32_t deblock_beta;   /*!< \brief (deblocking) beta offset (div 2), range -6...6 */
  int32_t deblock_tc;     /*!< \brief (deblocking) tc offset (div 2), range -6...6 */
  struct
@ -91,6 +104,9 @@ typedef struct
  } pu_depth_inter, pu_depth_intra;

  bool add_encoder_info;
+  int8_t gop_len;            /*!< \brief length of GOP for the video sequence */
+  gop_config_t gop[MAX_GOP];  /*!< \brief Array of GOP settings */
+
 } config_t;

 /* Function definitions */
--- a/src/context.c
+++ b/src/context.c
@ -108,6 +108,11 @@ const uint8_t INIT_CHROMA_PRED_MODE[3][2] = {
  {  63,  139 },
 };

+// Initial values for the five inter_dir CABAC contexts, indexed as [slice][context].
+const uint8_t INIT_INTER_DIR[3][5] = {
+  {  95,  79,  63,  31,  31, },
+  {  95,  79,  63,  31,  31, },
+  { CNU, CNU, CNU, CNU, CNU, },
+};

 const uint8_t INIT_TRANS_SUBDIV_FLAG[3][3] = {
  { 224,  167,  122 },
@ -255,6 +260,10 @@ void init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
    ctx_init(&cabac->ctx.qt_cbf_model_chroma[i], QP, INIT_QT_CBF[slice][i + 4]);
  }

+  for (i = 0; i < 5; i++) {
+    ctx_init(&cabac->ctx.inter_dir[i], QP, INIT_INTER_DIR[slice][i]);
+  }
+
  for (i = 0; i < 8; i++) {
    ctx_init(&cabac->ctx.cu_one_model_chroma[i], QP, INIT_ONE_FLAG[slice][i+16]);
  }
--- a/src/cu.h
+++ b/src/cu.h
@ -72,11 +72,12 @@ typedef struct
  struct {
    double cost;
    uint32_t bitcost;
-    int16_t mv[2];
-    int16_t mvd[2];
-    uint8_t mv_cand; // \brief selected MV candidate
-    uint8_t mv_ref; // \brief Index of the encoder_control.ref array.
-    uint8_t mv_dir; // \brief Probably describes if mv_ref is forward, backward or both. Might not be needed?
+    int16_t mv[2][2];  // \brief Motion vectors for L0 and L1
+    int16_t mvd[2][2]; // \brief Motion vector differences for L0 and L1
+    uint8_t mv_cand[2]; // \brief selected MV candidate
+    uint8_t mv_ref[2]; // \brief Index of the encoder_control.ref array.
+    uint8_t mv_ref_coded[2]; // \brief Coded and corrected index of ref picture
+    uint8_t mv_dir; // \brief Probably describes if mv_ref is L0, L1 or both (bi-pred)
    int8_t mode;
  } inter;
 } cu_info_t;
--- a/src/encmain.c
+++ b/src/encmain.c
@ -137,6 +137,8 @@ int main(int argc, char *argv[])
            "          --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.\n"
            "                                     0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4\n"
            "          --no-info              : Don't add information about the encoder to settings.\n"
+            "          --gop <int>           : Length of Group of Pictures, must be 8 or 0 [0]\n"
+            "          --bipred               : Enable bi-prediction search\n"
            "\n"
            "  Video Usability Information:\n"
            "          --sar <width:height>   : Specify Sample Aspect Ratio\n"
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -21,6 +21,7 @@
 #include "encoder_state-bitstream.h"

 #include <string.h>
+#include <stdlib.h>

 #include "checkpoint.h"
 #include "encoderstate.h"
@ -328,12 +329,12 @@ static void encoder_state_write_bitstream_seq_parameter_set(encoder_state_t * co

  WRITE_UE(stream, encoder->in.bitdepth-8, "bit_depth_luma_minus8");
  WRITE_UE(stream, encoder->in.bitdepth-8, "bit_depth_chroma_minus8");
-  WRITE_UE(stream, 0, "log2_max_pic_order_cnt_lsb_minus4");
+  WRITE_UE(stream, 1, "log2_max_pic_order_cnt_lsb_minus4");
  WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");

  //for each layer
-  WRITE_UE(stream, state->encoder_control->cfg->ref_frames, "sps_max_dec_pic_buffering");
-  WRITE_UE(stream, 0, "sps_num_reorder_pics");
+  WRITE_UE(stream, state->encoder_control->cfg->ref_frames + encoder->cfg->gop_len, "sps_max_dec_pic_buffering");
+  WRITE_UE(stream, encoder->cfg->gop_len, "sps_num_reorder_pics");
  WRITE_UE(stream, 0, "sps_max_latency_increase");
  //end for

@ -556,6 +557,18 @@ void encoder_state_write_bitstream_slice_header(encoder_state_t * const state)
 {
  const encoder_control_t * const encoder = state->encoder_control;
  bitstream_t * const stream = &state->stream;
+  int j;
+  int ref_negative = 0;
+  int ref_positive = 0;
+  if (state->encoder_control->cfg->gop_len) {
+    for (j = 0; j < state->global->ref->used_size; j++) {
+      if (state->global->ref->images[j]->poc < state->global->poc) {
+        ref_negative++;
+      } else {
+        ref_positive++;
+      }
+    }
+  } else ref_negative = state->global->ref->used_size;

 #ifdef _DEBUG
  printf("=========== Slice ===========\n");
@ -584,20 +597,65 @@ void encoder_state_write_bitstream_slice_header(encoder_state_t * const state)
    //if( IdrPicFlag ) <- nal_unit_type == 5
  if (state->global->pictype != NAL_IDR_W_RADL
      && state->global->pictype != NAL_IDR_N_LP) {
-      int j;
-      int ref_negative = state->global->ref->used_size;
-      int ref_positive = 0;
-      WRITE_U(stream, state->global->poc&0xf, 4, "pic_order_cnt_lsb");
+    int last_poc = 0;
+    int poc_shift = 0;
+
+      WRITE_U(stream, state->global->poc&0x1f, 5, "pic_order_cnt_lsb");
      WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag");
      WRITE_UE(stream, ref_negative, "num_negative_pics");
      WRITE_UE(stream, ref_positive, "num_positive_pics");
+    for (j = 0; j < ref_negative; j++) {      
+      int8_t delta_poc = 0;
+      
+      if (state->encoder_control->cfg->gop_len) {
+        int8_t found = 0;
+        do {
+          delta_poc = state->encoder_control->cfg->gop[state->global->gop_offset].ref_neg[j + poc_shift];
+          for (int i = 0; i < state->global->ref->used_size; i++) {
+            if (state->global->ref->images[i]->poc == state->global->poc - delta_poc) {
+              found = 1;
+              break;
+            }
+          }
+          if (!found) poc_shift++;
+          if (j + poc_shift == ref_negative) {
+            fprintf(stderr, "Failure, reference not found!");
+            exit(EXIT_FAILURE);
+          }
+        } while (!found);
+      }

-    for (j = 0; j < ref_negative; j++) {
-      int32_t delta_poc_minus1 = 0;
-      WRITE_UE(stream, delta_poc_minus1, "delta_poc_s0_minus1");
+      WRITE_UE(stream, state->encoder_control->cfg->gop_len?delta_poc - last_poc - 1:0, "delta_poc_s0_minus1");
+      last_poc = delta_poc;
      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
    }
-
+    last_poc = 0;
+    poc_shift = 0;
+    for (j = 0; j < ref_positive; j++) {      
+      int8_t delta_poc = 0;
+      
+      if (state->encoder_control->cfg->gop_len) {
+        int8_t found = 0;
+        do {
+          delta_poc = state->encoder_control->cfg->gop[state->global->gop_offset].ref_pos[j + poc_shift];
+          for (int i = 0; i < state->global->ref->used_size; i++) {
+            if (state->global->ref->images[i]->poc == state->global->poc + delta_poc) {
+              found = 1;
+              break;
+            }
+          }
+          if (!found) poc_shift++;
+          if (j + poc_shift == ref_positive) {
+            fprintf(stderr, "Failure, reference not found!");
+            exit(EXIT_FAILURE);
+          }
+        } while (!found);
+      }
+      
+      WRITE_UE(stream, state->encoder_control->cfg->gop_len ? delta_poc - last_poc - 1 : 0, "delta_poc_s1_minus1");
+      last_poc = delta_poc;
+      WRITE_U(stream, 1, 1, "used_by_curr_pic_s1_flag");
+    }
    //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
  }

@ -610,14 +668,14 @@ void encoder_state_write_bitstream_slice_header(encoder_state_t * const state)

  if (state->global->slicetype != SLICE_I) {
      WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag");
-        WRITE_UE(stream, state->global->ref->used_size-1, "num_ref_idx_l0_active_minus1");
+      WRITE_UE(stream, ref_negative != 0 ? ref_negative - 1: 0, "num_ref_idx_l0_active_minus1");
+        if (state->global->slicetype == SLICE_B) {
+          WRITE_UE(stream, ref_positive != 0 ? ref_positive - 1 : 0, "num_ref_idx_l1_active_minus1");
+          WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
+        }
      WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand");
  }

-  if (state->global->slicetype == SLICE_B) {
-      WRITE_U(stream, 0, 1, "mvd_l1_zero_flag");
-  }
-
  {
    int slice_qp_delta = state->global->QP - state->encoder_control->cfg->qp;
    WRITE_SE(stream, slice_qp_delta, "slice_qp_delta");
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -52,17 +52,16 @@
  \brief Initializes lambda-value for current QP

  Implementation closer to HM (Used HM12 as reference)
-   - Still missing functionality when GOP and B-pictures are used
 */
 void encoder_state_init_lambda(encoder_state_t * const state)
 {
  double qp = state->global->QP;
-  double lambda_scale = 1.0;
+  double lambda_scale = 1.0 - CLIP(0.0, 0.5, 0.05*(double)state->encoder_control->cfg->gop_len);
  double qp_temp      = qp - 12;
  double lambda;

  // Default QP-factor from HM config
-  double qp_factor = 0.4624;
+  double qp_factor = state->encoder_control->cfg->gop_len ? state->global->QP_factor : 0.4624;

  if (state->global->slicetype == SLICE_I) {
    qp_factor=0.57*lambda_scale;
@ -660,11 +659,103 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
  }
 }

+
+/**
+ * \brief Sort POC values into ascending order in place (insertion sort).
+ * \param reflist  array of POC values; only the first `length` entries are read and written
+ * \param length   number of valid entries in reflist; nothing is done when it is 1 or less
+ */
+static void encoder_ref_insertion_sort(int reflist[16], int length) {
+
+  for (int i = 1; i < length; ++i) {
+    // Keep the full int width: narrowing to int16_t corrupted POC values above 32767.
+    const int cur_poc = reflist[i];
+    int j = i;
+    // Shift larger entries one slot up until the insertion point for cur_poc is found.
+    while (j > 0 && cur_poc < reflist[j - 1]) {
+      reflist[j] = reflist[j - 1];
+      --j;
+    }
+    reflist[j] = cur_poc;
+  }
+}
+/**
+ * \brief Fill state->global->refmap from the current reference picture list.
+ *
+ * References whose POC is below the current POC get list = 1 and are indexed
+ * starting from the highest such POC (idx 0) downwards. All other references
+ * get list = 2 and are indexed in ascending POC order. The POC of every
+ * reference is stored in refmap[j].poc.
+ *
+ * \param state  encoder state; global->ref and global->poc are read,
+ *               global->refmap is written
+ */
+static void encoder_state_ref_sort(encoder_state_t *state) {
+  int j, ref_list[2] = { 0, 0 }, ref_list_poc[2][16];
+
+  // List all pocs of lists
+  // NOTE(review): each side is stored in a 16-entry array; confirm the reference list can never hold more.
+  for (j = 0; j < state->global->ref->used_size; j++) {
+    if (state->global->ref->images[j]->poc < state->global->poc) {
+      ref_list_poc[0][ref_list[0]] = state->global->ref->images[j]->poc;
+      ref_list[0]++;
+    } else {
+      ref_list_poc[1][ref_list[1]] = state->global->ref->images[j]->poc;
+      ref_list[1]++;
+    }
+  }
+
+  encoder_ref_insertion_sort(ref_list_poc[0], ref_list[0]);
+  encoder_ref_insertion_sort(ref_list_poc[1], ref_list[1]);
+
+  for (j = 0; j < state->global->ref->used_size; j++) {
+    if (state->global->ref->images[j]->poc < state->global->poc) {
+      // Sorted ascending, so the index is mirrored to give the highest POC idx 0.
+      for (int ref_idx = 0; ref_idx < ref_list[0]; ref_idx++) {
+        if (ref_list_poc[0][ref_idx] == state->global->ref->images[j]->poc) {
+          state->global->refmap[j].idx = ref_list[0] - ref_idx - 1;
+          break;
+        }
+      }
+      state->global->refmap[j].list = 1;
+
+    } else {
+      // Ascending order is used directly, so the lowest POC gets idx 0.
+      for (int ref_idx = 0; ref_idx < ref_list[1]; ref_idx++) {
+        if (ref_list_poc[1][ref_idx] == state->global->ref->images[j]->poc) {
+          state->global->refmap[j].idx = ref_idx;
+          break;
+        }
+      }
+      state->global->refmap[j].list = 2;
+    }
+    state->global->refmap[j].poc = state->global->ref->images[j]->poc;
+  }
+}
+
+/**
+ * \brief Drop reference pictures that the current picture must not keep.
+ *
+ * With a GOP configured, every reference whose POC does not match one of the
+ * ref_neg / ref_pos offsets of the current GOP entry is removed. Without a
+ * GOP, entries are removed from the end of the list until at most
+ * cfg->ref_frames remain (or 1 when the slice type is SLICE_I).
+ *
+ * \param state  encoder state whose global->ref list is pruned
+ */
+static void encoder_state_remove_refs(encoder_state_t *state) {
+  const encoder_control_t * const encoder = state->encoder_control;
+  int8_t refnumber = encoder->cfg->ref_frames;
+  int8_t check_refs = 0;
+  if (encoder->cfg->gop_len) {
+    // Allowed reference count is the sum of both offset lists of the current GOP entry.
+    refnumber = encoder->cfg->gop[state->global->gop_offset].ref_neg_count + encoder->cfg->gop[state->global->gop_offset].ref_pos_count;
+    check_refs = 1;
+  } else if (state->global->slicetype == SLICE_I) {
+    // NOTE(review): the visible caller assigns slicetype after calling this function, so this may read an older value -- confirm.
+    refnumber = 1;
+  }
+  // check_refs forces one filtering pass in GOP mode even if the list is not over the limit
+  while (check_refs || state->global->ref->used_size > (uint32_t)refnumber) {
+    int8_t ref_to_remove = state->global->ref->used_size - 1;
+    if (encoder->cfg->gop_len) {
+      for (int ref = 0; ref < state->global->ref->used_size; ref++) {
+        uint8_t found = 0;
+        // Is this reference one of the configured earlier pictures (current POC - offset)?
+        for (int i = 0; i < encoder->cfg->gop[state->global->gop_offset].ref_neg_count; i++) {
+          if (state->global->ref->images[ref]->poc == state->global->poc - encoder->cfg->gop[state->global->gop_offset].ref_neg[i]) {
+            found = 1;
+            break;
+          }
+        }
+        if (found) continue;
+        // Otherwise, is it one of the configured later pictures (current POC + offset)?
+        for (int i = 0; i < encoder->cfg->gop[state->global->gop_offset].ref_pos_count; i++) {
+          if (state->global->ref->images[ref]->poc == state->global->poc + encoder->cfg->gop[state->global->gop_offset].ref_pos[i]) {
+            found = 1;
+            break;
+          }
+        }
+        if (!found) {
+          image_list_rem(state->global->ref, ref);
+          // Re-examine the same index next iteration; presumably image_list_rem shifts later entries down -- TODO confirm.
+          ref--;
+        }
+      }
+      check_refs = 0;
+    } else image_list_rem(state->global->ref, ref_to_remove);
+  }
+
+}
+
 static void encoder_state_clear_refs(encoder_state_t *state) {
  int i;
-  while (state->global->ref->used_size) {
-    image_list_rem(state->global->ref, state->global->ref->used_size - 1);
-  }

  state->global->poc = 0;
  videoframe_set_poc(state->tile->frame, 0);
@ -685,7 +776,15 @@ static void encoder_state_new_frame(encoder_state_t * const state) {
    const int is_i_radl = (encoder->cfg->intra_period == 1 && state->global->frame % 2 == 0);
    const int is_p_radl = (encoder->cfg->intra_period > 1 && (state->global->frame % encoder->cfg->intra_period) == 0);
    state->global->is_radl_frame = is_first_frame || is_i_radl || is_p_radl;
-    
+
+    if (state->global->frame && encoder->cfg->gop_len) {
+      // Calculate POC according to the global frame counter and GOP structure
+      state->global->poc = (state->global->frame - 1) - (state->global->frame - 1) % encoder->cfg->gop_len +
+        encoder->cfg->gop[state->global->gop_offset].poc_offset;
+      videoframe_set_poc(state->tile->frame, state->global->poc);
+      state->global->is_radl_frame = 0;
+    }
+   
    if (state->global->is_radl_frame) {
      // Clear the reference list
      encoder_state_clear_refs(state);
@ -693,9 +792,29 @@ static void encoder_state_new_frame(encoder_state_t * const state) {
      state->global->slicetype = SLICE_I;
      state->global->pictype = NAL_IDR_W_RADL;
    } else {
-      state->global->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : SLICE_P;
+      encoder_state_remove_refs(state);
+      encoder_state_ref_sort(state);
+      state->global->slicetype = encoder->cfg->intra_period==1 ? SLICE_I : (state->encoder_control->cfg->gop_len?SLICE_B:SLICE_P);
      state->global->pictype = NAL_TRAIL_R;
+      if (state->encoder_control->cfg->gop_len) {
+        if (encoder->cfg->intra_period > 1 && (state->global->poc % encoder->cfg->intra_period) == 0) {
+          state->global->slicetype = SLICE_I;
+        }
+      }
    }
+    if (state->encoder_control->cfg->gop_len) {
+      if (state->global->slicetype == SLICE_I) {
+        state->global->QP = state->encoder_control->cfg->qp;
+        state->global->QP_factor = 0.4624;
+      }
+      else {
+        state->global->QP = state->encoder_control->cfg->qp +
+          state->encoder_control->cfg->gop[state->global->gop_offset].qp_offset;
+        state->global->QP_factor = state->encoder_control->cfg->gop[state->global->gop_offset].qp_factor;
+      }
+        
+    }
+
  } else {
    //Clear the bitstream if it's not the main encoder
    bitstream_clear(&state->stream);
@ -817,35 +936,114 @@ int read_one_frame(FILE* file, const encoder_state_t * const state)
  unsigned array_width = state->tile->frame->width;
  unsigned array_height = state->tile->frame->height;

-  if (width != array_width) {
-    // In the case of frames not being aligned on 8 bit borders, bits need to be copied to fill them in.
-    if (!read_and_fill_frame_data(file, width, height, array_width,
-                                  state->tile->frame->source->y) ||
-        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
-                                  state->tile->frame->source->u) ||
-        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
-                                  state->tile->frame->source->v))
-      return 0;
-  } else {
-    // Otherwise the data can be read directly to the array.
-    unsigned y_size = width * height;
-    unsigned uv_size = (width >> 1) * (height >> 1);
-    if (y_size  != fread(state->tile->frame->source->y, sizeof(unsigned char),
-                         y_size, file) ||
-        uv_size != fread(state->tile->frame->source->u, sizeof(unsigned char),
-                         uv_size, file) ||
-        uv_size != fread(state->tile->frame->source->v, sizeof(unsigned char),
-                         uv_size, file))
-      return 0;
+  // storing GOP pictures
+  static int8_t gop_init = 0;
+  static int8_t gop_pictures_available = 0;
+  static videoframe_t gop_pictures[MAX_GOP];
+  static int8_t gop_skip_frames = 0;
+  static int8_t gop_skipped = 0;
+
+  // Initialize GOP structure when gop is enabled and not initialized
+  if (state->encoder_control->cfg->gop_len && !gop_init) {
+    int i;
+    for (i = 0; i < state->encoder_control->cfg->gop_len; i++) {
+      gop_pictures[i].source = image_alloc(array_width, array_height, 0);
+    }
+    state->global->gop_offset = 0;
+    gop_init = 1;
  }

-  if (height != array_height) {
-    fill_after_frame(height, array_width, array_height,
-                     state->tile->frame->source->y);
-    fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
-                     state->tile->frame->source->u);
-    fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
-                     state->tile->frame->source->v);
+  // If GOP is present but no pictures found
+  if (state->global->frame && 
+      state->encoder_control->cfg->gop_len &&
+      !gop_pictures_available) {
+    int i;
+    unsigned y_size = width * height;
+    unsigned uv_size = (width >> 1) * (height >> 1);
+
+    for (i = 0; i < state->encoder_control->cfg->gop_len; i++, gop_pictures_available++) {
+      if (state->encoder_control->cfg->frames && state->global->frame + gop_pictures_available >= state->encoder_control->cfg->frames) {
+        if (gop_pictures_available) {
+          gop_skip_frames = state->encoder_control->cfg->gop_len - gop_pictures_available;
+          break;
+        }
+        else return 0;
+      }
+      if (width != array_width) {
+       // In the case of frames not being aligned on 8 bit borders, bits need to be copied to fill them in.
+        if(!read_and_fill_frame_data(file, width, height, array_width, gop_pictures[i].source->y) ||
+           !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1, gop_pictures[i].source->u) ||
+           !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1, gop_pictures[i].source->v)) {
+          if (gop_pictures_available) { gop_skip_frames = state->encoder_control->cfg->gop_len - gop_pictures_available; break; }
+          else return 0;
+        }
+      } else {
+        // Otherwise the data can be read directly to the array.
+        if(y_size != fread(gop_pictures[i].source->y, sizeof(unsigned char), y_size, file) ||
+          uv_size != fread(gop_pictures[i].source->u, sizeof(unsigned char), uv_size, file) ||
+          uv_size != fread(gop_pictures[i].source->v, sizeof(unsigned char), uv_size, file)) {
+          if (gop_pictures_available) { gop_skip_frames = state->encoder_control->cfg->gop_len - gop_pictures_available; break; }
+          else return 0;
+        }
+      }
+
+      if (height != array_height) {
+        fill_after_frame(height, array_width, array_height, gop_pictures[i].source->y);
+        fill_after_frame(height >> 1, array_width >> 1, array_height >> 1, gop_pictures[i].source->u);
+        fill_after_frame(height >> 1, array_width >> 1, array_height >> 1, gop_pictures[i].source->v);
+      }
+    }  
+  }
+
+  // If GOP is present, fetch the data from our GOP picture buffer
+  if (state->global->frame && state->encoder_control->cfg->gop_len) {
+    int cur_gop_idx = state->encoder_control->cfg->gop_len - (gop_pictures_available + gop_skip_frames) + gop_skipped;
+    int cur_gop = state->encoder_control->cfg->gop[cur_gop_idx].poc_offset - 1;
+    // Special case when end of the sequence and not all pictures are available
+    if (gop_skip_frames && cur_gop >= state->encoder_control->cfg->gop_len - gop_skip_frames) {
+      for (; cur_gop >= state->encoder_control->cfg->gop_len - gop_skip_frames; cur_gop_idx++) {
+        cur_gop = state->encoder_control->cfg->gop[cur_gop_idx].poc_offset - 1;
+        gop_skipped++;
+      }
+      cur_gop_idx--;
+      gop_skipped--;
+    }
+    state->global->gop_offset = cur_gop_idx;
+    memcpy(state->tile->frame->source->y, gop_pictures[cur_gop].source->y, width * height);
+    memcpy(state->tile->frame->source->u, gop_pictures[cur_gop].source->u, (width >> 1) * (height >> 1));
+    memcpy(state->tile->frame->source->v, gop_pictures[cur_gop].source->v, (width >> 1) * (height >> 1));
+    gop_pictures_available--;
+  } else {
+    if (width != array_width) {
+      // In the case of frames not being aligned on 8 bit borders, bits need to be copied to fill them in.
+      if (!read_and_fill_frame_data(file, width, height, array_width,
+        state->tile->frame->source->y) ||
+        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
+        state->tile->frame->source->u) ||
+        !read_and_fill_frame_data(file, width >> 1, height >> 1, array_width >> 1,
+        state->tile->frame->source->v))
+        return 0;
+    } else {
+      // Otherwise the data can be read directly to the array.
+      unsigned y_size = width * height;
+      unsigned uv_size = (width >> 1) * (height >> 1);
+      if (y_size != fread(state->tile->frame->source->y, sizeof(unsigned char),
+        y_size, file) ||
+        uv_size != fread(state->tile->frame->source->u, sizeof(unsigned char),
+        uv_size, file) ||
+        uv_size != fread(state->tile->frame->source->v, sizeof(unsigned char),
+        uv_size, file))
+        return 0;
+    }
+
+    if (height != array_height) {
+      fill_after_frame(height, array_width, array_height,
+        state->tile->frame->source->y);
+      fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
+        state->tile->frame->source->u);
+      fill_after_frame(height >> 1, array_width >> 1, array_height >> 1,
+        state->tile->frame->source->v);
+    }
  }
  return 1;
 }
@ -890,10 +1088,37 @@ void encoder_compute_stats(encoder_state_t *state, FILE * const recout, uint32_t
    
    videoframe_compute_psnr(state->tile->frame, temp_psnr);
    
-    fprintf(stderr, "POC %4d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f\n", state->global->frame,
+    fprintf(stderr, "POC %4d QP %2d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f", state->global->poc,
+          state->global->QP,
          "BPI"[state->global->slicetype%3], state->stats_bitstream_length<<3,
          temp_psnr[0], temp_psnr[1], temp_psnr[2]);
+    // Print reference picture lists
+    if (state->global->slicetype != SLICE_I) {
+      int j, ref_list[2] = { 0, 0 }, ref_list_poc[2][16];
+      // List all pocs of lists
+      for (j = 0; j < state->global->ref->used_size; j++) {
+        if (state->global->ref->images[j]->poc < state->global->poc) {
+          ref_list_poc[0][ref_list[0]] = state->global->ref->images[j]->poc;
+          ref_list[0]++;
+        } else {
+          ref_list_poc[1][ref_list[1]] = state->global->ref->images[j]->poc;
+          ref_list[1]++;
+        }
+      }
+      encoder_ref_insertion_sort(ref_list_poc[0], ref_list[0]);
+      encoder_ref_insertion_sort(ref_list_poc[1], ref_list[1]);

+      fprintf(stderr, " [L0 ");
+      for (j = ref_list[0]-1; j >= 0; j--) {
+        fprintf(stderr, "%d ", ref_list_poc[0][j]);
+      }
+      fprintf(stderr, "] [L1 ");
+      for (j = 0; j < ref_list[1]; j++) {
+        fprintf(stderr, "%d ", ref_list_poc[1][j]);
+      }
+      fprintf(stderr, "]");
+    }
+    fprintf(stderr, "\n");
    // Increment total PSNR
    psnr[0] += temp_psnr[0];
    psnr[1] += temp_psnr[1];
@ -903,10 +1128,8 @@ void encoder_compute_stats(encoder_state_t *state, FILE * const recout, uint32_t
  *bitstream_length += state->stats_bitstream_length;
 }

-
 void encoder_next_frame(encoder_state_t *state) {
  const encoder_control_t * const encoder = state->encoder_control;
-  
  //Blocking call
  threadqueue_waitfor(encoder->threadqueue, state->tqj_bitstream_written);
  
@ -925,7 +1148,7 @@ void encoder_next_frame(encoder_state_t *state) {
    //We have a "real" previous encoder
    state->global->frame = state->previous_encoder_state->global->frame + 1;
    state->global->poc = state->previous_encoder_state->global->poc + 1;
-    
+
    image_free(state->tile->frame->rec);
    cu_array_free(state->tile->frame->cu_array);
    
@ -937,29 +1160,27 @@ void encoder_next_frame(encoder_state_t *state) {
      state->tile->frame->cu_array = cu_array_alloc(width_in_scu, height_in_scu);
    }
    videoframe_set_poc(state->tile->frame, state->global->poc);
-    
    image_list_copy_contents(state->global->ref, state->previous_encoder_state->global->ref);
-    image_list_add(state->global->ref, state->previous_encoder_state->tile->frame->rec, state->previous_encoder_state->tile->frame->cu_array);
-    // Remove the ref pics in excess
-    while (state->global->ref->used_size > (uint32_t)encoder->cfg->ref_frames) {
-      image_list_rem(state->global->ref, state->global->ref->used_size-1);
+    if (!encoder->cfg->gop_len || !state->previous_encoder_state->global->poc || encoder->cfg->gop[state->previous_encoder_state->global->gop_offset].is_ref) {
+      image_list_add(state->global->ref, state->previous_encoder_state->tile->frame->rec, state->previous_encoder_state->tile->frame->cu_array);
    }
-    return; //FIXME reference frames
+
+    return;
  }

-  // Remove the ref pic (if present)
-  if (state->global->ref->used_size == (uint32_t)encoder->cfg->ref_frames) {
-    image_list_rem(state->global->ref, state->global->ref->used_size-1);
+
+  if (!encoder->cfg->gop_len || !state->global->poc || encoder->cfg->gop[state->global->gop_offset].is_ref) {
+    // Add current reconstructed picture as reference
+    image_list_add(state->global->ref, state->tile->frame->rec, state->tile->frame->cu_array);
  }
-  // Add current reconstructed picture as reference
-  image_list_add(state->global->ref, state->tile->frame->rec, state->tile->frame->cu_array);
-  
-  //Remove current reconstructed picture, and alloc a new one
-  image_free(state->tile->frame->rec);
-  
+
+
  state->global->frame++;
  state->global->poc++;
-  
+
+  //Remove current reconstructed picture, and alloc a new one
+  image_free(state->tile->frame->rec);
+
  state->tile->frame->rec = image_alloc(state->tile->frame->width, state->tile->frame->height, state->global->poc);
  videoframe_set_poc(state->tile->frame, state->global->poc);
 }
@ -1108,101 +1329,109 @@ void encode_coding_tree(encoder_state_t * const state,
      }
    } else {
      uint32_t ref_list_idx;
-      /*
-      // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
-      if(cur_pic->slicetype == SLICE_B)
-      {
-        // Code Inter Dir
-        const UInt uiInterDir = pcCU->getInterDir( uiAbsPartIdx ) - 1;
-        const UInt uiCtx      = pcCU->getCtxInterDir( uiAbsPartIdx );
-        ContextModel *pCtx    = m_cCUInterDirSCModel.get( 0 );
-        if (pcCU->getPartitionSize(uiAbsPartIdx) == SIZE_2Nx2N || pcCU->getHeight(uiAbsPartIdx) != 8 )
-        {
-          m_pcBinIf->encodeBin( uiInterDir == 2 ? 1 : 0, *( pCtx + uiCtx ) );
-        }
-        if (uiInterDir < 2)
-        {
-          m_pcBinIf->encodeBin( uiInterDir, *( pCtx + 4 ) );
+      uint32_t j;
+      int ref_list[2] = { 0, 0 };
+      for (j = 0; j < state->global->ref->used_size; j++) {
+        if (state->global->ref->images[j]->poc < state->global->poc) {
+          ref_list[0]++;
+        } else {
+          ref_list[1]++;
+        }
+      }
+
+      // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
+      if (state->global->slicetype == SLICE_B)
+      {
+        // Code Inter Dir
+        uint8_t inter_dir = cur_cu->inter.mv_dir-1;
+        uint8_t ctx = depth;
+        
+
+        if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8)
+        {
+          cabac->cur_ctx = &(cabac->ctx.inter_dir[ctx]);
+          CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc");
+        }
+        if (inter_dir < 2)
+        {
+          cabac->cur_ctx = &(cabac->ctx.inter_dir[4]);
+          CABAC_BIN(cabac, inter_dir, "inter_pred_idc");
        }
      }
-      */

      for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
-            //if(encoder_state->ref_idx_num[uiRefListIdx] > 0)
-            {
-          if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) {
-            if (state->global->ref->used_size != 1) { //encoder_state->ref_idx_num[uiRefListIdx] != 1)//NumRefIdx != 1)
-              // parseRefFrmIdx
-              int32_t ref_frame = cur_cu->inter.mv_ref;
+        if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) {
+          if (ref_list[ref_list_idx] > 1) {
+            // parseRefFrmIdx
+            int32_t ref_frame = cur_cu->inter.mv_ref_coded[ref_list_idx];

-              cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-              CABAC_BIN(cabac, (ref_frame != 0), "ref_frame_flag");
+            cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
+            CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");

-              if (ref_frame > 0) {
-                int32_t i;
-                int32_t ref_num = state->global->ref->used_size - 2;
+            if (ref_frame > 0) {
+              int32_t i;
+              int32_t ref_num = ref_list[ref_list_idx] - 2;

-                cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
-                ref_frame--;
+              cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
+              ref_frame--;

-                for (i = 0; i < ref_num; ++i) {
-                  const uint32_t symbol = (i == ref_frame) ? 0 : 1;
+              for (i = 0; i < ref_num; ++i) {
+                const uint32_t symbol = (i == ref_frame) ? 0 : 1;

-                  if (i == 0) {
-                    CABAC_BIN(cabac, symbol, "ref_frame_flag2");
-                  } else {
-                    CABAC_BIN_EP(cabac, symbol, "ref_frame_flag2");
-                  }
-                  if (symbol == 0) break;
+                if (i == 0) {
+                  CABAC_BIN(cabac, symbol, "ref_idx_lX");
+                } else {
+                  CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
                }
+                if (symbol == 0) break;
              }
            }
+          }

-            if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ state->global->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
-              const int32_t mvd_hor = cur_cu->inter.mvd[0];
-              const int32_t mvd_ver = cur_cu->inter.mvd[1];
-              const int8_t hor_abs_gr0 = mvd_hor != 0;
-              const int8_t ver_abs_gr0 = mvd_ver != 0;
-              const uint32_t mvd_hor_abs = abs(mvd_hor);
-              const uint32_t mvd_ver_abs = abs(mvd_ver);
+          if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ state->global->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) {
+            const int32_t mvd_hor = cur_cu->inter.mvd[ref_list_idx][0];
+            const int32_t mvd_ver = cur_cu->inter.mvd[ref_list_idx][1];
+            const int8_t hor_abs_gr0 = mvd_hor != 0;
+            const int8_t ver_abs_gr0 = mvd_ver != 0;
+            const uint32_t mvd_hor_abs = abs(mvd_hor);
+            const uint32_t mvd_ver_abs = abs(mvd_ver);

-              cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]);
-              CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
-              CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");
+            cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]);
+            CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor");
+            CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver");

-              cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]);
+            cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]);

-              if (hor_abs_gr0) {
-                CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
-              }
-
-              if (ver_abs_gr0) {
-                CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
-              }
-
-              if (hor_abs_gr0) {
-                if (mvd_hor_abs > 1) {
-                  cabac_write_ep_ex_golomb(cabac,mvd_hor_abs-2, 1);
-                }
-
-                CABAC_BIN_EP(cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor");
-              }
-
-              if (ver_abs_gr0) {
-                if (mvd_ver_abs > 1) {
-                  cabac_write_ep_ex_golomb(cabac,mvd_ver_abs-2, 1);
-                }
-
-                CABAC_BIN_EP(cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver");
-              }
+            if (hor_abs_gr0) {
+              CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor");
            }

-            // Signal which candidate MV to use
-            cabac_write_unary_max_symbol(cabac, cabac->ctx.mvp_idx_model, cur_cu->inter.mv_cand, 1,
-                                        AMVP_MAX_NUM_CANDS - 1);
+            if (ver_abs_gr0) {
+              CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver");
+            }
+
+            if (hor_abs_gr0) {
+              if (mvd_hor_abs > 1) {
+                cabac_write_ep_ex_golomb(cabac,mvd_hor_abs-2, 1);
+              }
+
+              CABAC_BIN_EP(cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor");
+            }
+
+            if (ver_abs_gr0) {
+              if (mvd_ver_abs > 1) {
+                cabac_write_ep_ex_golomb(cabac,mvd_ver_abs-2, 1);
+              }
+
+              CABAC_BIN_EP(cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver");
+            }
          }
-          }
-        } // for ref_list
+
+          // Signal which candidate MV to use
+          cabac_write_unary_max_symbol(cabac, cabac->ctx.mvp_idx_model, cur_cu->inter.mv_cand[ref_list_idx], 1,
+                                      AMVP_MAX_NUM_CANDS - 1);
+        }
+      } // for ref_list
    } // if !merge

    {
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@ -64,13 +64,20 @@ typedef struct {
  
  int32_t frame;
  int32_t poc; /*!< \brief picture order count */
+  int8_t gop_offset; /*!< \brief offset in the gop structure */
  
  int8_t QP;   //!< \brief Quantization parameter
+  double QP_factor; //!< \brief Quantization factor
  
  //Current picture available references
  image_list_t *ref;
  int8_t ref_list;
-  //int8_t ref_idx_num[2];
+
+  struct {
+    int32_t poc;
+    int8_t list;
+    int8_t idx;
+  } refmap[16];
  
  int is_radl_frame;
  uint8_t pictype;
--- a/src/filter.c
+++ b/src/filter.c
@ -171,7 +171,7 @@ void filter_deblock_edge_luma(encoder_state_t * const state,
  const videoframe_t * const frame = state->tile->frame;
  const encoder_control_t * const encoder = state->encoder_control;
  
-  const cu_info_t *cu_q = videoframe_get_cu_const(frame, xpos >> MIN_SIZE, ypos >> MIN_SIZE);
+  cu_info_t *cu_q = videoframe_get_cu(frame, xpos >> MIN_SIZE, ypos >> MIN_SIZE);

  {
    // Return if called with a coordinate which is not at CU or TU boundary.
@ -191,7 +191,7 @@ void filter_deblock_edge_luma(encoder_state_t * const state,
    pixel_t *orig_src = &frame->rec->y[xpos + ypos*stride];
    pixel_t *src = orig_src;
    int32_t step = 1;
-    const cu_info_t *cu_p = NULL;
+    cu_info_t *cu_p = NULL;
    int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE;
    int8_t strength = 0;

@ -228,7 +228,8 @@ void filter_deblock_edge_luma(encoder_state_t * const state,
        }

        // CU in the side we are filtering, update every 8-pixels
-        cu_p = videoframe_get_cu_const(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? block_idx>>1 : 0));
+        cu_p = videoframe_get_cu(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? block_idx>>1 : 0));
+
        // Filter strength
        strength = 0;
        if(cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) {
@ -236,13 +237,76 @@ void filter_deblock_edge_luma(encoder_state_t * const state,
        } else if(cbf_is_set(cu_q->cbf.y, cu_q->tr_depth) || cbf_is_set(cu_p->cbf.y, cu_p->tr_depth)) {
          // Non-zero residual/coeffs and transform boundary
          // Neither CU is intra so tr_depth <= MAX_DEPTH.
-          strength = 1;
-        } else if((abs(cu_q->inter.mv[0] - cu_p->inter.mv[0]) >= 4) || (abs(cu_q->inter.mv[1] - cu_p->inter.mv[1]) >= 4)) {
+          strength = 1;       
+        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 && ((abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][0] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][0]) >= 4) || (abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][1] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][1]) >= 4))) {
          // Absolute motion vector diff between blocks >= 1 (Integer pixel)
          strength = 1;
-        } else if(cu_q->inter.mv_ref != cu_p->inter.mv_ref) {
+        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 && cu_q->inter.mv_ref[cu_q->inter.mv_dir - 1] != cu_p->inter.mv_ref[cu_p->inter.mv_dir - 1]) {
          strength = 1;
        }
+        
+        // B-slice related checks
+        if(!strength && state->global->slicetype == SLICE_B) {
+
+          // Zero all undefined motion vectors for easier usage
+          if(!(cu_q->inter.mv_dir & 1)) {
+            cu_q->inter.mv[0][0] = 0;
+            cu_q->inter.mv[0][1] = 0;
+          }
+          if(!(cu_q->inter.mv_dir & 2)) {
+            cu_q->inter.mv[1][0] = 0;
+            cu_q->inter.mv[1][1] = 0;
+          }
+
+          if(!(cu_p->inter.mv_dir & 1)) {
+            cu_p->inter.mv[0][0] = 0;
+            cu_p->inter.mv[0][1] = 0;
+          }
+          if(!(cu_p->inter.mv_dir & 2)) {
+            cu_p->inter.mv[1][0] = 0;
+            cu_p->inter.mv[1][1] = 0;
+          }
+          const int refP0 = (cu_p->inter.mv_dir & 1) ? cu_p->inter.mv_ref[0] : -1;
+          const int refP1 = (cu_p->inter.mv_dir & 2) ? cu_p->inter.mv_ref[1] : -1;
+          const int refQ0 = (cu_q->inter.mv_dir & 1) ? cu_q->inter.mv_ref[0] : -1;
+          const int refQ1 = (cu_q->inter.mv_dir & 2) ? cu_q->inter.mv_ref[1] : -1;
+          const int16_t* mvQ0 = cu_q->inter.mv[0];
+          const int16_t* mvQ1 = cu_q->inter.mv[1];
+
+          const int16_t* mvP0 = cu_p->inter.mv[0];
+          const int16_t* mvP1 = cu_p->inter.mv[1];
+
+          if(( refP0 == refQ0 &&  refP1 == refQ1 ) || ( refP0 == refQ1 && refP1==refQ0 ))
+          {
+            // Different L0 & L1
+            if ( refP0 != refP1 ) {          
+              if ( refP0 == refQ0 ) {
+                strength  = ((abs(mvQ0[0] - mvP0[0]) >= 4) ||
+                             (abs(mvQ0[1] - mvP0[1]) >= 4) ||
+                             (abs(mvQ1[0] - mvP1[0]) >= 4) ||
+                             (abs(mvQ1[1] - mvP1[1]) >= 4)) ? 1 : 0;
+              } else {
+                strength  = ((abs(mvQ1[0] - mvP0[0]) >= 4) ||
+                             (abs(mvQ1[1] - mvP0[1]) >= 4) ||
+                             (abs(mvQ0[0] - mvP1[0]) >= 4) ||
+                             (abs(mvQ0[1] - mvP1[1]) >= 4)) ? 1 : 0;
+              }
+            // Same L0 & L1
+            } else {  
+              strength  = ((abs(mvQ0[0] - mvP0[0]) >= 4) ||
+                           (abs(mvQ0[1] - mvP0[1]) >= 4) ||
+                           (abs(mvQ1[0] - mvP1[0]) >= 4) ||
+                           (abs(mvQ1[1] - mvP1[1]) >= 4)) &&
+                          ((abs(mvQ1[0] - mvP0[0]) >= 4) ||
+                           (abs(mvQ1[1] - mvP0[1]) >= 4) ||
+                           (abs(mvQ0[0] - mvP1[0]) >= 4) ||
+                           (abs(mvQ0[1] - mvP1[1]) >= 4)) ? 1 : 0;
+            }
+          } else {
+            strength = 1;
+          }
+        }
+
        tc_index        = CLIP(0, 51 + 2, (int32_t)(qp + 2*(strength - 1) + (tc_offset_div2 << 1)));
        tc              = g_tc_table_8x8[tc_index] * bitdepth_scale;
        thr_cut         = tc * 10;
--- a/src/global.h
+++ b/src/global.h
@ -66,6 +66,9 @@ typedef int16_t coeff_t;
 #define PU_DEPTH_INTRA_MIN 0
 #define PU_DEPTH_INTRA_MAX 4

+// Maximum length of GoP (for allocating structures)
+#define MAX_GOP 32
+
 // Maximum CU depth when descending form LCU level.
 #define MAX_DEPTH 3  /*!< spec: log2_diff_max_min_luma_coding_block_size */
 // Minimum log2 size of CUs.
--- a/src/inter.c
+++ b/src/inter.c
@ -55,11 +55,8 @@ void inter_set_block(videoframe_t* frame, uint32_t x_cu, uint32_t y_cu, uint8_t
      cu->depth = depth;
      cu->type  = CU_INTER;
      cu->part_size = SIZE_2Nx2N;
-      cu->inter.mode   = cur_cu->inter.mode;
-      cu->inter.mv[0]  = cur_cu->inter.mv[0];
-      cu->inter.mv[1]  = cur_cu->inter.mv[1];
-      cu->inter.mv_dir = cur_cu->inter.mv_dir;
-      cu->inter.mv_ref = cur_cu->inter.mv_ref;
+      memcpy(&cu->inter, &cur_cu->inter, sizeof(cur_cu->inter));
+      
      cu->tr_depth = tr_depth;
    }
  }
@ -329,6 +326,71 @@ void inter_recon_lcu(const encoder_state_t * const state, const image_t * const
  }
 }

+/**
+ * \brief Reconstruct a bi-predicted inter block.
+ *
+ * Each of the two predictors is built with inter_recon_lcu(); the first one
+ * is saved to scratch buffers, the second one is left in lcu->rec, and the
+ * two are then merged in place with a rounded per-pixel average.
+ *
+ * \param state     encoder state, passed through to inter_recon_lcu()
+ * \param ref1      reference picture of the first predictor (uses mv_param[0])
+ * \param ref2      reference picture of the second predictor (uses mv_param[1])
+ * \param xpos      block x position (masked with LCU_WIDTH-1 to index the LCU)
+ * \param ypos      block y position (masked with LCU_WIDTH-1 to index the LCU)
+ * \param width     block width in luma pixels; the block is treated as square
+ * \param mv_param  motion vectors, one per predictor
+ * \param lcu       destination lcu
+ * \returns Void
+ */
+void inter_recon_lcu_bipred(const encoder_state_t * const state, const image_t * ref1, const image_t * ref2, int32_t xpos, int32_t ypos, int32_t width, const int16_t mv_param[2][2], lcu_t* lcu) {
+  // Scratch copies of the first predictor. They are sized from the same
+  // LCU dimensions the averaging loops below use for indexing, instead of
+  // hard-coded 64x64 / 32x32 literals.
+  pixel_t temp_lcu_y[(LCU_WIDTH) * (LCU_WIDTH)];
+  pixel_t temp_lcu_u[(LCU_WIDTH_C) * (LCU_WIDTH_C)];
+  pixel_t temp_lcu_v[(LCU_WIDTH_C) * (LCU_WIDTH_C)];
+  int temp_x, temp_y;
+  // TODO: interpolated values require 14-bit accuracy for bi-prediction, current implementation of ipol filters round the value to 8bits
+
+  // Reconstruct the first predictor and stash it away
+  inter_recon_lcu(state, ref1, xpos, ypos, width, mv_param[0], lcu);
+  memcpy(temp_lcu_y, lcu->rec.y, sizeof(temp_lcu_y));
+  memcpy(temp_lcu_u, lcu->rec.u, sizeof(temp_lcu_u));
+  memcpy(temp_lcu_v, lcu->rec.v, sizeof(temp_lcu_v));
+
+  // Reconstruct the second predictor directly into the lcu
+  inter_recon_lcu(state, ref2, xpos, ypos, width, mv_param[1], lcu);
+
+  // Merge the predictors: luma, rounded average of each pixel
+  for (temp_y = 0; temp_y < width; ++temp_y) {
+    int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+    for (temp_x = 0; temp_x < width; ++temp_x) {
+      int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+      int idx = y_in_lcu * LCU_WIDTH + x_in_lcu;
+      lcu->rec.y[idx] = (pixel_t)(((int)lcu->rec.y[idx] + (int)temp_lcu_y[idx] + 1) >> 1);
+    }
+  }
+
+  // Merge the predictors: both chroma planes, at half the luma resolution
+  for (temp_y = 0; temp_y < width>>1; ++temp_y) {
+    int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+    for (temp_x = 0; temp_x < width>>1; ++temp_x) {
+      int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+      int idx = y_in_lcu * LCU_WIDTH_C + x_in_lcu;
+      lcu->rec.u[idx] = (pixel_t)(((int)lcu->rec.u[idx] + (int)temp_lcu_u[idx] + 1) >> 1);
+      lcu->rec.v[idx] = (pixel_t)(((int)lcu->rec.v[idx] + (int)temp_lcu_v[idx] + 1) >> 1);
+    }
+  }
+}
+
+/**
+ * \brief Reset the motion data of every reference list the CU does not use.
+ *
+ * For each of the two lists whose bit is not set in cu->inter.mv_dir
+ * (bit 0 for list 0, bit 1 for list 1), the motion vector is zeroed and the
+ * reference index is set to 255.
+ *
+ * \param cu coding unit to clear
+ */
+static void inter_clear_cu_unused(cu_info_t* cu) {
+  int list;
+  for (list = 0; list < 2; list++) {
+    if (cu->inter.mv_dir & (1 << list)) {
+      // This list is in use, leave its motion data untouched
+      continue;
+    }
+    cu->inter.mv[list][0] = 0;
+    cu->inter.mv[list][1] = 0;
+    cu->inter.mv_ref[list] = 255;
+  }
+}
+
 /**
 * \brief Get merge candidates for current block
 * \param encoder encoder control struct to use
@ -362,11 +424,13 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i
  if (x != 0) {
    *a1 = &cu[x_cu - 1 + (y_cu + cur_block_in_scu - 1) * LCU_T_CU_WIDTH];
    if (!(*a1)->coded) *a1 = NULL;
+    if(*a1) inter_clear_cu_unused(*a1);

    if (y_cu + cur_block_in_scu < LCU_WIDTH>>3) {
      *a0 = &cu[x_cu - 1 + (y_cu + cur_block_in_scu) * LCU_T_CU_WIDTH];
      if (!(*a0)->coded) *a0 = NULL;
    }
+    if(*a0) inter_clear_cu_unused(*a0);
  }

  // B0, B1 and B2 availability testing
@ -379,14 +443,17 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i
      *b0 = &lcu->cu[LCU_T_CU_WIDTH*LCU_T_CU_WIDTH];
      if (!(*b0)->coded) *b0 = NULL;
    }
+    if(*b0) inter_clear_cu_unused(*b0);

    *b1 = &cu[x_cu + cur_block_in_scu - 1 + (y_cu - 1) * LCU_T_CU_WIDTH];
    if (!(*b1)->coded) *b1 = NULL;
+    if(*b1) inter_clear_cu_unused(*b1);

    if (x != 0) {
      *b2 = &cu[x_cu - 1 + (y_cu - 1) * LCU_T_CU_WIDTH];
      if(!(*b2)->coded) *b2 = NULL;
    }
+    if(*b2) inter_clear_cu_unused(*b2);
  }
 }

@ -398,61 +465,109 @@ void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_i
 * \param depth current block depth
 * \param mv_pred[2][2] 2x motion vector prediction
 */
-void inter_get_mv_cand(const encoder_state_t * const state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info_t* cur_cu, lcu_t *lcu)
+void inter_get_mv_cand(const encoder_state_t * const state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info_t* cur_cu, lcu_t *lcu, int8_t reflist)
 {
  uint8_t candidates = 0;
  uint8_t b_candidates = 0;
+  int8_t reflist2nd = !reflist;

  cu_info_t *b0, *b1, *b2, *a0, *a1;
  b0 = b1 = b2 = a0 = a1 = NULL;
  inter_get_spatial_merge_candidates(x, y, depth, &b0, &b1, &b2, &a0, &a1, lcu);

 #define CALCULATE_SCALE(cu,tb,td) ((tb * ((0x4000 + (abs(td)>>1))/td) + 32) >> 6)
-#define APPLY_MV_SCALING(cu, cand) {int td = state->global->poc - state->global->ref->images[(cu)->inter.mv_ref]->poc;\
-                                   int tb = state->global->poc - state->global->ref->images[cur_cu->inter.mv_ref]->poc;\
+#define APPLY_MV_SCALING(cu, cand, list) {int td = state->global->poc - state->global->ref->images[(cu)->inter.mv_ref[list]]->poc;\
+                                   int tb = state->global->poc - state->global->ref->images[cur_cu->inter.mv_ref[reflist]]->poc;\
                                   if (td != tb) { \
                                      int scale = CALCULATE_SCALE(cu,tb,td); \
-                                       mv_cand[cand][0] = ((scale * (cu)->inter.mv[0] + 127 + (scale * (cu)->inter.mv[0] < 0)) >> 8 ); \
-                                       mv_cand[cand][1] = ((scale * (cu)->inter.mv[1] + 127 + (scale * (cu)->inter.mv[1] < 0)) >> 8 ); }}
+                                       mv_cand[cand][0] = ((scale * (cu)->inter.mv[list][0] + 127 + (scale * (cu)->inter.mv[list][0] < 0)) >> 8 ); \
+                                       mv_cand[cand][1] = ((scale * (cu)->inter.mv[list][1] + 127 + (scale * (cu)->inter.mv[list][1] < 0)) >> 8 ); }}

  // Left predictors
-  if (a0 && a0->type == CU_INTER && a0->inter.mv_ref == cur_cu->inter.mv_ref) {
-    mv_cand[candidates][0] = a0->inter.mv[0];
-    mv_cand[candidates][1] = a0->inter.mv[1];
+  if (a0 && a0->type == CU_INTER && (
+    ((a0->inter.mv_dir & 1) && a0->inter.mv_ref[0] == cur_cu->inter.mv_ref[reflist]) ||
+    ((a0->inter.mv_dir & 2) && a0->inter.mv_ref[1] == cur_cu->inter.mv_ref[reflist]))) {
+    if (a0->inter.mv_dir & (1 << reflist) && a0->inter.mv_ref[reflist] == cur_cu->inter.mv_ref[reflist]) {
+      mv_cand[candidates][0] = a0->inter.mv[reflist][0];
+      mv_cand[candidates][1] = a0->inter.mv[reflist][1];
+    } else {
+      mv_cand[candidates][0] = a0->inter.mv[reflist2nd][0];
+      mv_cand[candidates][1] = a0->inter.mv[reflist2nd][1];
+    }
    candidates++;
-  } else if (a1 && a1->type == CU_INTER && a1->inter.mv_ref == cur_cu->inter.mv_ref) {
-    mv_cand[candidates][0] = a1->inter.mv[0];
-    mv_cand[candidates][1] = a1->inter.mv[1];
+  } else if (a1 && a1->type == CU_INTER && (
+    ((a1->inter.mv_dir & 1) && a1->inter.mv_ref[0] == cur_cu->inter.mv_ref[reflist]) ||
+    ((a1->inter.mv_dir & 2) && a1->inter.mv_ref[1] == cur_cu->inter.mv_ref[reflist]))) {
+    if (a1->inter.mv_dir & (1 << reflist) && a1->inter.mv_ref[reflist] == cur_cu->inter.mv_ref[reflist]) {
+      mv_cand[candidates][0] = a1->inter.mv[reflist][0];
+      mv_cand[candidates][1] = a1->inter.mv[reflist][1];
+    } else {
+      mv_cand[candidates][0] = a1->inter.mv[reflist2nd][0];
+      mv_cand[candidates][1] = a1->inter.mv[reflist2nd][1];
+    }
    candidates++;
  }

  if(!candidates) {
      // Left predictors
    if (a0 && a0->type == CU_INTER) {
-      mv_cand[candidates][0] = a0->inter.mv[0];
-      mv_cand[candidates][1] = a0->inter.mv[1];
-      APPLY_MV_SCALING(a0, candidates);
+      if (a0->inter.mv_dir & (1 << reflist)) {
+        mv_cand[candidates][0] = a0->inter.mv[reflist][0];
+        mv_cand[candidates][1] = a0->inter.mv[reflist][1];
+        APPLY_MV_SCALING(a0, candidates, reflist);
+      } else {
+        mv_cand[candidates][0] = a0->inter.mv[reflist2nd][0];
+        mv_cand[candidates][1] = a0->inter.mv[reflist2nd][1];
+        APPLY_MV_SCALING(a0, candidates, reflist2nd);
+      }
      candidates++;
    } else if (a1 && a1->type == CU_INTER) {
-      mv_cand[candidates][0] = a1->inter.mv[0];
-      mv_cand[candidates][1] = a1->inter.mv[1];
-      APPLY_MV_SCALING(a1, candidates);
+      if (a1->inter.mv_dir & (1 << reflist)) {
+        mv_cand[candidates][0] = a1->inter.mv[reflist][0];
+        mv_cand[candidates][1] = a1->inter.mv[reflist][1];
+        APPLY_MV_SCALING(a1, candidates, reflist);
+      } else {
+        mv_cand[candidates][0] = a1->inter.mv[reflist2nd][0];
+        mv_cand[candidates][1] = a1->inter.mv[reflist2nd][1];
+        APPLY_MV_SCALING(a1, candidates, reflist2nd);
+      }
      candidates++;
    }
  }

  // Top predictors
-  if (b0 && b0->type == CU_INTER && b0->inter.mv_ref == cur_cu->inter.mv_ref) {
-    mv_cand[candidates][0] = b0->inter.mv[0];
-    mv_cand[candidates][1] = b0->inter.mv[1];
+  if (b0 && b0->type == CU_INTER && (
+    ((b0->inter.mv_dir & 1) && b0->inter.mv_ref[0] == cur_cu->inter.mv_ref[reflist]) ||
+    ((b0->inter.mv_dir & 2) && b0->inter.mv_ref[1] == cur_cu->inter.mv_ref[reflist]))) {
+    if (b0->inter.mv_dir & (1 << reflist) && b0->inter.mv_ref[reflist] == cur_cu->inter.mv_ref[reflist]) {
+      mv_cand[candidates][0] = b0->inter.mv[reflist][0];
+      mv_cand[candidates][1] = b0->inter.mv[reflist][1];
+    } else {
+      mv_cand[candidates][0] = b0->inter.mv[reflist2nd][0];
+      mv_cand[candidates][1] = b0->inter.mv[reflist2nd][1];
+    }
    b_candidates++;
-  } else if (b1 && b1->type == CU_INTER && b1->inter.mv_ref == cur_cu->inter.mv_ref) {
-    mv_cand[candidates][0] = b1->inter.mv[0];
-    mv_cand[candidates][1] = b1->inter.mv[1];
+  } else if (b1 && b1->type == CU_INTER && (
+    ((b1->inter.mv_dir & 1) && b1->inter.mv_ref[0] == cur_cu->inter.mv_ref[reflist]) ||
+    ((b1->inter.mv_dir & 2) && b1->inter.mv_ref[1] == cur_cu->inter.mv_ref[reflist]))) {
+    if (b1->inter.mv_dir & (1 << reflist) && b1->inter.mv_ref[reflist] == cur_cu->inter.mv_ref[reflist]) {
+      mv_cand[candidates][0] = b1->inter.mv[reflist][0];
+      mv_cand[candidates][1] = b1->inter.mv[reflist][1];
+    } else {
+      mv_cand[candidates][0] = b1->inter.mv[reflist2nd][0];
+      mv_cand[candidates][1] = b1->inter.mv[reflist2nd][1];
+    }
    b_candidates++;
-  } else if(b2 && b2->type == CU_INTER && b2->inter.mv_ref == cur_cu->inter.mv_ref) {
-    mv_cand[candidates][0] = b2->inter.mv[0];
-    mv_cand[candidates][1] = b2->inter.mv[1];
+  } else if (b2 && b2->type == CU_INTER && (
+    ((b2->inter.mv_dir & 1) && b2->inter.mv_ref[0] == cur_cu->inter.mv_ref[reflist]) ||
+    ((b2->inter.mv_dir & 2) && b2->inter.mv_ref[1] == cur_cu->inter.mv_ref[reflist]))) {
+    if (b2->inter.mv_dir & (1 << reflist) && b2->inter.mv_ref[reflist] == cur_cu->inter.mv_ref[reflist]) {
+      mv_cand[candidates][0] = b2->inter.mv[reflist][0];
+      mv_cand[candidates][1] = b2->inter.mv[reflist][1];
+    } else {
+      mv_cand[candidates][0] = b2->inter.mv[reflist2nd][0];
+      mv_cand[candidates][1] = b2->inter.mv[reflist2nd][1];
+    }
    b_candidates++;
  }
  candidates += b_candidates;
@ -467,19 +582,37 @@ void inter_get_mv_cand(const encoder_state_t * const state, int32_t x, int32_t y
  if(!b_candidates) {
    // Top predictors
    if (b0 && b0->type == CU_INTER) {
-      mv_cand[candidates][0] = b0->inter.mv[0];
-      mv_cand[candidates][1] = b0->inter.mv[1];
-      APPLY_MV_SCALING(b0, candidates);
+      if (b0->inter.mv_dir & (1 << reflist)) {
+        mv_cand[candidates][0] = b0->inter.mv[reflist][0];
+        mv_cand[candidates][1] = b0->inter.mv[reflist][1];
+        APPLY_MV_SCALING(b0, candidates, reflist);
+      } else {
+        mv_cand[candidates][0] = b0->inter.mv[reflist2nd][0];
+        mv_cand[candidates][1] = b0->inter.mv[reflist2nd][1];
+        APPLY_MV_SCALING(b0, candidates, reflist2nd);
+      }
      candidates++;
    } else if (b1 && b1->type == CU_INTER) {
-      mv_cand[candidates][0] = b1->inter.mv[0];
-      mv_cand[candidates][1] = b1->inter.mv[1];
-      APPLY_MV_SCALING(b1, candidates);
+      if (b1->inter.mv_dir & (1 << reflist)) {
+        mv_cand[candidates][0] = b1->inter.mv[reflist][0];
+        mv_cand[candidates][1] = b1->inter.mv[reflist][1];
+        APPLY_MV_SCALING(b1, candidates, reflist);
+      } else {
+        mv_cand[candidates][0] = b1->inter.mv[reflist2nd][0];
+        mv_cand[candidates][1] = b1->inter.mv[reflist2nd][1];
+        APPLY_MV_SCALING(b1, candidates, reflist2nd);
+      }
      candidates++;
    } else if(b2 && b2->type == CU_INTER) {
-      mv_cand[candidates][0] = b2->inter.mv[0];
-      mv_cand[candidates][1] = b2->inter.mv[1];
-      APPLY_MV_SCALING(b2, candidates);
+      if (b2->inter.mv_dir & (1 << reflist)) {
+        mv_cand[candidates][0] = b2->inter.mv[reflist][0];
+        mv_cand[candidates][1] = b2->inter.mv[reflist][1];
+        APPLY_MV_SCALING(b2, candidates, reflist);
+      } else {
+        mv_cand[candidates][0] = b2->inter.mv[reflist2nd][0];
+        mv_cand[candidates][1] = b2->inter.mv[reflist2nd][1];
+        APPLY_MV_SCALING(b2, candidates, reflist2nd);
+      }
      candidates++;
    }
  }
@ -513,7 +646,7 @@ void inter_get_mv_cand(const encoder_state_t * const state, int32_t x, int32_t y
 * \param depth current block depth
 * \param mv_pred[MRG_MAX_NUM_CANDS][2] MRG_MAX_NUM_CANDS motion vector prediction
 */
-uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu)
+uint8_t inter_get_merge_cand(const encoder_state_t * const state, int32_t x, int32_t y, int8_t depth, inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], lcu_t *lcu)
 {
  uint8_t candidates = 0;
  int8_t duplicate = 0;
@ -525,23 +658,38 @@ uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand


 #define CHECK_DUPLICATE(CU1,CU2) {duplicate = 0; if ((CU2) && (CU2)->type == CU_INTER && \
-                                                     (CU1)->inter.mv[0] == (CU2)->inter.mv[0] && \
-                                                     (CU1)->inter.mv[1] == (CU2)->inter.mv[1] && \
-                                                     (CU1)->inter.mv_ref == (CU2)->inter.mv_ref) duplicate = 1; }
+                                                     (CU1)->inter.mv_dir == (CU2)->inter.mv_dir && \
+                                                    (!(((CU1)->inter.mv_dir & 1) && ((CU2)->inter.mv_dir & 1)) || \
+                                                      ((CU1)->inter.mv[0][0] == (CU2)->inter.mv[0][0] && \
+                                                       (CU1)->inter.mv[0][1] ==  (CU2)->inter.mv[0][1] && \
+                                                       (CU1)->inter.mv_ref[0] == (CU2)->inter.mv_ref[0]) ) && \
+                                                    (!(((CU1)->inter.mv_dir & 2) && ((CU2)->inter.mv_dir & 2) )  || \
+                                                      ((CU1)->inter.mv[1][0] == (CU2)->inter.mv[1][0] && \
+                                                       (CU1)->inter.mv[1][1] == (CU2)->inter.mv[1][1] && \
+                                                       (CU1)->inter.mv_ref[1] == (CU2)->inter.mv_ref[1]) ) \
+                                                      ) duplicate = 1; }

  if (a1 && a1->type == CU_INTER) {
-      mv_cand[candidates][0] = a1->inter.mv[0];
-      mv_cand[candidates][1] = a1->inter.mv[1];
-      mv_cand[candidates][2] = a1->inter.mv_ref;
-      candidates++;
+    mv_cand[candidates].mv[0][0] = a1->inter.mv[0][0];
+    mv_cand[candidates].mv[0][1] = a1->inter.mv[0][1];
+    mv_cand[candidates].mv[1][0] = a1->inter.mv[1][0];
+    mv_cand[candidates].mv[1][1] = a1->inter.mv[1][1];
+    mv_cand[candidates].ref[0] = a1->inter.mv_ref[0];
+    mv_cand[candidates].ref[1] = a1->inter.mv_ref[1];
+    mv_cand[candidates].dir = a1->inter.mv_dir;
+    candidates++;
  }

  if (b1 && b1->type == CU_INTER) {
    if(candidates) CHECK_DUPLICATE(b1, a1);
    if(!duplicate) {
-      mv_cand[candidates][0] = b1->inter.mv[0];
-      mv_cand[candidates][1] = b1->inter.mv[1];
-      mv_cand[candidates][2] = b1->inter.mv_ref;
+      mv_cand[candidates].mv[0][0] = b1->inter.mv[0][0];
+      mv_cand[candidates].mv[0][1] = b1->inter.mv[0][1];
+      mv_cand[candidates].mv[1][0] = b1->inter.mv[1][0];
+      mv_cand[candidates].mv[1][1] = b1->inter.mv[1][1];
+      mv_cand[candidates].ref[0] = b1->inter.mv_ref[0];
+      mv_cand[candidates].ref[1] = b1->inter.mv_ref[1];
+      mv_cand[candidates].dir = b1->inter.mv_dir;
      candidates++;
    }
  }
@ -549,9 +697,13 @@ uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand
  if (b0 && b0->type == CU_INTER) {
    if(candidates) CHECK_DUPLICATE(b0,b1);
    if(!duplicate) {
-      mv_cand[candidates][0] = b0->inter.mv[0];
-      mv_cand[candidates][1] = b0->inter.mv[1];
-      mv_cand[candidates][2] = b0->inter.mv_ref;
+      mv_cand[candidates].mv[0][0] = b0->inter.mv[0][0];
+      mv_cand[candidates].mv[0][1] = b0->inter.mv[0][1];
+      mv_cand[candidates].mv[1][0] = b0->inter.mv[1][0];
+      mv_cand[candidates].mv[1][1] = b0->inter.mv[1][1];
+      mv_cand[candidates].ref[0] = b0->inter.mv_ref[0];
+      mv_cand[candidates].ref[1] = b0->inter.mv_ref[1];
+      mv_cand[candidates].dir = b0->inter.mv_dir;
      candidates++;
    }
  }
@ -559,9 +711,13 @@ uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand
  if (a0 && a0->type == CU_INTER) {
    if(candidates) CHECK_DUPLICATE(a0,a1);
    if(!duplicate) {
-      mv_cand[candidates][0] = a0->inter.mv[0];
-      mv_cand[candidates][1] = a0->inter.mv[1];
-      mv_cand[candidates][2] = a0->inter.mv_ref;
+      mv_cand[candidates].mv[0][0] = a0->inter.mv[0][0];
+      mv_cand[candidates].mv[0][1] = a0->inter.mv[0][1];
+      mv_cand[candidates].mv[1][0] = a0->inter.mv[1][0];
+      mv_cand[candidates].mv[1][1] = a0->inter.mv[1][1];
+      mv_cand[candidates].ref[0] = a0->inter.mv_ref[0];
+      mv_cand[candidates].ref[1] = a0->inter.mv_ref[1];
+      mv_cand[candidates].dir = a0->inter.mv_dir;
      candidates++;
    }
  }
@ -572,9 +728,13 @@ uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand
      if(!duplicate) {
        CHECK_DUPLICATE(b2,b1);
        if(!duplicate) {
-          mv_cand[candidates][0] = b2->inter.mv[0];
-          mv_cand[candidates][1] = b2->inter.mv[1];
-          mv_cand[candidates][2] = b2->inter.mv_ref;
+          mv_cand[candidates].mv[0][0] = b2->inter.mv[0][0];
+          mv_cand[candidates].mv[0][1] = b2->inter.mv[0][1];
+          mv_cand[candidates].mv[1][0] = b2->inter.mv[1][0];
+          mv_cand[candidates].mv[1][1] = b2->inter.mv[1][1];
+          mv_cand[candidates].ref[0] = b2->inter.mv_ref[0];
+          mv_cand[candidates].ref[1] = b2->inter.mv_ref[1];
+          mv_cand[candidates].dir = b2->inter.mv_dir;
          candidates++;
        }
      }
@ -588,11 +748,71 @@ uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand
  }
 #endif

+  if (candidates == MRG_MAX_NUM_CANDS) return MRG_MAX_NUM_CANDS;
+
+  if (state->global->slicetype == SLICE_B) {
+    #define NUM_PRIORITY_LIST 12 // number of entries in priorityList0/priorityList1; no trailing ';' so it is usable in expressions
+    static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
+    static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
+    uint8_t cutoff = candidates;
+    for (int32_t idx = 0; idx<cutoff*(cutoff - 1) && candidates != MRG_MAX_NUM_CANDS; idx++) {
+      uint8_t i = priorityList0[idx];
+      uint8_t j = priorityList1[idx];
+      if (i >= candidates || j >= candidates) break;
+
+      // Find one L0 and L1 candidate according to the priority list
+      if ((mv_cand[i].dir & 0x1) && (mv_cand[j].dir & 0x2)) {
+        mv_cand[candidates].dir = 3;
+
+        // get Mv from cand[i] and cand[j]
+        mv_cand[candidates].mv[0][0] = mv_cand[i].mv[0][0];
+        mv_cand[candidates].mv[0][1] = mv_cand[i].mv[0][1];
+        mv_cand[candidates].mv[1][0] = mv_cand[j].mv[1][0];
+        mv_cand[candidates].mv[1][1] = mv_cand[j].mv[1][1];
+        mv_cand[candidates].ref[0]   = mv_cand[i].ref[0];
+        mv_cand[candidates].ref[1]   = mv_cand[j].ref[1];
+
+        if (mv_cand[i].ref[0] == mv_cand[j].ref[1] &&
+          mv_cand[i].mv[0][0] == mv_cand[j].mv[1][0] && 
+          mv_cand[i].mv[0][1] == mv_cand[j].mv[1][1]) {
+          // Not a candidate
+        } else {
+          candidates++;
+        }
+      }
+    }
+  }
+
+  if (candidates == MRG_MAX_NUM_CANDS) return MRG_MAX_NUM_CANDS;
+
+  int num_ref = state->global->ref->used_size;
+
+  if (state->global->slicetype == SLICE_B) {
+    int j;
+    int ref_negative = 0;
+    int ref_positive = 0;
+    for (j = 0; j < state->global->ref->used_size; j++) {
+      if (state->global->ref->images[j]->poc < state->global->poc) {
+        ref_negative++;
+      } else {
+        ref_positive++;
+      }
+    }
+    num_ref = MIN(ref_negative, ref_positive);
+  }
+  
  // Add (0,0) prediction
-  if (candidates != 5) {
-    mv_cand[candidates][0] = 0;
-    mv_cand[candidates][1] = 0;
-    mv_cand[candidates][2] = zero_idx;
+  while (candidates != MRG_MAX_NUM_CANDS) {
+    mv_cand[candidates].mv[0][0] = 0;
+    mv_cand[candidates].mv[0][1] = 0;
+    mv_cand[candidates].ref[0] = (zero_idx>=num_ref-1)?0:zero_idx;
+    mv_cand[candidates].ref[1] = mv_cand[candidates].ref[0];
+    mv_cand[candidates].dir = 1;
+    if (state->global->slicetype == SLICE_B) {
+      mv_cand[candidates].mv[1][0] = 0;
+      mv_cand[candidates].mv[1][1] = 0;
+      mv_cand[candidates].dir = 3;
+    }
    zero_idx++;
    candidates++;
  }
--- a/src/inter.h
+++ b/src/inter.h
@ -31,12 +31,20 @@
 #include "encoder.h"
 #include "encoderstate.h"

+typedef struct { // One merge-mode candidate: prediction direction plus a reference index and motion vector per reference list
+  uint8_t dir; //!< \brief Direction bitmask: bit 0 (value 1) = list 0 used, bit 1 (value 2) = list 1 used, 3 = both (bi-prediction)
+  uint8_t ref[2]; //!< \brief Reference index per list; used to index state->global->ref->images[] (spatial candidates copy the CU's inter.mv_ref[])
+  int16_t mv[2][2]; //!< \brief mv[list][0] = x, mv[list][1] = y; searches shift right by 2 to get integer positions (presumably quarter-pel units)
+
+} inter_merge_cand_t;
+

 //void inter_set_block(image* im,uint32_t x_cu, uint32_t y_cu, uint8_t depth, cu_info *cur_cu);
 void inter_recon_lcu(const encoder_state_t * const state, const image_t * ref, int32_t xpos, int32_t ypos, int32_t width, const int16_t mv_param[2], lcu_t* lcu);
+void inter_recon_lcu_bipred(const encoder_state_t * const state, const image_t * ref1, const image_t * ref2, int32_t xpos, int32_t ypos, int32_t width, const int16_t mv_param[2][2], lcu_t* lcu);

 void inter_get_spatial_merge_candidates(int32_t x, int32_t y, int8_t depth, cu_info_t **b0, cu_info_t **b1,
                                        cu_info_t **b2, cu_info_t **a0, cu_info_t **a1, lcu_t *lcu);
-void inter_get_mv_cand(const encoder_state_t *state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info_t* cur_cu, lcu_t *lcu);
-uint8_t inter_get_merge_cand(int32_t x, int32_t y, int8_t depth, int16_t mv_cand[MRG_MAX_NUM_CANDS][3], lcu_t *lcu);
+void inter_get_mv_cand(const encoder_state_t *state, int32_t x, int32_t y, int8_t depth, int16_t mv_cand[2][2], cu_info_t* cur_cu, lcu_t *lcu, int8_t reflist);
+uint8_t inter_get_merge_cand(const encoder_state_t *state, int32_t x, int32_t y, int8_t depth, inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], lcu_t *lcu);
 #endif
--- a/src/search.c
+++ b/src/search.c
@ -162,7 +162,7 @@ static uint32_t get_mvd_coding_cost(vector2d_t *mvd)
 }

 static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int mv_shift,
-                         int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+                         int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                         int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
 {
  uint32_t temp_bitcost = 0;
@ -177,9 +177,10 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int

  // Check every candidate to find a match
  for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
-    if (merge_cand[merge_idx][0] == x &&
-        merge_cand[merge_idx][1] == y &&
-        merge_cand[merge_idx][2] == ref_idx) {
+    if (merge_cand[merge_idx].dir == 3) continue;
+    if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x &&
+        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y &&
+        merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
      temp_bitcost += merge_idx;
      merged = 1;
      break;
@ -208,7 +209,7 @@ static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int

 unsigned tz_pattern_search(const encoder_state_t * const state, const image_t *pic, const image_t *ref, unsigned pattern_type,
                           const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
-                           int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
+                           int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
                           int block_width, int max_lcu_below)
 {
  int n_points;
@ -365,7 +366,7 @@ unsigned tz_pattern_search(const encoder_state_t * const state, const image_t *p

 unsigned tz_raster_search(const encoder_state_t * const state, const image_t *pic, const image_t *ref,
                          const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
-                          int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
+                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
                          int block_width, int iSearchRange, int iRaster, int max_lcu_below)
 {
  int i;
@ -417,7 +418,7 @@ unsigned tz_raster_search(const encoder_state_t * const state, const image_t *pi
 static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
                          const image_t *pic, const image_t *ref,
                          const vector2d_t *orig, vector2d_t *mv_in_out,
-                          int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+                          int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                          int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
 {

@ -469,8 +470,9 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
  // both mv_cand vectors and (0, 0).
  for (i = 0; i < num_cand; ++i) 
  {
-    mv.x = merge_cand[i][0] >> 2;
-    mv.y = merge_cand[i][1] >> 2;
+    if (merge_cand[i].dir == 3) continue;
+    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
+    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;

    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);

@ -495,8 +497,8 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
  }
  
  if (best_index < (unsigned)num_cand) {
-    mv.x = merge_cand[best_index][0] >> 2;
-    mv.y = merge_cand[best_index][1] >> 2;
+    mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2;
+    mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2;
  } else {
    mv.x = mv_in_out->x >> 2;
    mv.y = mv_in_out->y >> 2;
@ -575,7 +577,7 @@ static unsigned tz_search(const encoder_state_t * const state, unsigned depth,
 static unsigned hexagon_search(const encoder_state_t * const state, unsigned depth,
                               const image_t *pic, const image_t *ref,
                               const vector2d_t *orig, vector2d_t *mv_in_out,
-                               int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+                               int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                               int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
 {
  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
@ -593,7 +595,9 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
  // Check mv_in, if it's not in merge candidates.
  bool mv_in_merge_cand = false;
  for (int i = 0; i < num_cand; ++i) {
-    if (merge_cand[i][0] >> 2 == mv.x && merge_cand[i][1] == mv.y) {
+    if (merge_cand[i].dir == 3) continue;
+    if (merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2 == mv.x &&
+        merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2 == mv.y) {
      mv_in_merge_cand = true;
      break;
    }
@ -620,8 +624,9 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
  // Select starting point from among merge candidates. These should include
  // both mv_cand vectors and (0, 0).
  for (i = 0; i < num_cand; ++i) {
-    mv.x = merge_cand[i][0] >> 2;
-    mv.y = merge_cand[i][1] >> 2;
+    if (merge_cand[i].dir == 3) continue;
+    mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2;
+    mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2;

    PERFORMANCE_MEASURE_START(_DEBUG_PERF_SEARCH_PIXELS);

@ -644,8 +649,8 @@ static unsigned hexagon_search(const encoder_state_t * const state, unsigned dep
    }
  }
  if (best_index < num_cand) {
-    mv.x = merge_cand[best_index][0] >> 2;
-    mv.y = merge_cand[best_index][1] >> 2;
+    mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2;
+    mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2;
  } else {
    mv.x = mv_in_out->x >> 2;
    mv.y = mv_in_out->y >> 2;
@ -836,7 +841,7 @@ static unsigned search_frac(const encoder_state_t * const state,
                            unsigned depth,
                            const image_t *pic, const image_t *ref,
                            const vector2d_t *orig, vector2d_t *mv_in_out,
-                            int16_t mv_cand[2][2], int16_t merge_cand[MRG_MAX_NUM_CANDS][3],
+                            int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                            int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
 {

@ -979,12 +984,14 @@ static int search_cu_inter(const encoder_state_t * const state, int x, int y, in

  int16_t mv_cand[2][2];
  // Search for merge mode candidate
-  int16_t merge_cand[MRG_MAX_NUM_CANDS][3];
+  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS];
  // Get list of candidates
-  int16_t num_cand = inter_get_merge_cand(x, y, depth, merge_cand, lcu);
+  int16_t num_cand = inter_get_merge_cand(state, x, y, depth, merge_cand, lcu);

-  // Select better candidate
-  cur_cu->inter.mv_cand = 0; // Default to candidate 0
+
+  // Default to candidate 0
+  cur_cu->inter.mv_cand[0] = 0;
+  cur_cu->inter.mv_cand[1] = 0;

  cur_cu->inter.cost = UINT_MAX;

@ -996,13 +1003,14 @@ static int search_cu_inter(const encoder_state_t * const state, int x, int y, in
    int32_t merged = 0;
    uint8_t cu_mv_cand = 0;
    int8_t merge_idx = 0;
-    int8_t temp_ref_idx = cur_cu->inter.mv_ref;
+    int8_t ref_list = state->global->refmap[ref_idx].list-1;
+    int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
    orig.x = x_cu * CU_MIN_SIZE_PIXELS;
    orig.y = y_cu * CU_MIN_SIZE_PIXELS;
    // Get MV candidates
-    cur_cu->inter.mv_ref = ref_idx;
-    inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu);
-    cur_cu->inter.mv_ref = temp_ref_idx;
+    cur_cu->inter.mv_ref[ref_list] = ref_idx;
+    inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, ref_list);
+    cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;

    vector2d_t mv = { 0, 0 };
    {
@ -1013,8 +1021,13 @@ static int search_cu_inter(const encoder_state_t * const state, int x, int y, in
      int mid_y_cu = (y + (LCU_WIDTH >> (depth+1))) / 8;
      cu_info_t *ref_cu = &state->global->ref->cu_arrays[ref_idx]->data[mid_x_cu + mid_y_cu * (frame->width_in_lcu << MAX_DEPTH)];
      if (ref_cu->type == CU_INTER) {
-        mv.x = ref_cu->inter.mv[0];
-        mv.y = ref_cu->inter.mv[1];
+        if (ref_cu->inter.mv_dir & 1) {
+          mv.x = ref_cu->inter.mv[0][0];
+          mv.y = ref_cu->inter.mv[0][1];
+        } else {
+          mv.x = ref_cu->inter.mv[1][0];
+          mv.y = ref_cu->inter.mv[1][1];
+        }
      }
    }

@ -1038,9 +1051,10 @@ static int search_cu_inter(const encoder_state_t * const state, int x, int y, in
    merged = 0;
    // Check every candidate to find a match
    for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
-      if (merge_cand[merge_idx][0] == mv.x &&
-          merge_cand[merge_idx][1] == mv.y &&
-          (uint32_t)merge_cand[merge_idx][2] == ref_idx) {
+      if (merge_cand[merge_idx].dir != 3 &&
+          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x &&
+          merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y &&          
+          (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
        merged = 1;
        break;
      }
@ -1068,20 +1082,140 @@ static int search_cu_inter(const encoder_state_t * const state, int x, int y, in
    mvd.y = mv.y - mv_cand[cu_mv_cand][1];

    if(temp_cost < cur_cu->inter.cost) {
+
+      // Map reference index to L0/L1 pictures
+      cur_cu->inter.mv_dir = ref_list+1;
+      cur_cu->inter.mv_ref_coded[ref_list] = state->global->refmap[ref_idx].idx;
+
      cur_cu->merged        = merged;
      cur_cu->merge_idx     = merge_idx;
-      cur_cu->inter.mv_ref  = ref_idx;
-      cur_cu->inter.mv_dir  = 1;
-      cur_cu->inter.mv[0]   = (int16_t)mv.x;
-      cur_cu->inter.mv[1]   = (int16_t)mv.y;
-      cur_cu->inter.mvd[0]  = (int16_t)mvd.x;
-      cur_cu->inter.mvd[1]  = (int16_t)mvd.y;
+      cur_cu->inter.mv_ref[ref_list] = ref_idx;
+      cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
+      cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
+      cur_cu->inter.mvd[ref_list][0] = (int16_t)mvd.x;
+      cur_cu->inter.mvd[ref_list][1] = (int16_t)mvd.y;
      cur_cu->inter.cost    = temp_cost;
-      cur_cu->inter.bitcost = temp_bitcost + ref_idx;
-      cur_cu->inter.mv_cand = cu_mv_cand;
+      cur_cu->inter.bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[ref_list];
+      cur_cu->inter.mv_cand[ref_list] = cu_mv_cand;
    }
  }

+  // Search bi-pred positions
+  if (state->global->slicetype == SLICE_B && state->encoder_control->cfg->bipred) {
+    lcu_t *templcu = MALLOC(lcu_t, 1);
+    cost_pixel_nxn_func *satd = pixels_get_satd_func(LCU_WIDTH >> depth);
+    #define NUM_PRIORITY_LIST 12 // number of entries in priorityList0/priorityList1; no trailing ';' so it is usable in expressions
+    static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
+    static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
+    uint8_t cutoff = num_cand;
+    for (int32_t idx = 0; idx<cutoff*(cutoff - 1); idx++) {
+      uint8_t i = priorityList0[idx];
+      uint8_t j = priorityList1[idx];
+      if (i >= num_cand || j >= num_cand) break;
+
+      // Find one L0 and L1 candidate according to the priority list
+      if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) {
+        if (merge_cand[i].ref[0] != merge_cand[j].ref[1] ||
+          merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] ||
+          merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) {
+          uint32_t bitcost[2];
+          uint32_t cost = 0;
+          int8_t cu_mv_cand = 0;
+          int16_t mv[2][2];
+          pixel_t tmp_block[64 * 64];
+          pixel_t tmp_pic[64 * 64];
+          // Force L0 and L1 references
+          if (state->global->refmap[merge_cand[i].ref[0]].list == 2 || state->global->refmap[merge_cand[j].ref[1]].list == 1) continue;
+
+          // TODO: enable fractional pixel bipred search
+          mv[0][0] = merge_cand[i].mv[0][0] & 0xfff8;
+          mv[0][1] = merge_cand[i].mv[0][1] & 0xfff8;
+          mv[1][0] = merge_cand[j].mv[1][0] & 0xfff8;
+          mv[1][1] = merge_cand[j].mv[1][1] & 0xfff8;
+
+          inter_recon_lcu_bipred(state, state->global->ref->images[merge_cand[i].ref[0]], state->global->ref->images[merge_cand[j].ref[1]], x, y, LCU_WIDTH >> depth, mv, templcu);
+
+          for (int ypos = 0; ypos < LCU_WIDTH >> depth; ++ypos) {
+            int dst_y = ypos*(LCU_WIDTH >> depth);
+            for (int xpos = 0; xpos < (LCU_WIDTH >> depth); ++xpos) {
+              tmp_block[dst_y + xpos] = templcu->rec.y[((y + ypos)&(LCU_WIDTH - 1))*LCU_WIDTH + ((x + xpos)&(LCU_WIDTH - 1))];              
+              tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width];
+            }
+          }
+
+          cost = satd(tmp_pic, tmp_block);
+
+          // TODO: enable fractional pixel bipred search. Cost the truncated int16_t mv[][] values: masking inline with 0xfff8 yields a positive int for negative vectors, and the L1 vector must be candidate j's.
+          cost += calc_mvd_cost(state, mv[0][0], mv[0][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[0]);
+          cost += calc_mvd_cost(state, mv[1][0], mv[1][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[1]);
+
+          if (cost < cur_cu->inter.cost) {
+
+            cur_cu->inter.mv_dir = 3;
+            cur_cu->inter.mv_ref_coded[0] = state->global->refmap[merge_cand[i].ref[0]].idx;
+            cur_cu->inter.mv_ref_coded[1] = state->global->refmap[merge_cand[j].ref[1]].idx;
+
+
+
+            cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
+            cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
+
+            // TODO: enable fractional pixel bipred search
+            cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0] & 0xfff8;
+            cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1] & 0xfff8;
+            cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0] & 0xfff8;
+            cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1] & 0xfff8;
+            cur_cu->merged = 0;
+                        
+            // Check every candidate to find a match (uni-directional candidates also carry both lists' fields, so dir must be checked)
+            for(int merge_idx = 0; merge_idx < num_cand; merge_idx++) {
+              if (merge_cand[merge_idx].dir == 3 && // only a bi-pred candidate can stand in for a bi-pred CU
+                  merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
+                  merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
+                  merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
+                  merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
+                  merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
+                  merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) {
+                cur_cu->merged = 1;
+                cur_cu->merge_idx = merge_idx;
+                break;
+              }
+            }
+
+            // Each motion vector has its own candidate
+            for (int reflist = 0; reflist < 2; reflist++) {
+              cu_mv_cand = 0;
+              inter_get_mv_cand(state, x, y, depth, mv_cand, cur_cu, lcu, reflist);
+              if ((mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
+                vector2d_t mvd_temp1, mvd_temp2;
+                int cand1_cost, cand2_cost;
+
+                mvd_temp1.x = cur_cu->inter.mv[reflist][0] - mv_cand[0][0];
+                mvd_temp1.y = cur_cu->inter.mv[reflist][1] - mv_cand[0][1];
+                cand1_cost = get_mvd_coding_cost(&mvd_temp1);
+
+                mvd_temp2.x = cur_cu->inter.mv[reflist][0] - mv_cand[1][0];
+                mvd_temp2.y = cur_cu->inter.mv[reflist][1] - mv_cand[1][1];
+                cand2_cost = get_mvd_coding_cost(&mvd_temp2);
+
+                // Select candidate 1 if it has lower cost
+                if (cand2_cost < cand1_cost) {
+                  cu_mv_cand = 1;                  
+                }
+              }
+              cur_cu->inter.mvd[reflist][0] = cur_cu->inter.mv[reflist][0] - mv_cand[cu_mv_cand][0];
+              cur_cu->inter.mvd[reflist][1] = cur_cu->inter.mv[reflist][1] - mv_cand[cu_mv_cand][1];
+              cur_cu->inter.mv_cand[reflist] = cu_mv_cand;
+            }
+            cur_cu->inter.cost = cost;
+            cur_cu->inter.bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[0] + cur_cu->inter.mv_ref_coded[1];
+          }
+        }
+      }
+    }
+    FREE_POINTER(templcu);
+  }
+
  return cur_cu->inter.cost;
 }

@ -2388,7 +2522,12 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
      int tr_depth = depth > 0 ? depth : 1;
      lcu_set_trdepth(&work_tree[depth], x, y, depth, tr_depth);

-      inter_recon_lcu(state, state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]);
+      if (cur_cu->inter.mv_dir == 3) {
+        inter_recon_lcu_bipred(state, state->global->ref->images[cur_cu->inter.mv_ref[0]], state->global->ref->images[cur_cu->inter.mv_ref[1]], x, y, LCU_WIDTH >> depth, cur_cu->inter.mv, &work_tree[depth]);
+      } else {
+        inter_recon_lcu(state, state->global->ref->images[cur_cu->inter.mv_ref[cur_cu->inter.mv_dir - 1]], x, y, LCU_WIDTH >> depth, cur_cu->inter.mv[cur_cu->inter.mv_dir - 1], &work_tree[depth]);
+      }
+
      quantize_lcu_luma_residual(state, x, y, depth, NULL, &work_tree[depth]);
      quantize_lcu_chroma_residual(state, x, y, depth, NULL, &work_tree[depth]);