From 6c4f2d196a2ac333e80c75478c8db671750bfba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Sun, 21 Aug 2016 12:27:58 +0900
Subject: [PATCH 01/12] Move fields from encoder_state_t to frame

Moves fields prepared and frame_done from encoder_state_t to
encoder_state_config_frame_t. The frame_done field is renamed to done
and both fields change type from int to bool.
---
 src/encoder_state-ctors_dtors.c |  5 +++--
 src/encoderstate.c              |  8 ++++----
 src/encoderstate.h              | 26 +++++++++++++-------------
 src/kvazaar.c                   | 10 +++++-----
 4 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c
index 24b7add2..e53ac4e4 100644
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -48,8 +48,11 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
   state->frame->poc = 0;
   state->frame->total_bits_coded = 0;
   state->frame->cur_gop_bits_coded = 0;
+  state->frame->prepared = 0;
+  state->frame->done = 1;
   state->frame->rc_alpha = 3.2003;
   state->frame->rc_beta = -1.367;
+
   return 1;
 }
 
@@ -303,8 +306,6 @@ int kvz_encoder_state_init(encoder_state_t * const child_state, encoder_state_t
   child_state->children[0].encoder_control = NULL;
   child_state->tqj_bitstream_written = NULL;
   child_state->tqj_recon_done = NULL;
-  child_state->prepared = 0;
-  child_state->frame_done = 1;
   
   if (!parent_state) {
     const encoder_control_t * const encoder = child_state->encoder_control;
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 172c101a..bd1cbf99 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -967,7 +967,7 @@ void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame)
     assert(!state->tqj_bitstream_written);
     state->tqj_bitstream_written = job;
   }
-  state->frame_done = 0;
+  state->frame->done = 0;
   //kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
 }
 
@@ -985,7 +985,7 @@ void kvz_encoder_prepare(encoder_state_t *state)
   const encoder_control_t * const encoder = state->encoder_control;
 
   // The previous frame must be done before the next one is started.
-  assert(state->frame_done);
+  assert(state->frame->done);
 
   if (state->frame->num == -1) {
     // We're at the first frame, so don't care about all this stuff.
@@ -993,7 +993,7 @@ void kvz_encoder_prepare(encoder_state_t *state)
     state->frame->poc   = 0;
     assert(!state->tile->frame->source);
     assert(!state->tile->frame->rec);
-    state->prepared = 1;
+    state->frame->prepared = 1;
     return;
   }
 
@@ -1034,7 +1034,7 @@ void kvz_encoder_prepare(encoder_state_t *state)
   state->frame->num = prev_state->frame->num + 1;
   state->frame->poc   = prev_state->frame->poc   + 1;
 
-  state->prepared = 1;
+  state->frame->prepared = 1;
 }
 
 coeff_scan_order_t kvz_get_scan_order(int8_t cu_type, int intra_mode, int depth)
diff --git a/src/encoderstate.h b/src/encoderstate.h
index de495a98..321f8137 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -88,6 +88,19 @@ typedef struct encoder_state_config_frame_t {
   double rc_alpha;
   double rc_beta;
 
+  /**
+   * \brief Indicates that this encoder state is ready for encoding the
+   * next frame i.e. kvz_encoder_prepare has been called.
+   */
+  bool prepared;
+
+  /**
+   * \brief Indicates that the previous frame has been encoded and the
+   * encoded data written and the encoding the next frame has not been
+   * started yet.
+   */
+  bool done;
+
 } encoder_state_config_frame_t;
 
 typedef struct encoder_state_config_tile_t {
@@ -185,19 +198,6 @@ typedef struct encoder_state_t {
   bitstream_t stream;
   cabac_data_t cabac;
 
-  /**
-   * \brief Indicates that this encoder state is ready for encoding the
-   * next frame i.e. kvz_encoder_prepare has been called.
-   */
-  int prepared;
-
-  /**
-   * \brief Indicates that the previous frame has been encoded and the
-   * encoded data written and the encoding the next frame has not been
-   * started yet.
-   */
-  int frame_done;
-
   uint32_t stats_bitstream_length; //Bitstream length written in bytes
   
   //Jobs to wait for
diff --git a/src/kvazaar.c b/src/kvazaar.c
index d5d3dcac..b18e18ac 100644
--- a/src/kvazaar.c
+++ b/src/kvazaar.c
@@ -213,7 +213,7 @@ static int kvazaar_encode(kvz_encoder *enc,
 
   encoder_state_t *state = &enc->states[enc->cur_state_num];
 
-  if (!state->prepared) {
+  if (!state->frame->prepared) {
     kvz_encoder_prepare(state);
   }
 
@@ -235,13 +235,13 @@ static int kvazaar_encode(kvz_encoder *enc,
     return 1;
   }
 
-  if (!state->frame_done) {
+  if (!state->frame->done) {
     // We started encoding a frame; move to the next encoder state.
     enc->cur_state_num = (enc->cur_state_num + 1) % (enc->num_encoder_states);
   }
 
   encoder_state_t *output_state = &enc->states[enc->out_state_num];
-  if (!output_state->frame_done &&
+  if (!output_state->frame->done &&
       (pic_in == NULL || enc->cur_state_num == enc->out_state_num)) {
 
     kvz_threadqueue_waitfor(enc->control->threadqueue, output_state->tqj_bitstream_written);
@@ -256,8 +256,8 @@ static int kvazaar_encode(kvz_encoder *enc,
     if (src_out) *src_out = kvz_image_copy_ref(output_state->tile->frame->source);
     if (info_out) set_frame_info(info_out, output_state);
 
-    output_state->frame_done = 1;
-    output_state->prepared = 0;
+    output_state->frame->done = 1;
+    output_state->frame->prepared = 0;
     enc->frames_done += 1;
 
     enc->out_state_num = (enc->out_state_num + 1) % (enc->num_encoder_states);

From 435c3873577d5cfaedbfa63525a0747b75073508 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 14 Sep 2016 12:52:56 +0900
Subject: [PATCH 02/12] Refactor rate control

- Defines MIN_LAMBDA and MAX_LAMBDA constants.
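- Moves resetting state->frame->cur_gop_bits_coded from
  encoder_state-bitstream.c to pic_allocate_bits in rate_control.c.
- Moves the new-GOP check from kvz_select_picture_lambda to
  pic_allocate_bits.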
- Changes gop_allocate_bits to return the number of bits allocated like
  pic_allocate_bits does.
---
 src/encoder_state-bitstream.c |  6 +---
 src/rate_control.c            | 61 ++++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 35 deletions(-)

diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 9fe424f6..7de90bb4 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -972,11 +972,7 @@ static void encoder_state_write_bitstream_main(encoder_state_t * const state)
   }
   state->frame->total_bits_coded += newpos - curpos;
 
-  if (encoder->cfg->gop_len > 0 && state->frame->gop_offset > 0) {
-    state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded;
-  } else {
-    state->frame->cur_gop_bits_coded = 0;
-  }
+  state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded;
   state->frame->cur_gop_bits_coded += newpos - curpos;
 }
 
diff --git a/src/rate_control.c b/src/rate_control.c
index 07173198..6e86f3f1 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -27,6 +27,8 @@
 
 
 static const int SMOOTHING_WINDOW = 40;
+static const double MIN_LAMBDA    = 0.1;
+static const double MAX_LAMBDA    = 10000;
 
 /**
  * \brief Update alpha and beta parameters.
@@ -45,7 +47,7 @@ static void update_rc_parameters(encoder_state_t * state)
   const double alpha_old = state->frame->rc_alpha;
   const double beta_old = state->frame->rc_beta;
   // lambda computed from real bpp
-  const double lambda_comp = CLIP(0.1, 10000, alpha_old * pow(bpp, beta_old));
+  const double lambda_comp = CLIP(MIN_LAMBDA, MAX_LAMBDA, alpha_old * pow(bpp, beta_old));
   // lambda used in encoding
   const double lambda_real = state->frame->cur_lambda_cost;
   const double lambda_log_ratio = log(lambda_real) - log(lambda_comp);
@@ -59,13 +61,10 @@ static void update_rc_parameters(encoder_state_t * state)
 
 /**
  * \brief Allocate bits for the current GOP.
- * \param state the main encoder state
- *
- * If GOPs are not used, allocates bits for a single picture.
- *
- * Sets the cur_gop_target_bits of the encoder state.
+ * \param state   the main encoder state
+ * \return        target number of bits
  */
-static void gop_allocate_bits(encoder_state_t * const state)
+static double gop_allocate_bits(encoder_state_t * const state)
 {
   const encoder_control_t * const encoder = state->encoder_control;
 
@@ -83,21 +82,35 @@ static void gop_allocate_bits(encoder_state_t * const state)
     pictures_coded -= gop_offset + 1;
   }
 
+  // Equation 12 from https://doi.org/10.1109/TIP.2014.2336550
   double gop_target_bits =
     (encoder->target_avg_bppic * (pictures_coded + SMOOTHING_WINDOW) - bits_coded)
     * MAX(1, encoder->cfg->gop_len) / SMOOTHING_WINDOW;
-  state->frame->cur_gop_target_bits = MAX(200, gop_target_bits);
+  // Allocate at least 200 bits for each GOP like HM does.
+  return MAX(200, gop_target_bits);
 }
 
 /**
  * Allocate bits for the current picture.
- * \param state the main encoder state
- * \return target number of bits
+ * \param state   the main encoder state
+ * \return        target number of bits
  */
-static double pic_allocate_bits(const encoder_state_t * const state)
+static double pic_allocate_bits(encoder_state_t * const state)
 {
   const encoder_control_t * const encoder = state->encoder_control;
 
+  if (encoder->cfg->gop_len == 0 ||
+      state->frame->gop_offset == 0 ||
+      state->frame->num == 0)
+  {
+    // A new GOP starts at this frame.
+    state->frame->cur_gop_target_bits = gop_allocate_bits(state);
+    state->frame->cur_gop_bits_coded  = 0;
+  } else {
+    state->frame->cur_gop_target_bits =
+      state->previous_encoder_state->frame->cur_gop_target_bits;
+  }
+
   if (encoder->cfg->gop_len <= 0) {
     return state->frame->cur_gop_target_bits;
   }
@@ -105,13 +118,14 @@ static double pic_allocate_bits(const encoder_state_t * const state)
   const double pic_weight = encoder->gop_layer_weights[
     encoder->cfg->gop[state->frame->gop_offset].layer - 1];
   double pic_target_bits = state->frame->cur_gop_target_bits * pic_weight;
+  // Allocate at least 100 bits for each picture like HM does.
   return MAX(100, pic_target_bits);
 }
 
 /**
  * \brief Select a lambda value for encoding the next picture
- * \param state the main encoder state
- * \return lambda for the next picture
+ * \param state   the main encoder state
+ * \return        lambda for the next picture
  *
  * Rate control must be enabled (i.e. cfg->target_bitrate > 0) when this
  * function is called.
@@ -127,24 +141,11 @@ double kvz_select_picture_lambda(encoder_state_t * const state)
     update_rc_parameters(state);
   }
 
-  if (encoder->cfg->gop_len == 0 ||
-      state->frame->gop_offset == 0 ||
-      state->frame->num == 0)
-  {
-    // A new GOP begins at this frame.
-    gop_allocate_bits(state);
-  } else {
-    state->frame->cur_gop_target_bits =
-      state->previous_encoder_state->frame->cur_gop_target_bits;
-  }
-
   // TODO: take the picture headers into account
-  const double target_bits_current_picture = pic_allocate_bits(state);
-  const double target_bits_per_pixel =
-    target_bits_current_picture / encoder->in.pixels_per_pic;
-  const double lambda =
-    state->frame->rc_alpha * pow(target_bits_per_pixel, state->frame->rc_beta);
-  return CLIP(0.1, 10000, lambda);
+  const double pic_target_bits = pic_allocate_bits(state);
+  const double target_bpp = pic_target_bits / encoder->in.pixels_per_pic;
+  const double lambda = state->frame->rc_alpha * pow(target_bpp, state->frame->rc_beta);
+  return CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda);
 }
 
 int8_t kvz_lambda_to_QP(const double lambda)

From 640ff94ecd3dd54db515a819613ebb490e9bd1be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Sun, 21 Aug 2016 13:16:59 +0900
Subject: [PATCH 03/12] Use separate lambda and QP for each LCU

Adds fields lambda, lambda_sqrt and qp to encoder_state_t. Drops field
cur_lambda_cost_sqrt from encoder_state_config_frame_t and renames
cur_lambda_cost to lambda. Renames kvz_lambda_to_QP to kvz_lambda_to_qp.
---
 src/encoderstate.c                     | 11 +++++---
 src/encoderstate.h                     | 36 ++++++++++++++++++++------
 src/filter.c                           |  4 +--
 src/rate_control.c                     |  4 +--
 src/rate_control.h                     |  2 +-
 src/rdo.c                              | 30 ++++++++++-----------
 src/sao.c                              | 12 ++++-----
 src/search.c                           | 22 ++++++++--------
 src/search_inter.c                     |  2 +-
 src/search_intra.c                     | 14 +++++-----
 src/strategies/avx2/quant-avx2.c       |  4 +--
 src/strategies/generic/quant-generic.c |  4 +--
 src/transform.c                        |  2 +-
 13 files changed, 85 insertions(+), 62 deletions(-)

diff --git a/src/encoderstate.c b/src/encoderstate.c
index bd1cbf99..25daa1b8 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -201,7 +201,11 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
   encoder_state_t *state = lcu->encoder_state;
   const encoder_control_t * const encoder = state->encoder_control;
   videoframe_t* const frame = state->tile->frame;
-  
+
+  state->lambda      = state->frame->lambda;
+  state->lambda_sqrt = sqrt(state->frame->lambda);
+  state->qp          = state->frame->QP;
+
   //This part doesn't write to bitstream, it's only search, deblock and sao
   
   kvz_search_lcu(state, lcu->position_px.x, lcu->position_px.y, state->tile->hor_buf_search, state->tile->ver_buf_search);
@@ -902,7 +906,7 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
   if (cfg->target_bitrate > 0) {
     // Rate control enabled.
     lambda = kvz_select_picture_lambda(state);
-    state->frame->QP = kvz_lambda_to_QP(lambda);
+    state->frame->QP = kvz_lambda_to_qp(lambda);
   } else {
     if (cfg->gop_len > 0 && state->frame->slicetype != KVZ_SLICE_I) {
       kvz_gop_config const * const gop =
@@ -914,8 +918,7 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
     }
     lambda = kvz_select_picture_lambda_from_qp(state);
   }
-  state->frame->cur_lambda_cost = lambda;
-  state->frame->cur_lambda_cost_sqrt = sqrt(lambda);
+  state->frame->lambda = lambda;
 
   encoder_state_init_children(state);
 }
diff --git a/src/encoderstate.h b/src/encoderstate.h
index 321f8137..a2ee3cc2 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -51,16 +51,29 @@ typedef enum {
 
 
 typedef struct encoder_state_config_frame_t {
-  double cur_lambda_cost; //!< \brief Lambda for SSE
-  double cur_lambda_cost_sqrt; //!< \brief Lambda for SAD and SATD
-  
+  /**
+   * \brief Frame-level lambda.
+   *
+   * Use state->lambda or state->lambda_sqrt for cost computations.
+   *
+   * \see encoder_state_t::lambda
+   * \see encoder_state_t::lambda_sqrt
+   */
+  double lambda;
+
   int32_t num;       /*!< \brief Frame number */
   int32_t poc;       /*!< \brief Picture order count */
   int8_t gop_offset; /*!< \brief Offset in the gop structure */
-  
-  int8_t QP;   //!< \brief Quantization parameter
-  double QP_factor; //!< \brief Quantization factor
-  
+
+  /**
+   * \brief Frame-level quantization parameter
+   *
+   * \see encoder_state_t::qp
+   */
+  int8_t QP;
+  //! \brief quantization factor
+  double QP_factor;
+
   //Current picture available references
   image_list_t *ref;
   int8_t ref_list;
@@ -199,7 +212,14 @@ typedef struct encoder_state_t {
   cabac_data_t cabac;
 
   uint32_t stats_bitstream_length; //Bitstream length written in bytes
-  
+
+  //! \brief Lambda for SSE
+  double lambda;
+  //! \brief Lambda for SAD and SATD
+  double lambda_sqrt;
+  //! \brief Quantization parameter for the current LCU
+  int8_t qp;
+
   //Jobs to wait for
   threadqueue_job_t * tqj_recon_done; //Reconstruction is done
   threadqueue_job_t * tqj_bitstream_written; //Bitstream is written
diff --git a/src/filter.c b/src/filter.c
index d0fa01bf..dce73199 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -291,7 +291,7 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
     kvz_pixel *src = orig_src;
 
     int8_t strength = 0;
-    int32_t qp              = state->frame->QP;
+    int32_t qp              = state->qp;
     int32_t bitdepth_scale  = 1 << (encoder->bitdepth - 8);
     int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
     int32_t beta            = kvz_g_beta_table_8x8[b_index] * bitdepth_scale;
@@ -490,7 +490,7 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state,
     };
     int8_t strength = 2;
 
-    int32_t QP             = kvz_g_chroma_scale[state->frame->QP];
+    int32_t QP             = kvz_g_chroma_scale[state->qp];
     int32_t bitdepth_scale = 1 << (encoder->bitdepth-8);
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = kvz_g_tc_table_8x8[TC_index]*bitdepth_scale;
diff --git a/src/rate_control.c b/src/rate_control.c
index 6e86f3f1..6e1f0db8 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -49,7 +49,7 @@ static void update_rc_parameters(encoder_state_t * state)
   // lambda computed from real bpp
   const double lambda_comp = CLIP(MIN_LAMBDA, MAX_LAMBDA, alpha_old * pow(bpp, beta_old));
   // lambda used in encoding
-  const double lambda_real = state->frame->cur_lambda_cost;
+  const double lambda_real = state->frame->lambda;
   const double lambda_log_ratio = log(lambda_real) - log(lambda_comp);
 
   const double alpha = alpha_old + 0.1 * lambda_log_ratio * alpha_old;
@@ -148,7 +148,7 @@ double kvz_select_picture_lambda(encoder_state_t * const state)
   return CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda);
 }
 
-int8_t kvz_lambda_to_QP(const double lambda)
+int8_t kvz_lambda_to_qp(const double lambda)
 {
   const int8_t qp = 4.2005 * log(lambda) + 13.7223 + 0.5;
   return CLIP(0, 51, qp);
diff --git a/src/rate_control.h b/src/rate_control.h
index 3c2d4362..f0b2befb 100644
--- a/src/rate_control.h
+++ b/src/rate_control.h
@@ -33,7 +33,7 @@
 
 double kvz_select_picture_lambda(encoder_state_t * const state);
 
-int8_t kvz_lambda_to_QP(const double lambda);
+int8_t kvz_lambda_to_qp(const double lambda);
 
 double kvz_select_picture_lambda_from_qp(encoder_state_t const * const state);
 
diff --git a/src/rdo.c b/src/rdo.c
index b64fb5df..65ca6dd9 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -257,7 +257,7 @@ uint32_t kvz_get_coded_level ( encoder_state_t * const state, double *coded_cost
   cabac_ctx_t* base_sig_model = type?(cabac->ctx.cu_sig_model_chroma):(cabac->ctx.cu_sig_model_luma);
 
   if( !last && max_abs_level < 3 ) {
-    *coded_cost_sig = state->frame->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
+    *coded_cost_sig = state->lambda * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
     *coded_cost     = *coded_cost0 + *coded_cost_sig;
     if (max_abs_level == 0) return best_abs_level;
   } else {
@@ -265,13 +265,13 @@ uint32_t kvz_get_coded_level ( encoder_state_t * const state, double *coded_cost
   }
 
   if( !last ) {
-    cur_cost_sig = state->frame->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
+    cur_cost_sig = state->lambda * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
   }
 
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
     double err       = (double)(level_double - ( abs_level << q_bits ) );
-    double cur_cost  = err * err * temp + state->frame->cur_lambda_cost *
+    double cur_cost  = err * err * temp + state->lambda *
                        kvz_get_ic_rate( state, abs_level, ctx_num_one, ctx_num_abs,
                                     abs_go_rice, c1_idx, c2_idx, type);
     cur_cost        += cur_cost_sig;
@@ -308,7 +308,7 @@ static double get_rate_last(const encoder_state_t * const state,
   if( ctx_y > 3 ) {
     uiCost += 32768.0 * ((ctx_y-2)>>1);
   }
-  return state->frame->cur_lambda_cost*uiCost;
+  return state->lambda * uiCost;
 }
 
 static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t height, int8_t type,
@@ -358,7 +358,7 @@ void kvz_rdoq_sign_hiding(const encoder_state_t *const state,
 
   int64_t rd_factor = (int64_t)(
     kvz_g_inv_quant_scales[qp_scaled % 6] * kvz_g_inv_quant_scales[qp_scaled % 6] * (1 << (2 * (qp_scaled / 6)))
-    / state->frame->cur_lambda_cost / 16 / (1 << (2 * (encoder->bitdepth - 8)))
+    / state->lambda / 16 / (1 << (2 * (encoder->bitdepth - 8)))
     + 0.5);
   int32_t lastCG = -1;
   int32_t absSum = 0;
@@ -467,7 +467,7 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
   uint32_t max_num_coeff     = width * height;
   int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
   
   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
 
@@ -669,7 +669,7 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
       if (sig_coeffgroup_flag[cg_blkpos] == 0) {
         uint32_t ctx_sig  = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
                                                         cg_pos_y, width);
-        cost_coeffgroup_sig[cg_scanpos] = state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
+        cost_coeffgroup_sig[cg_scanpos] = state->lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
         base_cost += cost_coeffgroup_sig[cg_scanpos]  - rd_stats.sig_cost;
       } else {
         if (cg_scanpos < cg_last_scanpos){
@@ -686,9 +686,9 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
           ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
             cg_pos_y, width);
 
-          cost_coeffgroup_sig[cg_scanpos] = state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1);
+          cost_coeffgroup_sig[cg_scanpos] = state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1);
           base_cost += cost_coeffgroup_sig[cg_scanpos];
-          cost_zero_cg += state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
+          cost_zero_cg += state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
 
           // try to convert the current coeff group from non-zero to all-zero
           cost_zero_cg += rd_stats.uncoded_dist;          // distortion for resetting non-zero levels to zero levels
@@ -701,7 +701,7 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
             sig_coeffgroup_flag[cg_blkpos] = 0;
             base_cost = cost_zero_cg;
 
-            cost_coeffgroup_sig[cg_scanpos] = state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
+            cost_coeffgroup_sig[cg_scanpos] = state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
 
             // reset coeffs to 0 in this block
             for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
@@ -728,13 +728,13 @@ void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff,
   int32_t best_last_idx_p1 = 0;
 
   if( block_type != CU_INTRA && !type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) {
-    best_cost  = block_uncoded_cost +   state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0);
-    base_cost +=   state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1);
+    best_cost  = block_uncoded_cost +   state->lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0);
+    base_cost +=   state->lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1);
   } else {
     cabac_ctx_t* base_cbf_model = type?(cabac->ctx.qt_cbf_model_chroma):(cabac->ctx.qt_cbf_model_luma);
     ctx_cbf    = ( type ? tr_depth : !tr_depth);
-    best_cost  = block_uncoded_cost +  state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
-    base_cost +=   state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
+    best_cost  = block_uncoded_cost +  state->lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
+    base_cost +=   state->lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
   }
 
   for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
@@ -1006,5 +1006,5 @@ int kvz_calc_mvd_cost_cabac(encoder_state_t * const state, int x, int y, int mv_
   *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
 
   // Store bitcost before restoring cabac
-  return *bitcost * (int32_t)(state->frame->cur_lambda_cost_sqrt + 0.5);
+  return *bitcost * (int32_t)(state->lambda_sqrt + 0.5);
 }
diff --git a/src/sao.c b/src/sao.c
index 2bafbca5..36a88bd5 100644
--- a/src/sao.c
+++ b/src/sao.c
@@ -501,7 +501,7 @@ static void sao_search_edge_sao(const encoder_state_t * const state,
 
     {
       float mode_bits = sao_mode_bits_edge(state, edge_class, edge_offset, sao_top, sao_left, buf_cnt);
-      sum_ddistortion += (int)((double)mode_bits*state->frame->cur_lambda_cost+0.5);
+      sum_ddistortion += (int)((double)mode_bits*state->lambda +0.5);
     }
     // SAO is not applied for category 0.
     edge_offset[SAO_EO_CAT0] = 0;
@@ -545,7 +545,7 @@ static void sao_search_band_sao(const encoder_state_t * const state, const kvz_p
     }
 
     temp_rate = sao_mode_bits_band(state, sao_out->band_position, temp_offsets, sao_top, sao_left, buf_cnt);
-    ddistortion += (int)((double)temp_rate*state->frame->cur_lambda_cost + 0.5);
+    ddistortion += (int)((double)temp_rate*state->lambda + 0.5);
 
     // Select band sao over edge sao when distortion is lower
     if (ddistortion < sao_out->ddistortion) {
@@ -589,7 +589,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_
 
   {
     float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt);
-    int ddistortion = (int)(mode_bits * state->frame->cur_lambda_cost + 0.5);
+    int ddistortion = (int)(mode_bits * state->lambda + 0.5);
     unsigned buf_i;
     
     for (buf_i = 0; buf_i < buf_cnt; ++buf_i) {
@@ -603,7 +603,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_
 
   {
     float mode_bits = sao_mode_bits_band(state, band_sao.band_position, band_sao.offsets, sao_top, sao_left, buf_cnt);
-    int ddistortion = (int)(mode_bits * state->frame->cur_lambda_cost + 0.5);
+    int ddistortion = (int)(mode_bits * state->lambda + 0.5);
     unsigned buf_i;
     
     for (buf_i = 0; buf_i < buf_cnt; ++buf_i) {
@@ -626,7 +626,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_
   // Choose between SAO and doing nothing, taking into account the
   // rate-distortion cost of coding do nothing.
   {
-    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->frame->cur_lambda_cost + 0.5);
+    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5);
     if (sao_out->ddistortion >= cost_of_nothing) {
       sao_out->type = SAO_TYPE_NONE;
       merge_cost[0] = cost_of_nothing;
@@ -643,7 +643,7 @@ static void sao_search_best_mode(const encoder_state_t * const state, const kvz_
       if (merge_cand) {
         unsigned buf_i;
         float mode_bits = sao_mode_bits_merge(state, i + 1);
-        int ddistortion = (int)(mode_bits * state->frame->cur_lambda_cost + 0.5);
+        int ddistortion = (int)(mode_bits * state->lambda + 0.5);
 
         switch (merge_cand->type) {
           case SAO_TYPE_EDGE:
diff --git a/src/search.c b/src/search.c
index adb1a890..440e7dca 100644
--- a/src/search.c
+++ b/src/search.c
@@ -321,7 +321,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
     sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
     sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
 
-    return sum + tr_tree_bits * state->frame->cur_lambda_cost;
+    return sum + tr_tree_bits * state->lambda;
   }
 
   // Add transform_tree cbf_luma bit cost.
@@ -353,7 +353,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LUMA_MULT + bits * state->frame->cur_lambda_cost;
+  return (double)ssd * LUMA_MULT + bits * state->lambda;
 }
 
 
@@ -398,7 +398,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
     sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
     sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
 
-    return sum + tr_tree_bits * state->frame->cur_lambda_cost;
+    return sum + tr_tree_bits * state->lambda;
   }
 
   // Chroma SSD
@@ -428,7 +428,7 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * CHROMA_MULT + bits * state->frame->cur_lambda_cost;
+  return (double)ssd * CHROMA_MULT + bits * state->lambda;
 }
 
 
@@ -682,7 +682,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       mode_bits = inter_bitcost;
     }
 
-    cost += mode_bits * state->frame->cur_lambda_cost;
+    cost += mode_bits * state->lambda;
   }
   
   // Recursively split all the way to max search depth.
@@ -695,15 +695,15 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       // Add cost of cu_split_flag.
       uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
       const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-      cost += CTX_ENTROPY_FBITS(ctx, 0) * state->frame->cur_lambda_cost;
-      split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->frame->cur_lambda_cost;
+      cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
+      split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;
     }
 
     if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) {
       // Add cost of intra part_size.
       const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]);
-      cost += CTX_ENTROPY_FBITS(ctx, 1) * state->frame->cur_lambda_cost;  // 2Nx2N
-      split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->frame->cur_lambda_cost;  // NxN
+      cost += CTX_ENTROPY_FBITS(ctx, 1) * state->lambda;  // 2Nx2N
+      split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;  // NxN
     }
 
     // If skip mode was selected for the block, skip further search.
@@ -750,11 +750,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // Add the cost of coding no-split.
         uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth);
         const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]);
-        cost += CTX_ENTROPY_FBITS(ctx, 0) * state->frame->cur_lambda_cost;
+        cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
 
         // Add the cost of coding intra mode only once.
         double mode_bits = calc_mode_bits(state, &work_tree[depth], cur_cu, x, y);
-        cost += mode_bits * state->frame->cur_lambda_cost;
+        cost += mode_bits * state->lambda;
       }
     }
 
diff --git a/src/search_inter.c b/src/search_inter.c
index ebd51524..3b81e709 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -253,7 +253,7 @@ static int calc_mvd_cost(encoder_state_t * const state, int x, int y, int mv_shi
     temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost;
   }
   *bitcost = temp_bitcost;
-  return temp_bitcost*(int32_t)(state->frame->cur_lambda_cost_sqrt+0.5);
+  return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5);
 }
 
 
diff --git a/src/search_intra.c b/src/search_intra.c
index 0d63ea05..2e79fc96 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -117,7 +117,7 @@ static double get_cost(encoder_state_t * const state,
       trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
     }
 
-    double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + state->frame->cur_lambda_cost_sqrt * trskip_bits;
+    double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + state->lambda_sqrt * trskip_bits;
     if (sad_cost < satd_cost) {
       return sad_cost;
     }
@@ -164,7 +164,7 @@ static void get_cost_dual(encoder_state_t * const state,
     double sad_costs[PARALLEL_BLKS] = { 0 };
     sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs);
     for (int i = 0; i < PARALLEL_BLKS; ++i) {
-      sad_costs[i] = TRSKIP_RATIO * (double)unsigned_sad_costs[i] + state->frame->cur_lambda_cost_sqrt * trskip_bits;
+      sad_costs[i] = TRSKIP_RATIO * (double)unsigned_sad_costs[i] + state->lambda_sqrt * trskip_bits;
       if (sad_costs[i] < (double)satd_costs[i]) {
         costs_out[i] = sad_costs[i];
       }
@@ -254,7 +254,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
   //     max_depth.
   // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->frame->cur_lambda_cost;
+    split_cost = 3 * state->lambda;
 
     split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
     if (split_cost < nosplit_cost) {
@@ -296,7 +296,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
     }
 
     double bits = tr_split_bit + cbf_bits;
-    split_cost += bits * state->frame->cur_lambda_cost;
+    split_cost += bits * state->lambda;
   } else {
     assert(width <= TR_MAX_WIDTH);
   }
@@ -529,7 +529,7 @@ static int8_t search_intra_rough(encoder_state_t * const state,
 
   // Add prediction mode coding cost as the last thing. We don't want this
   // affecting the halving search.
-  int lambda_cost = (int)(state->frame->cur_lambda_cost_sqrt + 0.5);
+  int lambda_cost = (int)(state->lambda_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
     costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds);
   }
@@ -600,7 +600,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 
   for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
     int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds);
-    costs[rdo_mode] = rdo_bitcost * (int)(state->frame->cur_lambda_cost + 0.5);
+    costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
 
     // Perform transform split search and save mode RD cost for the best one.
     cu_info_t pred_cu;
@@ -701,7 +701,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
       double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
-      chroma.cost += mode_bits * state->frame->cur_lambda_cost;
+      chroma.cost += mode_bits * state->lambda;
 
       if (chroma.cost < best_chroma.cost) {
         best_chroma = chroma;
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index de973122..bb9d5ac4 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -52,7 +52,7 @@ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coe
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
   const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
   const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6];
@@ -457,7 +457,7 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
   int32_t n;
   int32_t transform_shift = 15 - encoder->bitdepth - (kvz_g_convert_to_bit[ width ] + 2);
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth-8)*6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth-8)*6);
 
   shift = 20 - QUANT_SHIFT - transform_shift;
 
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index 75dd127c..69f8ca01 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -41,7 +41,7 @@ void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
   const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
   const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6];
@@ -286,7 +286,7 @@ void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
   int32_t n;
   int32_t transform_shift = 15 - encoder->bitdepth - (kvz_g_convert_to_bit[ width ] + 2);
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth-8)*6);
+  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth-8)*6);
 
   shift = 20 - QUANT_SHIFT - transform_shift;
 
diff --git a/src/transform.c b/src/transform.c
index 6beb3492..4e5bf236 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -232,7 +232,7 @@ int kvz_quantize_residual_trskip(
     int has_coeffs;
   } skip, noskip, *best;
 
-  const int bit_cost = (int)(state->frame->cur_lambda_cost+0.5);
+  const int bit_cost = (int)(state->lambda + 0.5);
   
   noskip.has_coeffs = kvz_quantize_residual(
       state, cur_cu, width, color, scan_order,

From 71633889ce132d55f993dd427776be11117ae31e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Sun, 21 Aug 2016 13:36:13 +0900
Subject: [PATCH 04/12] Enable CU QP delta when using rate control

When rate control is enabled, enable cu_qp_delta_enabled_flag in PPS
with diff_cu_qp_delta_depth set to 0. Also adds code for writing the QP
deltas and a new cabac context.
---
 src/cabac.h                   |  1 +
 src/context.c                 |  9 +++++++++
 src/encode_coding_tree.c      | 28 ++++++++++++++++++++++++++--
 src/encode_coding_tree.h      |  2 +-
 src/encoder_state-bitstream.c | 11 ++++++++---
 src/encoderstate.c            |  7 +++++++
 src/encoderstate.h            | 11 +++++++++++
 src/global.h                  |  1 +
 8 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/src/cabac.h b/src/cabac.h
index 95571c3b..a1779c59 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -60,6 +60,7 @@ typedef struct
     cabac_ctx_t trans_subdiv_model[3]; //!< \brief intra mode context models
     cabac_ctx_t qt_cbf_model_luma[4];
     cabac_ctx_t qt_cbf_model_chroma[4];
+    cabac_ctx_t cu_qp_delta_abs[4];
     cabac_ctx_t part_size_model[4];
     cabac_ctx_t cu_sig_coeff_group_model[4];
     cabac_ctx_t cu_sig_model_luma[27];
diff --git a/src/context.c b/src/context.c
index 1244245c..ada2683e 100644
--- a/src/context.c
+++ b/src/context.c
@@ -121,6 +121,12 @@ static const uint8_t INIT_QT_CBF[3][8] = {
   { 111,  141,  CNU,  CNU,    94,  138,  182,  154 },
 };
 
+static const uint8_t INIT_CU_QP_DELTA_ABS[3][2] = {
+  { 154, 154 },
+  { 154, 154 },
+  { 154, 154 },
+};
+
 static const uint8_t INIT_SIG_CG_FLAG[3][4] = {
   { 121,  140,  61,  154  },
   { 121,  140,  61,  154 },
@@ -243,6 +249,9 @@ void kvz_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
   kvz_ctx_init(&cabac->ctx.mvp_idx_model[0], QP, INIT_MVP_IDX[slice][0]);
   kvz_ctx_init(&cabac->ctx.mvp_idx_model[1], QP, INIT_MVP_IDX[slice][1]);
 
+  kvz_ctx_init(&cabac->ctx.cu_qp_delta_abs[0], QP, INIT_CU_QP_DELTA_ABS[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_qp_delta_abs[1], QP, INIT_CU_QP_DELTA_ABS[slice][1]);
+
   for (i = 0; i < 4; i++) {
     kvz_ctx_init(&cabac->ctx.cu_sig_coeff_group_model[i], QP, INIT_SIG_CG_FLAG[slice][i]);
     kvz_ctx_init(&cabac->ctx.cu_abs_model_luma[i], QP, INIT_ABS_FLAG[slice][i]);
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 42ca1557..f7aa1688 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -517,6 +517,28 @@ static void encode_transform_coeff(encoder_state_t * const state,
   }
 
   if (cb_flag_y | cb_flag_u | cb_flag_v) {
+    if (state->must_code_qp_delta) {
+      const int qp_delta      = state->qp - state->ref_qp;
+      const int qp_delta_abs  = ABS(qp_delta);
+      cabac_data_t* cabac     = &state->cabac;
+
+      // cu_qp_delta_abs prefix
+      cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0];
+      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5);
+
+      if (qp_delta_abs >= 5) {
+        // cu_qp_delta_abs suffix
+        kvz_cabac_write_ep_ex_golomb(state, cabac, qp_delta_abs - 5, 0);
+      }
+
+      if (qp_delta != 0) {
+        CABAC_BIN_EP(cabac, (qp_delta >= 0 ? 0 : 1), "qp_delta_sign_flag");
+      }
+
+      state->must_code_qp_delta = false;
+      state->ref_qp = state->qp;
+    }
+
     encode_transform_unit(state, x_pu, y_pu, depth);
   }
 }
@@ -894,14 +916,16 @@ static void encode_part_mode(encoder_state_t * const state,
 }
 
 void kvz_encode_coding_tree(encoder_state_t * const state,
-                        uint16_t x_ctb, uint16_t y_ctb, uint8_t depth)
+                            uint16_t x_ctb,
+                            uint16_t y_ctb,
+                            uint8_t depth)
 {
   cabac_data_t * const cabac = &state->cabac;
   const videoframe_t * const frame = state->tile->frame;
   const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb);
   uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
   uint8_t split_model = 0;
-  
+
   //Absolute ctb
   uint16_t abs_x_ctb = x_ctb + (state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
   uint16_t abs_y_ctb = y_ctb + (state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h
index b8e2dc10..e284d7c2 100644
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@@ -29,7 +29,7 @@
 #include "encoderstate.h"
 #include "global.h"
 
-void kvz_encode_coding_tree(encoder_state_t *state,
+void kvz_encode_coding_tree(encoder_state_t * const state,
                             uint16_t x_ctb,
                             uint16_t y_ctb,
                             uint8_t depth);
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 7de90bb4..edeef983 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -451,9 +451,14 @@ static void encoder_state_write_bitstream_pic_parameter_set(bitstream_t* stream,
   WRITE_SE(stream, ((int8_t)encoder->cfg->qp) - 26, "pic_init_qp_minus26");
   WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
   WRITE_U(stream, encoder->trskip_enable, 1, "transform_skip_enabled_flag");
-  WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
-  //if cu_qp_delta_enabled_flag
-  //WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
+
+  if (encoder->cfg->target_bitrate > 0) {
+    // Use separate QP for each LCU when rate control is enabled.
+    WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag");
+    WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
+  } else {
+    WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
+  }
 
   //TODO: add QP offsets
   WRITE_SE(stream, 0, "pps_cb_qp_offset");
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 25daa1b8..cf8c54e6 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -255,6 +255,10 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
     encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
   }
   
+
+  // QP delta is not used when rate control is turned off.
+  state->must_code_qp_delta = (state->encoder_control->cfg->target_bitrate > 0);
+
   //Encode coding tree
   kvz_encode_coding_tree(state, lcu->position.x << MAX_DEPTH, lcu->position.y << MAX_DEPTH, 0);
 
@@ -305,6 +309,9 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) {
 	  InitC(state->tile->dbs_g);
 	  state->tile->m_prev_pos = 0;
    }
+
+  state->ref_qp = state->frame->QP;
+
   // Select whether to encode the frame/tile in current thread or to define
   // wavefront jobs for other threads to handle.
   bool wavefront = state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW;
diff --git a/src/encoderstate.h b/src/encoderstate.h
index a2ee3cc2..ab3603b9 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -220,6 +220,17 @@ typedef struct encoder_state_t {
   //! \brief Quantization parameter for the current LCU
   int8_t qp;
 
+  /**
+   * \brief Whether a QP delta value must be coded for the current LCU.
+   */
+  bool must_code_qp_delta;
+
+  /**
+   * \brief Reference QP for computing the QP delta of the LCU that is coded
+   * next. Updated whenever a QP delta is coded.
+   */
+  int8_t ref_qp;
+
   //Jobs to wait for
   threadqueue_job_t * tqj_recon_done; //Reconstruction is done
   threadqueue_job_t * tqj_bitstream_written; //Bitstream is written
diff --git a/src/global.h b/src/global.h
index 0a423911..9da0a74d 100644
--- a/src/global.h
+++ b/src/global.h
@@ -158,6 +158,7 @@ typedef int16_t coeff_t;
 #define MRG_MAX_NUM_CANDS 5
 
 /* Some tools */
+#define ABS(a) ((a) >= 0 ? (a) : (-(a))) /* absolute value; evaluates a twice */
 #define MAX(a,b) (((a)>(b))?(a):(b))
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define CLIP(low,high,value) MAX((low),MIN((high),(value)))

From 2a4243acbe08edad4028e61d86577ceb5d36f77c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Sun, 21 Aug 2016 14:03:57 +0900
Subject: [PATCH 05/12] Refactor rate control

Moves all code related to setting QP and lambda values to rate_control
module.
---
 src/encoderstate.c |  23 ++-------
 src/rate_control.c | 116 +++++++++++++++++++++++----------------------
 src/rate_control.h |   7 +--
 3 files changed, 65 insertions(+), 81 deletions(-)

diff --git a/src/encoderstate.c b/src/encoderstate.c
index cf8c54e6..31212ebe 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -202,9 +202,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
   const encoder_control_t * const encoder = state->encoder_control;
   videoframe_t* const frame = state->tile->frame;
 
-  state->lambda      = state->frame->lambda;
-  state->lambda_sqrt = sqrt(state->frame->lambda);
-  state->qp          = state->frame->QP;
+  kvz_set_lcu_lambda_and_qp(state);
 
   //This part doesn't write to bitstream, it's only search, deblock and sao
   
@@ -909,23 +907,8 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
 
   encoder_state_remove_refs(state);
   encoder_state_ref_sort(state);
-  double lambda;
-  if (cfg->target_bitrate > 0) {
-    // Rate control enabled.
-    lambda = kvz_select_picture_lambda(state);
-    state->frame->QP = kvz_lambda_to_qp(lambda);
-  } else {
-    if (cfg->gop_len > 0 && state->frame->slicetype != KVZ_SLICE_I) {
-      kvz_gop_config const * const gop =
-        cfg->gop + state->frame->gop_offset;
-      state->frame->QP = cfg->qp + gop->qp_offset;
-      state->frame->QP_factor = gop->qp_factor;
-    } else {
-      state->frame->QP = cfg->qp;
-    }
-    lambda = kvz_select_picture_lambda_from_qp(state);
-  }
-  state->frame->lambda = lambda;
+
+  kvz_set_picture_lambda_and_qp(state);
 
   encoder_state_init_children(state);
 }
diff --git a/src/rate_control.c b/src/rate_control.c
index 6e1f0db8..5be244ae 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -122,73 +122,77 @@ static double pic_allocate_bits(encoder_state_t * const state)
   return MAX(100, pic_target_bits);
 }
 
-/**
- * \brief Select a lambda value for encoding the next picture
- * \param state   the main encoder state
- * \return        lambda for the next picture
- *
- * Rate control must be enabled (i.e. cfg->target_bitrate > 0) when this
- * function is called.
- */
-double kvz_select_picture_lambda(encoder_state_t * const state)
-{
-  const encoder_control_t * const encoder = state->encoder_control;
-
-  assert(encoder->cfg->target_bitrate > 0);
-
-  if (state->frame->num > encoder->owf) {
-    // At least one frame has been written.
-    update_rc_parameters(state);
-  }
-
-  // TODO: take the picture headers into account
-  const double pic_target_bits = pic_allocate_bits(state);
-  const double target_bpp = pic_target_bits / encoder->in.pixels_per_pic;
-  const double lambda = state->frame->rc_alpha * pow(target_bpp, state->frame->rc_beta);
-  return CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda);
-}
-
-int8_t kvz_lambda_to_qp(const double lambda)
+int8_t lambda_to_qp(const double lambda)
 {
   const int8_t qp = 4.2005 * log(lambda) + 13.7223 + 0.5;
   return CLIP(0, 51, qp);
 }
 
 /**
- * \brief Select a lambda value according to current QP value
+ * \brief Allocate bits and set lambda and QP for the current picture.
  * \param state the main encoder state
- * \return lambda for the next picture
- *
- * This function should be used to select lambda when rate control is
- * disabled.
  */
-double kvz_select_picture_lambda_from_qp(encoder_state_t const * const state)
+void kvz_set_picture_lambda_and_qp(encoder_state_t * const state)
 {
-  const int gop_len = state->encoder_control->cfg->gop_len;
-  const int intra_period = state->encoder_control->cfg->intra_period;
-  const int keyframe_period = gop_len > 0 ? gop_len : intra_period;
-  
-  double lambda = pow(2.0, (state->frame->QP - 12) / 3.0);
+  const encoder_control_t * const ctrl = state->encoder_control;
 
-  if (state->frame->slicetype == KVZ_SLICE_I) {
-    lambda *= 0.57;
-    
-    // Reduce lambda for I-frames according to the number of references.
-    if (keyframe_period == 0) {
-      lambda *= 0.5;
-    } else {
-      lambda *= 1.0 - CLIP(0.0, 0.5, 0.05 * (keyframe_period - 1));
+  if (ctrl->cfg->target_bitrate > 0) {
+    // Rate control enabled
+
+    if (state->frame->num > ctrl->owf) {
+      // At least one frame has been written.
+      update_rc_parameters(state);
     }
-  } else if (gop_len > 0) {
-    lambda *= state->frame->QP_factor;
-  } else {
-    lambda *= 0.4624;
-  }
 
-  // Increase lambda if not key-frame.
-  if (keyframe_period > 0 && state->frame->poc % keyframe_period != 0) {
-    lambda *= CLIP(2.0, 4.0, (state->frame->QP - 12) / 6.0);
+    // TODO: take the picture headers into account
+    const double pic_target_bits = pic_allocate_bits(state);
+    const double target_bpp = pic_target_bits / ctrl->in.pixels_per_pic;
+    const double lambda = state->frame->rc_alpha * pow(target_bpp, state->frame->rc_beta);
+    state->frame->lambda = CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda);
+    state->frame->QP = lambda_to_qp(lambda);
+
+  } else {
+    // Rate control disabled
+    kvz_gop_config const * const gop = &ctrl->cfg->gop[state->frame->gop_offset];
+    const int gop_len = ctrl->cfg->gop_len;
+    const int period  = gop_len > 0 ? gop_len : ctrl->cfg->intra_period;
+
+    state->frame->QP = ctrl->cfg->qp;
+
+    if (gop_len > 0 && state->frame->slicetype != KVZ_SLICE_I) {
+      state->frame->QP += gop->qp_offset;
+    }
+
+    double lambda = pow(2.0, (state->frame->QP - 12) / 3.0);
+
+    if (state->frame->slicetype == KVZ_SLICE_I) {
+      lambda *= 0.57;
+
+      // Reduce lambda for I-frames according to the number of references.
+      if (period == 0) {
+        lambda *= 0.5;
+      } else {
+        lambda *= 1.0 - CLIP(0.0, 0.5, 0.05 * (period - 1));
+      }
+    } else if (gop_len > 0) {
+      lambda *= gop->qp_factor;
+
+    } else {
+      lambda *= 0.4624;
+    }
+
+    // Increase lambda if not key-frame.
+    if (period > 0 && state->frame->poc % period != 0) {
+      lambda *= CLIP(2.0, 4.0, (state->frame->QP - 12) / 6.0);
+    }
+
+    state->frame->lambda = lambda;
   }
-  
-  return lambda;
+}
+
+void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state)
+{
+  state->lambda      = state->frame->lambda;
+  state->lambda_sqrt = sqrt(state->frame->lambda);
+  state->qp          = state->frame->QP;
 }
diff --git a/src/rate_control.h b/src/rate_control.h
index f0b2befb..b1e9281d 100644
--- a/src/rate_control.h
+++ b/src/rate_control.h
@@ -30,11 +30,8 @@
 
 #include "encoderstate.h"
 
+void kvz_set_picture_lambda_and_qp(encoder_state_t * const state);
 
-double kvz_select_picture_lambda(encoder_state_t * const state);
-
-int8_t kvz_lambda_to_qp(const double lambda);
-
-double kvz_select_picture_lambda_from_qp(encoder_state_t const * const state);
+void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state);
 
 #endif // RATE_CONTROL_H_

From ff5e5ec6d42f07ef31af65df83965c5c6fcd0800 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 24 Aug 2016 10:16:48 +0900
Subject: [PATCH 06/12] Record info about coded LCUs

Adds field lcu_stats to encoder_state_config_frame_t. The following data
is recorded for each LCU:
    - number of bits
    - squared cost
    - used lambda value
    - alpha parameter used for rate control
    - beta parameter used for rate control
---
 src/encoder_state-ctors_dtors.c |  5 +++++
 src/encoderstate.c              | 33 +++++++++++++++++++++++++++++++--
 src/encoderstate.h              | 26 ++++++++++++++++++++++++++
 src/rate_control.c              |  8 +++++++-
 src/rate_control.h              |  3 ++-
 src/search.c                    |  5 ++++-
 6 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c
index e53ac4e4..49d438b9 100644
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -53,11 +53,16 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
   state->frame->rc_alpha = 3.2003;
   state->frame->rc_beta = -1.367;
 
+  const encoder_control_t * const encoder = state->encoder_control;
+  const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu;
+  state->frame->lcu_stats = MALLOC(lcu_stats_t, num_lcus);
+
   return 1;
 }
 
 static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
   kvz_image_list_destroy(state->frame->ref);
+  FREE_POINTER(state->frame->lcu_stats);
 }
 
 static int encoder_state_config_tile_init(encoder_state_t * const state, 
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 31212ebe..967cfa8c 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -202,7 +202,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
   const encoder_control_t * const encoder = state->encoder_control;
   videoframe_t* const frame = state->tile->frame;
 
-  kvz_set_lcu_lambda_and_qp(state);
+  kvz_set_lcu_lambda_and_qp(state, lcu->position);
 
   //This part doesn't write to bitstream, it's only search, deblock and sao
   
@@ -241,6 +241,7 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
   }
   
   //Now write data to bitstream (required to have a correct CABAC state)
+  const uint64_t existing_bits = kvz_bitstream_tell(&state->stream);
   
   //First LCU, and we are in a slice. We need a slice header
   if (state->type == ENCODER_STATE_TYPE_SLICE && lcu->index == 0) {
@@ -266,7 +267,10 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
     //Always 0 since otherwise it would be split
     kvz_cabac_encode_bin_trm(&state->cabac, 0);  // end_of_slice_segment_flag
   }
-  
+
+  const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits;
+  kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->bits = bits;
+
   //Wavefronts need the context to be copied to the next row
   if (state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && lcu->index == 1) {
     int j;
@@ -860,6 +864,22 @@ static void encoder_state_init_children(encoder_state_t * const state) {
   }
 }
 
+static void normalize_lcu_weights(encoder_state_t * const state)
+{
+  if (state->frame->num == 0) return;
+  // Frame 0 is skipped above; otherwise divide every LCU weight by their total.
+  const uint32_t num_lcus = state->encoder_control->in.width_in_lcu *
+                            state->encoder_control->in.height_in_lcu;
+  double sum = 0.0;
+  for (uint32_t i = 0; i < num_lcus; i++) {
+    sum += state->frame->lcu_stats[i].weight;
+  }
+  // NOTE(review): sum is not checked for zero before the division -- confirm.
+  for (uint32_t i = 0; i < num_lcus; i++) {
+    state->frame->lcu_stats[i].weight /= sum;
+  }
+}
+
 static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) {
   assert(state->type == ENCODER_STATE_TYPE_MAIN);
 
@@ -908,6 +928,7 @@ static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_pict
   encoder_state_remove_refs(state);
   encoder_state_ref_sort(state);
 
+  normalize_lcu_weights(state);
   kvz_set_picture_lambda_and_qp(state);
 
   encoder_state_init_children(state);
@@ -1045,3 +1066,11 @@ coeff_scan_order_t kvz_get_scan_order(int8_t cu_type, int intra_mode, int depth)
 
   return SCAN_DIAG;
 }
+
+lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y)
+{
+  const int index = lcu_x + state->tile->lcu_offset_x +
+                    (lcu_y + state->tile->lcu_offset_y) *
+                    state->encoder_control->in.width_in_lcu;
+  return &state->frame->lcu_stats[index];
+}
diff --git a/src/encoderstate.h b/src/encoderstate.h
index ab3603b9..d883c11a 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -49,6 +49,23 @@ typedef enum {
 } encoder_state_type;
 
 
+typedef struct lcu_stats_t {
+  //! \brief Number of bits that were spent
+  uint32_t bits;
+
+  //! \brief Weight of the LCU for rate control
+  double weight;
+
+  //! \brief Lambda value which was used for this LCU
+  double lambda;
+
+  //! \brief Rate control alpha parameter
+  double rc_alpha;
+
+  //! \brief Rate control beta parameter
+  double rc_beta;
+} lcu_stats_t;
+
 
 typedef struct encoder_state_config_frame_t {
   /**
@@ -114,6 +131,13 @@ typedef struct encoder_state_config_frame_t {
    */
   bool done;
 
+  /**
+   * \brief Information about the coded LCUs.
+   *
+   * Used for rate control.
+   */
+  lcu_stats_t *lcu_stats;
+
 } encoder_state_config_frame_t;
 
 typedef struct encoder_state_config_tile_t {
@@ -249,6 +273,8 @@ void kvz_encoder_get_ref_lists(const encoder_state_t *const state,
                                int ref_list_len_out[2],
                                int ref_list_poc_out[2][16]);
 
+lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y);
+
 static const uint8_t g_group_idx[32] = {
   0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
   6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
diff --git a/src/rate_control.c b/src/rate_control.c
index 5be244ae..9ce91959 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -190,9 +190,15 @@ void kvz_set_picture_lambda_and_qp(encoder_state_t * const state)
   }
 }
 
-void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state)
+void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
+                               vector2d_t pos)
 {
   state->lambda      = state->frame->lambda;
   state->lambda_sqrt = sqrt(state->frame->lambda);
   state->qp          = state->frame->QP;
+
+  lcu_stats_t *lcu_stats = kvz_get_lcu_stats(state, pos.x, pos.y);
+  lcu_stats->lambda      = state->lambda;
+  lcu_stats->rc_alpha    = state->frame->rc_alpha;
+  lcu_stats->rc_beta     = state->frame->rc_beta;
 }
diff --git a/src/rate_control.h b/src/rate_control.h
index b1e9281d..1ead1ca6 100644
--- a/src/rate_control.h
+++ b/src/rate_control.h
@@ -32,6 +32,7 @@
 
 void kvz_set_picture_lambda_and_qp(encoder_state_t * const state);
 
-void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state);
+void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
+                               vector2d_t pos);
 
 #endif // RATE_CONTROL_H_
diff --git a/src/search.c b/src/search.c
index 440e7dca..6d31a50f 100644
--- a/src/search.c
+++ b/src/search.c
@@ -949,7 +949,10 @@ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, con
   }
 
   // Start search from depth 0.
-  search_cu(state, x, y, 0, work_tree);
+  double cost = search_cu(state, x, y, 0, work_tree);
+
+  // Save squared cost for rate control.
+  kvz_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight = cost * cost;
 
   // The best decisions through out the LCU got propagated back to depth 0,
   // so copy those back to the frame.

From 3af4e9cc8a06dbdde4eea385e4a4d19bf9b708b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 24 Aug 2016 11:38:10 +0900
Subject: [PATCH 07/12] Allocate bits separately for each LCU

Bits are allocated based on the costs of the LCUs in the previous
completely coded frame.

Breaks deblock when rate control is used.
---
 src/encoderstate.h |  3 ++
 src/rate_control.c | 77 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/src/encoderstate.h b/src/encoderstate.h
index d883c11a..f0a56e3a 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -114,6 +114,9 @@ typedef struct encoder_state_config_frame_t {
   //! Number of bits targeted for the current GOP.
   double cur_gop_target_bits;
 
+  //! Number of bits targeted for the current picture.
+  double cur_pic_target_bits;
+
   // Parameters used in rate control
   double rc_alpha;
   double rc_beta;
diff --git a/src/rate_control.c b/src/rate_control.c
index 9ce91959..154570b2 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -30,6 +30,14 @@ static const int SMOOTHING_WINDOW = 40;
 static const double MIN_LAMBDA    = 0.1;
 static const double MAX_LAMBDA    = 10000;
 
+/**
+ * \brief Clip lambda value to a valid range.
+ */
+static double clip_lambda(double lambda) {
+  if (isnan(lambda)) return MAX_LAMBDA;
+  return CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda);
+}
+
 /**
  * \brief Update alpha and beta parameters.
  * \param state the main encoder state
@@ -47,7 +55,7 @@ static void update_rc_parameters(encoder_state_t * state)
   const double alpha_old = state->frame->rc_alpha;
   const double beta_old = state->frame->rc_beta;
   // lambda computed from real bpp
-  const double lambda_comp = CLIP(MIN_LAMBDA, MAX_LAMBDA, alpha_old * pow(bpp, beta_old));
+  const double lambda_comp = clip_lambda(alpha_old * pow(bpp, beta_old));
   // lambda used in encoding
   const double lambda_real = state->frame->lambda;
   const double lambda_log_ratio = log(lambda_real) - log(lambda_comp);
@@ -147,9 +155,12 @@ void kvz_set_picture_lambda_and_qp(encoder_state_t * const state)
     // TODO: take the picture headers into account
     const double pic_target_bits = pic_allocate_bits(state);
     const double target_bpp = pic_target_bits / ctrl->in.pixels_per_pic;
-    const double lambda = state->frame->rc_alpha * pow(target_bpp, state->frame->rc_beta);
-    state->frame->lambda = CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda);
-    state->frame->QP = lambda_to_qp(lambda);
+    double lambda = state->frame->rc_alpha * pow(target_bpp, state->frame->rc_beta);
+    lambda = clip_lambda(lambda);
+
+    state->frame->lambda              = lambda;
+    state->frame->QP                  = lambda_to_qp(lambda);
+    state->frame->cur_pic_target_bits = pic_target_bits;
 
   } else {
     // Rate control disabled
@@ -190,15 +201,59 @@ void kvz_set_picture_lambda_and_qp(encoder_state_t * const state)
   }
 }
 
+/**
+ * \brief Allocate bits for a LCU.
+ * \param state   the main encoder state
+ * \param pos     location of the LCU as number of LCUs from top left
+ * \return number of bits allocated for the LCU
+ */
+static double lcu_allocate_bits(encoder_state_t * const state,
+                                vector2d_t pos)
+{
+  double lcu_weight;
+  if (state->frame->num > state->encoder_control->owf) {
+    lcu_weight = kvz_get_lcu_stats(state, pos.x, pos.y)->weight;
+  } else {
+    const uint32_t num_lcus = state->encoder_control->in.width_in_lcu *
+                              state->encoder_control->in.height_in_lcu;
+    lcu_weight = 1.0 / num_lcus;
+  }
+
+  // Target number of bits for the current LCU.
+  const double lcu_target_bits = state->frame->cur_pic_target_bits * lcu_weight;
+
+  // Allocate at least one bit for each LCU.
+  return MAX(1, lcu_target_bits);
+}
+
 void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
                                vector2d_t pos)
 {
-  state->lambda      = state->frame->lambda;
-  state->lambda_sqrt = sqrt(state->frame->lambda);
-  state->qp          = state->frame->QP;
+  const encoder_control_t * const ctrl = state->encoder_control;
 
-  lcu_stats_t *lcu_stats = kvz_get_lcu_stats(state, pos.x, pos.y);
-  lcu_stats->lambda      = state->lambda;
-  lcu_stats->rc_alpha    = state->frame->rc_alpha;
-  lcu_stats->rc_beta     = state->frame->rc_beta;
+  if (ctrl->cfg->target_bitrate > 0) {
+    const int32_t pixels     = MIN(LCU_WIDTH, state->tile->frame->width  - LCU_WIDTH * pos.x) *
+                               MIN(LCU_WIDTH, state->tile->frame->height - LCU_WIDTH * pos.y);
+    const double target_bits = lcu_allocate_bits(state, pos);
+    const double target_bpp  = target_bits / pixels;
+    const double alpha = state->frame->rc_alpha;
+    const double beta  = state->frame->rc_beta;
+
+    double lambda = alpha * pow(target_bpp, beta);
+    lambda = clip_lambda(lambda);
+
+    state->qp          = lambda_to_qp(lambda);
+    state->lambda      = lambda;
+    state->lambda_sqrt = sqrt(lambda);
+
+    lcu_stats_t *lcu_stats = kvz_get_lcu_stats(state, pos.x, pos.y);
+    lcu_stats->lambda      = lambda;
+    lcu_stats->rc_alpha    = alpha;
+    lcu_stats->rc_beta     = beta;
+
+  } else {
+    state->qp          = state->frame->QP;
+    state->lambda      = state->frame->lambda;
+    state->lambda_sqrt = sqrt(state->frame->lambda);
+  }
 }

From 93172fd25114aa45cc81539b87122fdecca52e49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 24 Aug 2016 12:51:54 +0900
Subject: [PATCH 08/12] Use separate alpha, beta and lambda for each LCU

Changes rate control to use the alpha and beta values stored in
lcu_stats_t instead of the frame-level values when selecting lambda and
QP for an LCU.
---
 src/rate_control.c | 69 +++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/src/rate_control.c b/src/rate_control.c
index 154570b2..a631e52a 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -40,31 +40,28 @@ static double clip_lambda(double lambda) {
 
 /**
  * \brief Update alpha and beta parameters.
- * \param state the main encoder state
  *
- * Sets global->rc_alpha and global->rc_beta of the encoder state.
+ * \param         bits        number of bits spent for coding the area
+ * \param         pixels      size of the area in pixels
+ * \param         lambda_real lambda used for coding the area
+ * \param[in,out] alpha       alpha parameter to update
+ * \param[in,out] beta        beta parameter to update
  */
-static void update_rc_parameters(encoder_state_t * state)
+static void update_parameters(uint32_t bits,
+                              uint32_t pixels,
+                              double lambda_real,
+                              double *alpha,
+                              double *beta)
 {
-  const encoder_control_t * const encoder = state->encoder_control;
-
-  const double pixels_per_picture = encoder->in.width * encoder->in.height;
-  const double bpp = state->stats_bitstream_length * 8 / pixels_per_picture;
-  const double log_bpp = log(bpp);
-
-  const double alpha_old = state->frame->rc_alpha;
-  const double beta_old = state->frame->rc_beta;
-  // lambda computed from real bpp
-  const double lambda_comp = clip_lambda(alpha_old * pow(bpp, beta_old));
-  // lambda used in encoding
-  const double lambda_real = state->frame->lambda;
+  const double bpp              = bits / (double)pixels;
+  const double lambda_comp      = clip_lambda(*alpha * pow(bpp, *beta));
   const double lambda_log_ratio = log(lambda_real) - log(lambda_comp);
 
-  const double alpha = alpha_old + 0.1 * lambda_log_ratio * alpha_old;
-  state->frame->rc_alpha = CLIP(0.05, 20, alpha);
+  *alpha += 0.10 * lambda_log_ratio * (*alpha);
+  *alpha = CLIP(0.05, 20, *alpha);
 
-  const double beta = beta_old + 0.05 * lambda_log_ratio * CLIP(-5, 1, log_bpp);
-  state->frame->rc_beta = CLIP(-3, -0.1, beta);
+  *beta  += 0.05 * lambda_log_ratio * CLIP(-5.0, -1.0, log(bpp));
+  *beta  = CLIP(-3, -0.1, *beta);
 }
 
 /**
@@ -149,7 +146,11 @@ void kvz_set_picture_lambda_and_qp(encoder_state_t * const state)
 
     if (state->frame->num > ctrl->owf) {
       // At least one frame has been written.
-      update_rc_parameters(state);
+      update_parameters(state->stats_bitstream_length * 8,
+                        ctrl->in.pixels_per_pic,
+                        state->frame->lambda,
+                        &state->frame->rc_alpha,
+                        &state->frame->rc_beta);
     }
 
     // TODO: take the picture headers into account
@@ -232,24 +233,30 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
   const encoder_control_t * const ctrl = state->encoder_control;
 
   if (ctrl->cfg->target_bitrate > 0) {
-    const int32_t pixels     = MIN(LCU_WIDTH, state->tile->frame->width  - LCU_WIDTH * pos.x) *
+    lcu_stats_t *lcu         = kvz_get_lcu_stats(state, pos.x, pos.y);
+    const uint32_t pixels    = MIN(LCU_WIDTH, state->tile->frame->width  - LCU_WIDTH * pos.x) *
                                MIN(LCU_WIDTH, state->tile->frame->height - LCU_WIDTH * pos.y);
+
+    if (state->frame->num > ctrl->owf) {
+      update_parameters(lcu->bits,
+                        pixels,
+                        lcu->lambda,
+                        &lcu->rc_alpha,
+                        &lcu->rc_beta);
+    } else {
+      lcu->rc_alpha = state->frame->rc_alpha;
+      lcu->rc_beta  = state->frame->rc_beta;
+    }
+
     const double target_bits = lcu_allocate_bits(state, pos);
     const double target_bpp  = target_bits / pixels;
-    const double alpha = state->frame->rc_alpha;
-    const double beta  = state->frame->rc_beta;
 
-    double lambda = alpha * pow(target_bpp, beta);
-    lambda = clip_lambda(lambda);
+    double lambda = clip_lambda(lcu->rc_alpha * pow(target_bpp, lcu->rc_beta));
 
-    state->qp          = lambda_to_qp(lambda);
+    lcu->lambda        = lambda;
     state->lambda      = lambda;
     state->lambda_sqrt = sqrt(lambda);
-
-    lcu_stats_t *lcu_stats = kvz_get_lcu_stats(state, pos.x, pos.y);
-    lcu_stats->lambda      = lambda;
-    lcu_stats->rc_alpha    = alpha;
-    lcu_stats->rc_beta     = beta;
+    state->qp          = lambda_to_qp(lambda);
 
   } else {
     state->qp          = state->frame->QP;

From 82a98180e43eaf85da60ad76bbdc90161d8b3500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 24 Aug 2016 12:57:31 +0900
Subject: [PATCH 09/12] Clip LCU lambda to reduce quality fluctuation

Limits the lambda of each LCU to a range around the lambda computed for
the same LCU from the previous frame, and to a range around the
frame-level lambda.
---
 src/rate_control.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/rate_control.c b/src/rate_control.c
index a631e52a..76c435a3 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -252,6 +252,19 @@ void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state,
     const double target_bpp  = target_bits / pixels;
 
     double lambda = clip_lambda(lcu->rc_alpha * pow(target_bpp, lcu->rc_beta));
+    // Clip lambda according to the equations 24 and 26 in
+    // https://doi.org/10.1109/TIP.2014.2336550
+    if (state->frame->num > ctrl->owf) {
+      const double bpp         = lcu->bits / (double)pixels;
+      const double lambda_comp = clip_lambda(lcu->rc_alpha * pow(bpp, lcu->rc_beta));
+      lambda = CLIP(lambda_comp * 0.7937005259840998,
+                    lambda_comp * 1.2599210498948732,
+                    lambda);
+    }
+    lambda = CLIP(state->frame->lambda * 0.6299605249474366,
+                  state->frame->lambda * 1.5874010519681994,
+                  lambda);
+    lambda = clip_lambda(lambda);
 
     lcu->lambda        = lambda;
     state->lambda      = lambda;

From c219d3cd944993c536c038d7ca0e17363380a0d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Tue, 27 Sep 2016 20:39:37 +0900
Subject: [PATCH 10/12] Fix deblock when CU QP delta is enabled

Fixes deblock functions so that they use the correct QP for the filtered
edge. Adds field qp to cu_info_t.
---
 src/cu.h           |  7 +++++
 src/encoderstate.c | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 src/filter.c       | 27 +++++++++++++++--
 3 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index cf2a4e9a..6b5bdf38 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -126,6 +126,13 @@ typedef struct
 
   uint16_t cbf;
 
+  /**
+   * \brief QP used for the CU.
+   *
+   * This is required for deblocking when per-LCU QPs are enabled.
+   */
+  uint8_t qp;
+
   union {
     struct {
       int8_t mode;
diff --git a/src/encoderstate.c b/src/encoderstate.c
index 967cfa8c..825a7373 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -196,7 +196,76 @@ static void encode_sao(encoder_state_t * const state,
 }
 
 
-static void encoder_state_worker_encode_lcu(void * opaque) {
+/**
+ * \brief Sets the QP for each CU in state->tile->frame->cu_array.
+ *
+ * The QPs are used in deblocking.
+ *
+ * The delta QP for an LCU is coded when the first CU with coded block flag
+ * set is encountered. Hence, for the purposes of deblocking, all CUs
+ * before the first one with cbf set use state->ref_qp and all CUs after
+ * that use state->qp.
+ *
+ * \param state           encoder state
+ * \param x               x-coordinate of the left edge of the root CU
+ * \param y               y-coordinate of the top edge of the root CU
+ * \param depth           depth in the CU quadtree
+ * \param coeffs_coded    Used for tracking whether a CU with a residual
+ *                        has been encountered. Should be set to false at
+ *                        the top level.
+ * \return Whether there were any CUs with residual or not.
+ */
+static bool set_cu_qps(encoder_state_t *state, int x, int y, int depth, bool coeffs_coded)
+{
+  if (state->qp == state->ref_qp) {
+    // If the QPs are equal there is no need to care about the residuals.
+    coeffs_coded = true;
+  }
+
+  cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y);
+  const int cu_width = LCU_WIDTH >> depth;
+  coeffs_coded = coeffs_coded || cbf_is_set_any(cu->cbf, cu->depth);
+
+  if (!coeffs_coded && cu->depth > depth) {
+    // Recursively process sub-CUs.
+    const int d = cu_width >> 1;
+    coeffs_coded = set_cu_qps(state, x,     y,     depth + 1, coeffs_coded);
+    coeffs_coded = set_cu_qps(state, x + d, y,     depth + 1, coeffs_coded);
+    coeffs_coded = set_cu_qps(state, x,     y + d, depth + 1, coeffs_coded);
+    coeffs_coded = set_cu_qps(state, x + d, y + d, depth + 1, coeffs_coded);
+
+  } else {
+    if (!coeffs_coded && cu->tr_depth > depth) {
+      // The CU is split into smaller transform units. Check whether coded
+      // block flag is set for any of the TUs.
+      const int tu_width = LCU_WIDTH >> cu->tr_depth;
+      for (int y_scu = y; y_scu < y + cu_width; y_scu += tu_width) {
+        for (int x_scu = x; x_scu < x + cu_width; x_scu += tu_width) {
+          cu_info_t *tu = kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu);
+          if (cbf_is_set_any(tu->cbf, cu->depth)) {
+            coeffs_coded = true;
+          }
+        }
+      }
+    }
+
+    // Set the correct QP for all state->tile->frame->cu_array elements in
+    // the area covered by the CU.
+    const int8_t qp = coeffs_coded ? state->qp : state->ref_qp;
+
+    for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) {
+      for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) {
+        kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp;
+      }
+    }
+  }
+
+  return coeffs_coded;
+}
+
+
+static void encoder_state_worker_encode_lcu(void * opaque)
+{
   const lcu_order_element_t * const lcu = opaque;
   encoder_state_t *state = lcu->encoder_state;
   const encoder_control_t * const encoder = state->encoder_control;
@@ -211,6 +280,10 @@ static void encoder_state_worker_encode_lcu(void * opaque) {
   encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);
 
   if (encoder->deblock_enable) {
+    if (encoder->cfg->target_bitrate > 0) {
+      set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, false);
+    }
+
     kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y);
   }
 
diff --git a/src/filter.c b/src/filter.c
index dce73199..d96db710 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -247,6 +247,27 @@ static bool is_on_8x8_grid(int x, int y, edge_dir dir)
   }
 }
 
+static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
+{
+  if (state->encoder_control->cfg->target_bitrate <= 0) {
+    return state->qp;
+  }
+
+  int32_t qp_p;
+  if (dir == EDGE_HOR && y > 0) {
+    qp_p = kvz_cu_array_at_const(state->tile->frame->cu_array, x, y - 1)->qp;
+  } else if (dir == EDGE_VER && x > 0) {
+    qp_p = kvz_cu_array_at_const(state->tile->frame->cu_array, x - 1, y)->qp;
+  } else {
+    qp_p = state->frame->QP;
+  }
+
+  const int32_t qp_q =
+    kvz_cu_array_at_const(state->tile->frame->cu_array, x, y)->qp;
+
+  return (qp_p + qp_q + 1) >> 1;
+}
+
 /**
  * \brief Apply the deblocking filter to luma pixels on a single edge.
  *
@@ -290,8 +311,9 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
     kvz_pixel *orig_src = &frame->rec->y[x + y*stride];
     kvz_pixel *src = orig_src;
 
+    const int32_t qp = get_qp_y_pred(state, x, y, dir);
+
     int8_t strength = 0;
-    int32_t qp              = state->qp;
     int32_t bitdepth_scale  = 1 << (encoder->bitdepth - 8);
     int32_t b_index         = CLIP(0, 51, qp + (beta_offset_div2 << 1));
     int32_t beta            = kvz_g_beta_table_8x8[b_index] * bitdepth_scale;
@@ -490,7 +512,8 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state,
     };
     int8_t strength = 2;
 
-    int32_t QP             = kvz_g_chroma_scale[state->qp];
+    const int32_t luma_qp  = get_qp_y_pred(state, x << 1, y << 1, dir);
+    int32_t QP             = kvz_g_chroma_scale[luma_qp];
     int32_t bitdepth_scale = 1 << (encoder->bitdepth-8);
     int32_t TC_index       = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1)));
     int32_t Tc             = kvz_g_tc_table_8x8[TC_index]*bitdepth_scale;

From ee518e8ac4f3b82f3d3a1ed4494ad38ba1796142 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Thu, 6 Oct 2016 21:27:01 +0900
Subject: [PATCH 11/12] Take header bits into account in rate control

---
 src/encoder_state-bitstream.c |  6 ++---
 src/encoderstate.h            | 13 ++++++++++
 src/rate_control.c            | 45 ++++++++++++++++++++++++++++++++---
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index edeef983..cb2be0f9 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -918,10 +918,8 @@ static void encoder_state_write_bitstream_main(encoder_state_t * const state)
     first_nal_in_au = false;
     encoder_state_write_bitstream_aud(state);
   }
-  
-  if ((encoder->vps_period > 0 && state->frame->num % encoder->vps_period == 0)
-      || (state->frame->num == 0 && encoder->vps_period >= 0))
-  {
+
+  if (encoder_state_must_write_vps(state)) {
     first_nal_in_au = false;
     kvz_encoder_state_write_parameter_sets(&state->stream, state);
   }
diff --git a/src/encoderstate.h b/src/encoderstate.h
index f0a56e3a..5354bc34 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -278,6 +278,19 @@ void kvz_encoder_get_ref_lists(const encoder_state_t *const state,
 
 lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y);
 
+
+/**
+ * Whether the parameter sets should be written with the current frame.
+ */
+static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
+{
+  const int32_t frame = state->frame->num;
+  const int32_t vps_period = state->encoder_control->vps_period;
+
+  return (vps_period >  0 && frame % vps_period == 0) ||
+         (vps_period >= 0 && frame == 0);
+}
+
 static const uint8_t g_group_idx[32] = {
   0, 1, 2, 3, 4, 4, 5, 5, 6, 6,
   6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
diff --git a/src/rate_control.c b/src/rate_control.c
index 76c435a3..aa770528 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -95,10 +95,49 @@ static double gop_allocate_bits(encoder_state_t * const state)
   return MAX(200, gop_target_bits);
 }
 
+/**
+ * Estimate number of bits used for headers of the current picture.
+ * \param state   the main encoder state
+ * \return        number of header bits
+ */
+static uint64_t pic_header_bits(encoder_state_t * const state)
+{
+  const kvz_config* cfg = state->encoder_control->cfg;
+
+  // nal type and slice header
+  uint64_t bits = 48 + 24;
+
+  // entry points
+  bits += 12 * state->encoder_control->in.height_in_lcu;
+
+  switch (cfg->hash) {
+    case KVZ_HASH_CHECKSUM:
+      bits += 168;
+      break;
+
+    case KVZ_HASH_MD5:
+      bits += 456;
+      break;
+
+    case KVZ_HASH_NONE:
+      break;
+  }
+
+  if (encoder_state_must_write_vps(state)) {
+    bits += 613;
+  }
+
+  if (state->frame->num == 0 && cfg->add_encoder_info) {
+    bits += 1392;
+  }
+
+  return bits;
+}
+
 /**
  * Allocate bits for the current picture.
  * \param state   the main encoder state
- * \return        target number of bits
+ * \return        target number of bits, excluding headers
  */
 static double pic_allocate_bits(encoder_state_t * const state)
 {
@@ -122,7 +161,8 @@ static double pic_allocate_bits(encoder_state_t * const state)
 
   const double pic_weight = encoder->gop_layer_weights[
     encoder->cfg->gop[state->frame->gop_offset].layer - 1];
-  double pic_target_bits = state->frame->cur_gop_target_bits * pic_weight;
+  const double pic_target_bits =
+    state->frame->cur_gop_target_bits * pic_weight - pic_header_bits(state);
   // Allocate at least 100 bits for each picture like HM does.
   return MAX(100, pic_target_bits);
 }
@@ -153,7 +193,6 @@ void kvz_set_picture_lambda_and_qp(encoder_state_t * const state)
                         &state->frame->rc_beta);
     }
 
-    // TODO: take the picture headers into account
     const double pic_target_bits = pic_allocate_bits(state);
     const double target_bpp = pic_target_bits / ctrl->in.pixels_per_pic;
     double lambda = state->frame->rc_alpha * pow(target_bpp, state->frame->rc_beta);

From 05794c3548400ab5d9f958e9faaeecedaae380d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Wed, 11 Jan 2017 15:47:53 +0900
Subject: [PATCH 12/12] Add missing static to function lambda_to_qp

---
 src/rate_control.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rate_control.c b/src/rate_control.c
index aa770528..e9c6096e 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -167,7 +167,7 @@ static double pic_allocate_bits(encoder_state_t * const state)
   return MAX(100, pic_target_bits);
 }
 
-int8_t lambda_to_qp(const double lambda)
+static int8_t lambda_to_qp(const double lambda)
 {
   const int8_t qp = 4.2005 * log(lambda) + 13.7223 + 0.5;
   return CLIP(0, 51, qp);