From 0577d481c146f90cc47cc0ff562a523d73810c8f Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tut.fi>
Date: Wed, 25 Sep 2019 12:12:21 +0300
Subject: [PATCH] CTU level code

---
 src/encoder_state-ctors_dtors.c |  30 +++---
 src/encoderstate.h              |   3 +-
 src/kvazaar.h                   |   2 +-
 src/rate_control.c              | 177 +++++++++++++++++++++++++++++---
 4 files changed, 179 insertions(+), 33 deletions(-)

diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c
index 09300a85..24bcbdf5 100644
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -64,19 +64,19 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
   }
 
   for(int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) {
-    state->frame->new_lookahead.c_para[i] = malloc(sizeof(double) * num_lcus);
-    state->frame->new_lookahead.k_para[i] = malloc(sizeof(double) * num_lcus);
-    state->frame->new_lookahead.pic_c_para[i] = 5.0;
-    state->frame->new_lookahead.pic_k_para[i] = -0.1;
+    state->frame->new_ratecontrol.c_para[i] = malloc(sizeof(double) * num_lcus);
+    state->frame->new_ratecontrol.k_para[i] = malloc(sizeof(double) * num_lcus);
+    state->frame->new_ratecontrol.pic_c_para[i] = 5.0;
+    state->frame->new_ratecontrol.pic_k_para[i] = -0.1;
     for(int j = 0; j < num_lcus; j++) {
-      state->frame->new_lookahead.c_para[i][j] = 5.0;
-      state->frame->new_lookahead.k_para[i][j] = -0.1;
+      state->frame->new_ratecontrol.c_para[i][j] = 5.0;
+      state->frame->new_ratecontrol.k_para[i][j] = -0.1;
     }
   }
-  state->frame->new_lookahead.intra_slice_bpp = calloc(num_lcus, sizeof(double));
-  state->frame->new_lookahead.intra_slice_dis = calloc(num_lcus, sizeof(double));
-  memset(state->frame->new_lookahead.previous_lambdas, 0, sizeof(state->frame->new_lookahead.previous_lambdas));
-  state->frame->new_lookahead.last_frame_lambda = 0.0;
+  state->frame->new_ratecontrol.intra_slice_bpp = calloc(num_lcus, sizeof(double));
+  state->frame->new_ratecontrol.intra_slice_dis = calloc(num_lcus, sizeof(double));
+  memset(state->frame->new_ratecontrol.previous_lambdas, 0, sizeof(state->frame->new_ratecontrol.previous_lambdas));
+  state->frame->new_ratecontrol.last_frame_lambda = 0.0;
 
   return 1;
 }
@@ -84,11 +84,11 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
 static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
   if (state->frame == NULL) return;
 
-  FREE_POINTER(state->frame->new_lookahead.intra_slice_bpp);
-  FREE_POINTER(state->frame->new_lookahead.intra_slice_dis);
-  for(int i = 0; i < 6; i++) {
-    FREE_POINTER(state->frame->new_lookahead.c_para[i]);
-    FREE_POINTER(state->frame->new_lookahead.k_para[i]);
+  FREE_POINTER(state->frame->new_ratecontrol.intra_slice_bpp);
+  FREE_POINTER(state->frame->new_ratecontrol.intra_slice_dis);
+  for(int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) {
+    FREE_POINTER(state->frame->new_ratecontrol.c_para[i]);
+    FREE_POINTER(state->frame->new_ratecontrol.k_para[i]);
   }
 
   kvz_image_list_destroy(state->frame->ref);
diff --git a/src/encoderstate.h b/src/encoderstate.h
index 4d670487..5837c295 100644
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@@ -66,6 +66,7 @@ typedef struct lcu_stats_t {
 
   //! \brief Rate control beta parameter
   double rc_beta;
+  int8_t qp;
 } lcu_stats_t;
 
 
@@ -153,7 +154,7 @@ typedef struct encoder_state_config_frame_t {
     double *intra_slice_dis;
     double previous_lambdas[KVZ_MAX_GOP_LAYERS+1];
     double last_frame_lambda;
-  } new_lookahead;
+  } new_ratecontrol;
 
   /**
    * \brief Whether next NAL is the first NAL in the access unit.
diff --git a/src/kvazaar.h b/src/kvazaar.h
index 76857efa..1ed47168 100644
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@@ -394,7 +394,7 @@ typedef struct kvz_config
 
   /** \brief Enable Early Skip Mode Decision */
   uint8_t early_skip;
-  uint8_t frame_allocation;
+  int8_t frame_allocation;
 } kvz_config;
 
 /**
diff --git a/src/rate_control.c b/src/rate_control.c
index 69f0f1a2..06d6c293 100644
--- a/src/rate_control.c
+++ b/src/rate_control.c
@@ -167,6 +167,11 @@ static double pic_allocate_bits(encoder_state_t * const state)
   return MAX(100, pic_target_bits);
 }
 
+static int8_t lambda_to_qp(const double lambda)
+{
+  const int8_t qp = 4.2005 * log(lambda) + 13.7223 + 0.5;
+  return CLIP_TO_QP(qp);
+}
 
 static double solve_cubic_equation(const encoder_state_config_frame_t * const state,
                             int ctu_index,
@@ -190,10 +195,10 @@ static double solve_cubic_equation(const encoder_state_config_frame_t * const st
     double b = 0.0;
     double c = 0.0;
     double d = 0.0;
-    assert((state->new_lookahead.c_para[layer][i] <= 0) || (state->new_lookahead.k_para[layer][i] >= 0)); //Check C and K during each solution 
+    assert((state->new_ratecontrol.c_para[layer][i] <= 0) || (state->new_ratecontrol.k_para[layer][i] >= 0)); //Check C and K during each solution 
 
-    double CLCU = state->new_lookahead.c_para[layer][i];
-    double KLCU = state->new_lookahead.k_para[layer][i];
+    double CLCU = state->new_ratecontrol.c_para[layer][i];
+    double KLCU = state->new_ratecontrol.k_para[layer][i];
     a = -CLCU * KLCU / pow(state->lcu_stats[i].pixels, KLCU - 1.0);
     b = -1.0 / (KLCU - 1.0);
     d = est_lambda;
@@ -248,8 +253,8 @@ static double solve_cubic_equation(const encoder_state_config_frame_t * const st
 static INLINE double calculate_weights(encoder_state_t* const state, const int layer, const int ctu_count, double estLambda) {
   double total_weight = 0;
   for(int i = 0; i < ctu_count; i++) {
-    double CLCU = state->frame->new_lookahead.c_para[layer][i];
-    double KLCU = state->frame->new_lookahead.k_para[layer][i];
+    double CLCU = state->frame->new_ratecontrol.c_para[layer][i];
+    double KLCU = state->frame->new_ratecontrol.k_para[layer][i];
     double a = -CLCU * KLCU / pow(state->frame->lcu_stats[i].pixels, KLCU - 1.0);
     double b = -1.0 / (KLCU - 1.0);
     state->frame->lcu_stats[i].weight = pow(a / estLambda, b);
@@ -261,6 +266,7 @@ static INLINE double calculate_weights(encoder_state_t* const state, const int l
   return total_weight;
 }
 
+// TODO: Missing QP calculation
 void estimatePicLambda(encoder_state_t * const state) {
   double bits = pic_allocate_bits(state);
   const int layer = state->frame->gop_offset - (state->frame->is_irap ? 1 : 0);
@@ -273,9 +279,9 @@ void estimatePicLambda(encoder_state_t * const state) {
     beta = state->frame->rc_beta;
   }
   else {
-    alpha = -state->frame->new_lookahead.pic_c_para[state->frame->gop_offset] *
-      state->frame->new_lookahead.pic_k_para[state->frame->gop_offset];
-    beta = state->frame->new_lookahead.pic_k_para[state->frame->gop_offset] - 1;
+    alpha = -state->frame->new_ratecontrol.pic_c_para[state->frame->gop_offset] *
+      state->frame->new_ratecontrol.pic_k_para[state->frame->gop_offset];
+    beta = state->frame->new_ratecontrol.pic_k_para[state->frame->gop_offset] - 1;
   }
   double estLambda;
   double bpp = bits / (state->encoder_control->cfg.width * state->encoder_control->cfg.height);
@@ -288,11 +294,11 @@ void estimatePicLambda(encoder_state_t * const state) {
   }
 
   double temp_lambda;
-  if ((temp_lambda = state->frame->new_lookahead.previous_lambdas[layer]) > 0.0) {
+  if ((temp_lambda = state->frame->new_ratecontrol.previous_lambdas[layer]) > 0.0) {
     estLambda = CLIP(temp_lambda * pow(2.0, -1), temp_lambda * 2, estLambda);
   }
 
-  if((temp_lambda = state->frame->new_lookahead.last_frame_lambda) > 0.0) {
+  if((temp_lambda = state->frame->new_ratecontrol.last_frame_lambda) > 0.0) {
     estLambda = CLIP(temp_lambda * pow(2.0, -10.0 / 3.0), temp_lambda * pow(2.0, 10.0 / 3.0), estLambda);
   }
 
@@ -310,8 +316,8 @@ void estimatePicLambda(encoder_state_t * const state) {
         taylor_e3 = 0.0;
         best_lambda = temp_lambda = solve_cubic_equation(state->frame, 0, ctu_count, layer, temp_lambda, bits);
         for (int i = 0; i < ctu_count; ++i) {
-          double CLCU = state->frame->new_lookahead.c_para[layer][i];
-          double KLCU = state->frame->new_lookahead.k_para[layer][i];
+          double CLCU = state->frame->new_ratecontrol.c_para[layer][i];
+          double KLCU = state->frame->new_ratecontrol.k_para[layer][i];
           double a = -CLCU * KLCU / pow(state->frame->lcu_stats[i].pixels, KLCU - 1.0);
           double b = -1.0 / (KLCU - 1.0);
           taylor_e3 += pow(a / best_lambda, b);
@@ -338,12 +344,151 @@ void estimatePicLambda(encoder_state_t * const state) {
 }
 
 
-static int8_t lambda_to_qp(const double lambda)
-{
-  const int8_t qp = 4.2005 * log(lambda) + 13.7223 + 0.5;
-  return CLIP_TO_QP(qp);
+static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) {
+  double bpp;
+  int avg_bits;
+
+  const int layer = state->frame->gop_offset - (state->frame->is_irap ? 1 : 0);
+
+  const int num_ctu = state->encoder_control->in.width_in_lcu * state->encoder_control->in.height_in_lcu;
+  const int index = pos.x + pos.y * state->tile->frame->width_in_lcu;
+
+  if (state->frame->is_irap) {
+    // TODO: intra
+    avg_bits = state->frame->cur_pic_target_bits / ((double)state->frame->lcu_stats[index].pixels / 
+      (state->encoder_control->in.height * state->encoder_control->in.width));
+  }
+  else {
+    double totalWeight = 0;
+    const int realInfluenceLCU = MIN(4, num_ctu - index); //g_RCLCUSmoothWindowSize, the same as the original RC scheme
+    int TargetbitsForSmoothWindow = 0;
+    double bestlambda = 0.0;
+    double Templambda = state->frame->lambda;
+    double TaylorE3 = 0.0;
+    int IterationNum = 0;
+    double estLambda = Templambda;
+
+    for (int i = index; i < num_ctu; i++) {
+      totalWeight += state->frame->lcu_stats[i].weight;
+    }
+
+    int last_ctu = index + realInfluenceLCU;
+    for (int i = index; i < last_ctu; i++) {
+      TargetbitsForSmoothWindow += state->frame->lcu_stats[i].weight;
+    }
+
+    TargetbitsForSmoothWindow = MAX(TargetbitsForSmoothWindow + state->frame->total_bits_coded - (int)totalWeight, 10); //obtain the total bit-rate for the realInfluenceLCU (=4) CTUs
+
+    //just similar with the process at frame level, details can refer to the function TEncRCPic::estimatePicLambda
+    do {
+      TaylorE3 = 0.0;
+      bestlambda = solve_cubic_equation(state->frame, index, last_ctu, layer, Templambda, TargetbitsForSmoothWindow);
+      Templambda = bestlambda;
+      for (int i = index; i < last_ctu; i++) {
+
+        double CLCU = state->frame->new_ratecontrol.c_para[layer][i];
+        double KLCU = state->frame->new_ratecontrol.k_para[layer][i];
+        double a = -CLCU * KLCU / pow((double)state->frame->lcu_stats[i].pixels, KLCU - 1.0);
+        double b = -1.0 / (KLCU - 1.0);
+        TaylorE3 += pow(a / bestlambda, b);
+      }
+      IterationNum++;
+    } while (fabs(TaylorE3 - TargetbitsForSmoothWindow) > 0.01 && IterationNum < 5);
+
+    double CLCU = state->frame->new_ratecontrol.c_para[layer][index];
+    double KLCU = state->frame->new_ratecontrol.k_para[layer][index];
+    double a = -CLCU * KLCU / pow(((double)state->frame->lcu_stats[index].pixels), KLCU - 1.0);
+    double b = -1.0 / (KLCU - 1.0);
+
+    state->frame->lcu_stats[index].weight = MAX(pow(a / bestlambda, b), 0.01);
+
+    avg_bits = (int)(state->frame->lcu_stats[index].weight + 0.5);
+  }
+
+  if (avg_bits < 1) {
+    avg_bits = 1;
+  }
+  return avg_bits;
 }
 
+
+void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
+  double bits = get_ctu_bits(state, pos);
+
+  const int frame_allocation = state->encoder_control->cfg.frame_allocation;
+
+  int index = pos.x + pos.y * state->encoder_control->in.width_in_lcu;
+  double bpp = bits / state->frame->lcu_stats[index].pixels;
+
+  double alpha;
+  double beta;
+  if (state->frame->poc == 0) {
+    alpha = state->frame->rc_alpha;
+    beta = state->frame->rc_beta;
+  }
+  else {
+    alpha = -state->frame->new_ratecontrol.c_para[state->frame->gop_offset][index] *
+      state->frame->new_ratecontrol.k_para[state->frame->gop_offset][index];
+    beta = state->frame->new_ratecontrol.k_para[state->frame->gop_offset][index] - 1;
+  }
+
+  double est_lambda = alpha * pow(bpp, beta);
+  double clip_lambda = state->frame->lambda;
+
+  double clip_neighbor_lambda = -1;
+  for(int temp_index = index - 1; temp_index >= 0; --temp_index) {
+    if(state->frame->lcu_stats[index].lambda > 0) {
+      clip_neighbor_lambda = state->frame->lcu_stats[index].lambda;
+      break;
+    }
+  }
+
+  if (clip_neighbor_lambda > 0) {
+    est_lambda = CLIP(clip_neighbor_lambda * pow(2, -(1.0 + frame_allocation) / 3.0),
+      clip_neighbor_lambda * pow(2.0, (1.0 + frame_allocation) / 3.0),
+      est_lambda);
+  }
+
+  if (clip_lambda > 0) {
+    est_lambda = CLIP(clip_lambda * pow(2, -(2.0 + frame_allocation) / 3.0),
+      clip_lambda * pow(2.0, (1.0 + frame_allocation) / 3.0),
+      est_lambda);
+  }
+  else {
+    est_lambda = CLIP(10.0, 1000.0, est_lambda);
+  }
+
+  if (est_lambda < 0.1) {
+    est_lambda = 0.1;
+  }
+
+  int est_qp = lambda_to_qp(est_lambda);
+
+  int clip_qp = -1;
+  for (int temp_index = index - 1; temp_index >= 0; --temp_index) {
+    if (state->frame->lcu_stats[index].qp > -1) {
+      clip_qp = state->frame->lcu_stats[index].qp;
+      break;
+    }
+  }
+
+  if( clip_qp > -1) {
+    est_qp = CLIP(clip_qp - 1 - frame_allocation,
+      clip_qp + 1 + frame_allocation,
+      clip_qp);
+  }
+
+  est_qp = CLIP(state->frame->QP - 2 - frame_allocation,
+    state->frame->QP + 2 + frame_allocation,
+    est_qp);
+
+  state->lambda = est_lambda;
+  state->lambda_sqrt = sqrt(est_lambda);
+  state->qp = est_qp;
+  state->frame->lcu_stats[index].qp = est_qp;
+}
+
+
 static double qp_to_lamba(encoder_state_t * const state, int qp)
 {
   const encoder_control_t * const ctrl = state->encoder_control;