WPP with threading

2024-11-24 02:24:07 +00:00 · 2019-11-12 12:12:57 +02:00 · 2019-11-12 12:12:57 +02:00 · b9b01f8036
parent 615973adca
commit b9b01f8036
4 changed files with 30 additions and 13 deletions
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@ -60,10 +60,12 @@ static int encoder_state_config_frame_init(encoder_state_t * const state) {
  for (int y = 0; y < encoder->in.height_in_lcu; y++) {
    for (int x = 0; x < encoder->in.width_in_lcu; x++) {
      int temp = MIN(encoder->cfg.width - x * 64, 64) * MIN(encoder->cfg.height - y * 64, 64);
-      state->frame->lcu_stats[x + y * encoder->in.width_in_lcu].pixels =temp;
+      state->frame->lcu_stats[x + y * encoder->in.width_in_lcu].pixels = temp;
    }
  }

+  pthread_mutex_init(&state->frame->rc_lock, NULL);
+
  for(int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) {
    state->frame->new_ratecontrol.c_para[i] = malloc(sizeof(double) * num_lcus);
    state->frame->new_ratecontrol.k_para[i] = malloc(sizeof(double) * num_lcus);
@ -98,6 +100,7 @@ static void encoder_state_config_frame_finalize(encoder_state_t * const state) {
    FREE_POINTER(state->frame->new_ratecontrol.k_para[i]);
  }

+  pthread_mutex_destroy(&state->frame->rc_lock);
  // fclose(state->frame->bpp_d);
  // fclose(state->frame->c_d);
  // fclose(state->frame->k_d);
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -703,9 +703,17 @@ static void encoder_state_worker_encode_lcu(void * opaque)
    }
  }

+  pthread_mutex_lock(&state->frame->rc_lock);
  const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits;
  state->frame->cur_frame_bits_coded += bits;
+  // remaining_weight is used differently by intra and inter frames, so for
+  // intra frames (is_irap) nothing is subtracted from it here
+  state->frame->remaining_weight -= !state->frame->is_irap ?
+    kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->weight :
+    0;
+  pthread_mutex_unlock(&state->frame->rc_lock);
  kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->bits = bits;
+
  uint8_t not_skip = false;
  for(int y = 0; y < 64 && !not_skip; y+=8) {
    for(int x = 0; x < 64 && !not_skip; x+=8) {
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@ -151,6 +151,8 @@ typedef struct encoder_state_config_frame_t {
   */
  lcu_stats_t *lcu_stats;

+  pthread_mutex_t rc_lock;
+
  struct
  {
    double *c_para[KVZ_MAX_GOP_LAYERS];
@ -175,7 +177,7 @@ typedef struct encoder_state_config_frame_t {
   */
  bool first_nal;
  double icost;
-  double remaining_icost;
+  double remaining_weight;
  double i_bits_left;
 } encoder_state_config_frame_t;

--- a/src/rate_control.c
+++ b/src/rate_control.c
@ -382,7 +382,7 @@ void kvz_estimate_pic_lambda(encoder_state_t * const state) {
      }
    }
    state->frame->icost = total_cost;
-    state->frame->remaining_icost = total_cost;
+    state->frame->remaining_weight = total_cost;
  }

  const int layer = encoder->cfg.gop[state->frame->gop_offset].layer - (state->frame->is_irap ? 1 : 0);
@ -451,6 +451,7 @@ void kvz_estimate_pic_lambda(encoder_state_t * const state) {
      while (fabs(taylor_e3 - bits) > 0.01 && iteration_number <= 11);
    }
    total_weight = calculate_weights(state, layer, ctu_count, best_lambda);
+    state->frame->remaining_weight = bits;
  }
  else {
    for (int i = 0; i < ctu_count; ++i) {
@ -480,15 +481,17 @@ static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) {
  const int index = pos.x + pos.y * state->tile->frame->width_in_lcu;

  if (state->frame->is_irap) {
-    // TODO: intra
    int cus_left = num_ctu - index + 1;
    int window = MIN(4, cus_left);
    double mad = kvz_get_lcu_stats(state, pos.x, pos.y)->i_cost;
+
+    pthread_mutex_lock(&state->frame->rc_lock);
    double bits_left = state->frame->cur_pic_target_bits - state->frame->cur_frame_bits_coded;
    double weighted_bits_left = (bits_left * window + (bits_left - state->frame->i_bits_left)*cus_left) / window;
-    avg_bits = mad * weighted_bits_left / state->frame->remaining_icost;
-    state->frame->remaining_icost -= mad;
+    avg_bits = mad * weighted_bits_left / state->frame->remaining_weight;
+    state->frame->remaining_weight -= mad;
    state->frame->i_bits_left -= state->frame->cur_pic_target_bits * mad / state->frame->icost;
+    pthread_mutex_unlock(&state->frame->rc_lock);
  }
  else {
    double total_weight = 0;
@ -500,16 +503,15 @@ static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) {
    double taylor_e3 = 0.0;
    int iter = 0;

-    for (int i = index; i < num_ctu; i++) {
-      total_weight += state->frame->lcu_stats[i].weight;
-    }
-
    int last_ctu = index + used_ctu_count;
    for (int i = index; i < last_ctu; i++) {
      target_bits += state->frame->lcu_stats[i].weight;
    }

-    target_bits = MAX(target_bits + state->frame->cur_pic_target_bits - state->frame->cur_frame_bits_coded - (int)total_weight, 10); //obtain the total bit-rate for the realInfluenceLCU (=4) CTUs
+    pthread_mutex_lock(&state->frame->rc_lock);
+    total_weight = state->frame->remaining_weight;
+    target_bits = MAX(target_bits + state->frame->cur_pic_target_bits - state->frame->cur_frame_bits_coded - (int)total_weight, 10);
+    pthread_mutex_unlock(&state->frame->rc_lock);

    //just similar with the process at frame level, details can refer to the function TEncRCPic::kvz_estimate_pic_lambda
    do {
@ -582,12 +584,14 @@ void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
    est_qp = lambda_to_qp(est_lambda);
  }
  else {
+    // With WPP, CTUs in the rows above may not be finished yet, so limit the backwards search to the current row
+    const int ctu_limit = encoder->cfg.wpp ? pos.y * encoder->in.width_in_lcu : 0;
    
    est_lambda = alpha * pow(bpp, beta);
    const double clip_lambda = state->frame->lambda;

    double clip_neighbor_lambda = -1;
-    for(int temp_index = index - 1; temp_index >= 0; --temp_index) {
+    for(int temp_index = index - 1; temp_index >= ctu_limit; --temp_index) {
      if(state->frame->lcu_stats[temp_index].lambda > 0) {
        clip_neighbor_lambda = state->frame->lcu_stats[temp_index].lambda;
        break;
@ -616,7 +620,7 @@ void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) {
    est_qp = lambda_to_qp(est_lambda);

    int clip_qp = -1;
-    for (int temp_index = index - 1; temp_index >= 0; --temp_index) {
+    for (int temp_index = index - 1; temp_index >= ctu_limit; --temp_index) {
      if (state->frame->lcu_stats[temp_index].qp > -1) {
        clip_qp = state->frame->lcu_stats[temp_index].qp;
        break;