From 8c4a3473a85152d692d43eaa5e5f0b9fda73738e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= <arttu.yla-outinen@tut.fi>
Date: Mon, 17 Jul 2017 12:03:02 +0300
Subject: [PATCH] Change --owf=auto and --threads=auto selection

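Change OWF selection so that --owf=auto is chosen based on the maximum
number of CTUs that can be coded in parallel. With --threads=auto, the
number of threads is limited to that maximum parallelism to prevent
overhead from extra threads. The automatic thread count now starts from
the number of logical CPUs instead of the number of physical cores plus
half of the additional logical ones.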
---
 src/encoder.c | 197 ++++++++++++++++++++++++++++----------------------
 1 file changed, 111 insertions(+), 86 deletions(-)

diff --git a/src/encoder.c b/src/encoder.c
index df2fe376..85ff1f51 100644
--- a/src/encoder.c
+++ b/src/encoder.c
@@ -40,88 +40,84 @@ static const double ERP_AQP_STRENGTH = 3.0;
 
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
 
-static int size_of_wpp_ends(int threads)
-{
-  // Based on the shape of the area where all threads can't yet run in parallel.
-  return 4 * threads * threads - 2 * threads;
-}
-
-static int select_owf_auto(const kvz_config *const cfg)
-{
-  if (cfg->intra_period == 1) {
-    if (cfg->wpp) {
-      // If wpp is on, select owf such that less than 15% of the
-      // frame is covered by the are threads can not work at the same time.
-      const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
-      const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
-
-      // Find the largest number of threads per frame that satifies the
-      // the condition: wpp start/stop inefficiency takes up  less than 15%
-      // of frame area.
-      int threads_per_frame = 1;
-      const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
-      while ((threads_per_frame + 1) * 2 < lcu_width &&
-        threads_per_frame + 1 < lcu_height &&
-        size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) {
-        ++threads_per_frame;
-      }
-
-      const int threads = MAX(cfg->threads, 1);
-      const int frames = CEILDIV(threads, threads_per_frame);
-
-      // Convert from number of parallel frames to number of additional frames.
-      return CLIP(0, threads - 1, frames - 1);
-    } else {
-      // If wpp is not on, select owf such that there is enough
-      // tiles for twice the number of threads.
-
-      int tiles_per_frame = cfg->tiles_width_count * cfg->tiles_height_count;
-      int threads = (cfg->threads > 1 ? cfg->threads : 1);
-      int frames = CEILDIV(threads * 4, tiles_per_frame);
-
-      // Limit number of frames to 1.25x the number of threads for the case
-      // where there is only 1 tile per frame.
-      frames = CLIP(1, threads * 4 / 3, frames);
-      return frames - 1;
-    }
-  } else {
-    // Try and estimate a good number of parallel frames for inter.
-    const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
-    const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
-    int threads_per_frame = MIN(lcu_width / 2, lcu_height);
-    int threads = cfg->threads;
-
-    // If all threads fit into one frame, at least two parallel frames should
-    // be used to reduce the effect of WPP spin-up and wind-down.
-    int frames = 1;
-
-    while (threads > 0 && threads_per_frame > 0) {
-      frames += 1;
-      threads -= threads_per_frame;
-      threads_per_frame -= 2;
-    }
-
-    if (cfg->gop_len && cfg->gop_lowdelay && cfg->gop_lp_definition.t > 1) {
-      // Temporal skipping makes every other frame very fast to encode so
-      // more parallel frames should be used.
-      frames *= 2;
-    }
-    return CLIP(0, cfg->threads * 2 - 1, frames - 1);
-  }
-}
-
-
 static unsigned cfg_num_threads(void)
 {
-  unsigned cpus = kvz_g_hardware_flags.physical_cpu_count;
-  unsigned fake_cpus = kvz_g_hardware_flags.logical_cpu_count - cpus;
+  if (kvz_g_hardware_flags.logical_cpu_count == 0) {
+    // A logical CPU count of zero is treated as "unknown": default to 4.
+    return 4;
+  }
 
-  // Default to 4 if we don't know the number of CPUs.
-  if (cpus == 0) return 4;
+  return kvz_g_hardware_flags.logical_cpu_count;  // otherwise use the count as is
+}
 
-  // 1.5 times the number of physical cores seems to be a good compromise
-  // when hyperthreading is available on Haswell.
-  return cpus + fake_cpus / 2;
+// Estimate how many CTUs can be coded in parallel with the current cfg (including cfg.owf).
+static int get_max_parallelism(const encoder_control_t *const encoder)
+{
+  const int width_lcu  = CEILDIV(encoder->cfg.width, LCU_WIDTH);   // frame width in LCUs, rounded up
+  const int height_lcu = CEILDIV(encoder->cfg.height, LCU_WIDTH);  // frame height in LCUs, rounded up
+  const int wpp_limit  = MIN(height_lcu, CEILDIV(width_lcu, 2));   // bounded by rows and by half the columns
+  const int par_frames = encoder->cfg.owf + 1;  // owf extra frames plus the current one
+
+  int parallelism = 0;  // the estimate that is returned
+
+  if (encoder->cfg.intra_period == 1) {  // presumably all-intra coding — TODO confirm
+    int threads_per_frame;
+    if (encoder->cfg.wpp) {
+      // Usually limited by width because starting to code a CTU requires
+      // that the next two CTUs in the row above have been completed.
+      threads_per_frame = wpp_limit;
+    } else {
+      // One thread for each tile.
+      threads_per_frame = encoder->cfg.tiles_width_count *
+                          encoder->cfg.tiles_height_count;
+    }
+    // Divide by two since all frames cannot achieve the maximum
+    // parallelism all the time.
+    parallelism = par_frames * threads_per_frame / 2;  // integer division: may round down to 0
+
+  } else {
+    if (encoder->cfg.wpp) {
+      const int last_diagonal = (width_lcu - 1) + (height_lcu - 1) * 2;
+
+      // Index of a diagonal. A diagonal contains the CTUs whose coordinates
+      // satisfy x + 2*y == diagonal; last_diagonal is the index of the last
+      // CTU. The sum starts from the middle diagonal, taken to be the longest.
+      int diagonal = CEILDIV(last_diagonal, 2);
+
+      // Difference between diagonal indices in consecutive frames.
+      const int frame_delay = 1 + encoder->max_inter_ref_lcu.right +
+                              2 * encoder->max_inter_ref_lcu.down;  // same x + 2*y units as diagonal
+      int step = frame_delay;  // grows by frame_delay on every iteration
+      int direction = -1;      // sign flips on every iteration
+
+      // Add one diagonal per parallel frame, visiting d0, d0-f, d0+f, d0-2f, ... (f = frame_delay).
+      for (int num_frames = 0; num_frames < par_frames; num_frames++) {
+        if (diagonal < 0 || diagonal > last_diagonal) {
+          // Diagonal index is outside the frame: no room for more threads.
+          break;
+        }
+
+        // Count number of CTUs on the diagonal.
+        if (diagonal < MIN(2 * height_lcu, width_lcu)) {  // low indices: one more CTU every two steps
+          parallelism += 1 + diagonal / 2;
+        } else {
+          parallelism += MIN(
+            wpp_limit,
+            height_lcu + CEILDIV(width_lcu, 2) - 1 - CEILDIV(diagonal, 2)
+          );  // NOTE(review): looks one short when width_lcu is even and diagonal is odd — confirm
+        }
+        diagonal += direction * step;
+        step += frame_delay;
+        direction = -direction;
+      }
+
+    } else {
+      parallelism = encoder->cfg.tiles_width_count *
+                    encoder->cfg.tiles_height_count;  // tile count only; par_frames is not used here
+    }
+  }
+
+  return parallelism;
 }
 
 
@@ -235,21 +231,53 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
   encoder->cfg.tiles_height_split = NULL;
   encoder->cfg.slice_addresses_in_ts = NULL;
 
-  if (encoder->cfg.threads == -1) {
-    encoder->cfg.threads = cfg_num_threads();
-  }
-
   if (encoder->cfg.gop_len > 0) {
     if (encoder->cfg.gop_lowdelay) {
       kvz_config_process_lp_gop(&encoder->cfg);
     }
   }
 
+  encoder->max_inter_ref_lcu.right = 1;
+  encoder->max_inter_ref_lcu.down  = 1;
+
+  int max_threads = encoder->cfg.threads;
+  if (max_threads < 0) {
+    max_threads = cfg_num_threads();
+  }
+  max_threads = MAX(1, max_threads);
+
   // Need to set owf before initializing threadqueue.
   if (encoder->cfg.owf < 0) {
-    encoder->cfg.owf = select_owf_auto(&encoder->cfg);
+    int best_parallelism = 0;
+
+    for (encoder->cfg.owf = 0; true; encoder->cfg.owf++) {
+      int parallelism = get_max_parallelism(encoder);
+
+      if (parallelism <= best_parallelism) {
+        // No improvement over previous OWF.
+        encoder->cfg.owf--;
+        break;
+      }
+
+      best_parallelism = parallelism;
+      if (parallelism >= max_threads) {
+        // Cannot have more parallelism than there are threads.
+        break;
+      }
+    }
+
+    // Add two frames so that we have frames ready to be coded when one is
+    // completed.
+    encoder->cfg.owf += 2;
+
     fprintf(stderr, "--owf=auto value set to %d.\n", encoder->cfg.owf);
   }
+
+  if (encoder->cfg.threads < 0) {
+    encoder->cfg.threads = MIN(max_threads, get_max_parallelism(encoder));
+    fprintf(stderr, "--threads=auto value set to %d.\n", encoder->cfg.threads);
+  }
+
   if (encoder->cfg.source_scan_type != KVZ_INTERLACING_NONE) {
     // If using interlaced coding with OWF, the OWF has to be an even number
     // to ensure that the pair of fields will be output for the same picture.
@@ -574,9 +602,6 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
     encoder->cfg.trskip_enable   = false;
   }
 
-  encoder->max_inter_ref_lcu.right = 1;
-  encoder->max_inter_ref_lcu.down  = 1;
-
   // If fractional framerate is set, use that instead of the floating point framerate.
   if (encoder->cfg.framerate_num != 0) {
     encoder->vui.timing_info_present_flag = 1;