Merge branch 'threadqueue-rewrite'

Commit 4e2e0a76cc (mirror of https://github.com/ultravideo/uvg266.git)
@@ -286,11 +286,8 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
     }
   }
 
-  encoder->threadqueue = MALLOC(threadqueue_queue_t, 1);
-  if (!encoder->threadqueue ||
-      !kvz_threadqueue_init(encoder->threadqueue,
-                            encoder->cfg.threads,
-                            encoder->cfg.owf > 0)) {
+  encoder->threadqueue = kvz_threadqueue_init(encoder->cfg.threads);
+  if (!encoder->threadqueue) {
     fprintf(stderr, "Could not initialize threadqueue.\n");
     goto init_failed;
   }
@@ -653,10 +650,8 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
 
   kvz_scalinglist_destroy(&encoder->scaling_list);
 
-  if (encoder->threadqueue) {
-    kvz_threadqueue_finalize(encoder->threadqueue);
-  }
-  FREE_POINTER(encoder->threadqueue);
+  kvz_threadqueue_free(encoder->threadqueue);
+  encoder->threadqueue = NULL;
 
   free(encoder);
 }
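Read together, the two hunks above change the queue's lifecycle: kvz_threadqueue_init() now allocates and returns the queue itself, and a single kvz_threadqueue_free() replaces the old finalize + FREE_POINTER pair. A minimal sketch of that lifecycle follows; the wrapper function and its name are invented for illustration, and the header is assumed to be included as "threadqueue.h".

#include <stdio.h>
#include "threadqueue.h"

// Hypothetical wrapper illustrating the new lifecycle; not code from the commit.
static int example_queue_lifecycle(int thread_count)
{
  // The init call now allocates the queue and returns it.
  threadqueue_queue_t *tq = kvz_threadqueue_init(thread_count);
  if (!tq) {
    fprintf(stderr, "Could not initialize threadqueue.\n");
    return 0;
  }

  // ... create, submit and wait for jobs here ...

  // One call tears the queue down where the old code called finalize and then freed the struct.
  kvz_threadqueue_free(tq);
  tq = NULL;
  return 1;
}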
@@ -1018,19 +1018,13 @@ static void encoder_state_write_bitstream_main(encoder_state_t * const state)
     kvz_bitstream_add_rbsp_trailing_bits(stream);
   }
 
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-    encoder_state_write_bitstream_children(state);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", state->frame->num, state->type);
-  }
+  encoder_state_write_bitstream_children(state);
 
   if (state->encoder_control->cfg.hash != KVZ_HASH_NONE) {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
     // Calculate checksum
     add_checksum(state);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", state->frame->num, state->type);
   }
 
   //Get bitstream length for stats
   uint64_t newpos = kvz_bitstream_tell(stream);
   state->stats_bitstream_length = (newpos >> 3) - (curpos >> 3);
@@ -725,16 +725,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
     // frame is encoded. Deblocking and SAO search is done during LCU encoding.
 
     for (int i = 0; i < state->lcu_order_count; ++i) {
-      PERFORMANCE_MEASURE_START(KVZ_PERF_LCU);
-
       encoder_state_worker_encode_lcu(&state->lcu_order[i]);
-
-#ifdef KVZ_DEBUG
-      {
-        const lcu_order_element_t * const lcu = &state->lcu_order[i];
-        PERFORMANCE_MEASURE_END(KVZ_PERF_LCU, ctrl->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
-      }
-#endif //KVZ_DEBUG
     }
   } else {
     // Add each LCU in the wavefront row as it's own job to the queue.
@@ -769,14 +760,8 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
     for (int i = 0; i < state->lcu_order_count; ++i) {
       const lcu_order_element_t * const lcu = &state->lcu_order[i];
 
-#ifdef KVZ_DEBUG
-      char job_description[256];
-      sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
-#else
-      char* job_description = NULL;
-#endif
       kvz_threadqueue_free_job(&state->tile->wf_jobs[lcu->id]);
-      state->tile->wf_jobs[lcu->id] = kvz_threadqueue_submit(ctrl->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description);
+      state->tile->wf_jobs[lcu->id] = kvz_threadqueue_job_create(encoder_state_worker_encode_lcu, (void*)lcu);
       threadqueue_job_t **job = &state->tile->wf_jobs[lcu->id];
 
       // If job object was returned, add dependancies and allow it to run.
@@ -814,7 +799,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
         }
       }
 
-      kvz_threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
+      kvz_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
 
       // The wavefront row is done when the last LCU in the row is done.
       if (i + 1 == state->lcu_order_count) {
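The two hunks above show the new submission model for wavefront LCU jobs: the old API queued a job immediately (kvz_threadqueue_submit with a wait flag) and later released it with kvz_threadqueue_job_unwait_job, while the rewrite splits this into create, attach dependencies, then submit. A small sketch of that ordering, using the declarations from the new threadqueue header; the worker functions and wrapper are placeholders, not encoder code.

#include "threadqueue.h"

// Placeholder workers standing in for e.g. per-LCU encode functions.
static void encode_left_lcu(void *arg)  { (void)arg; /* ... */ }
static void encode_right_lcu(void *arg) { (void)arg; /* ... */ }

static void submit_pair_with_dependency(threadqueue_queue_t *tq, void *left, void *right)
{
  // Jobs are created first, outside the queue...
  threadqueue_job_t *left_job  = kvz_threadqueue_job_create(encode_left_lcu, left);
  threadqueue_job_t *right_job = kvz_threadqueue_job_create(encode_right_lcu, right);

  // ...dependencies are attached while nothing can run yet...
  kvz_threadqueue_job_dep_add(right_job, left_job);

  // ...and submission is what finally allows execution.
  kvz_threadqueue_submit(tq, left_job);
  kvz_threadqueue_submit(tq, right_job);
}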
@@ -911,27 +896,13 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
     for (int i = 0; main_state->children[i].encoder_control; ++i) {
       //If we don't have wavefronts, parallelize encoding of children.
       if (main_state->children[i].type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-#ifdef KVZ_DEBUG
-        char job_description[256];
-        switch (main_state->children[i].type) {
-          case ENCODER_STATE_TYPE_TILE:
-            sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].frame->num, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y,
-                    main_state->children[i].lcu_order[0].position_px.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.x + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH - 1,
-                    main_state->children[i].lcu_order[0].position_px.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.y + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH - 1);
-            break;
-          case ENCODER_STATE_TYPE_SLICE:
-            sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].frame->num, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
-            break;
-          default:
-            sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].frame->num);
-            break;
-        }
-#else
-        char* job_description = NULL;
-#endif
         kvz_threadqueue_free_job(&main_state->children[i].tqj_recon_done);
-        main_state->children[i].tqj_recon_done = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->children[i]), 1, job_description);
-        if (main_state->children[i].previous_encoder_state != &main_state->children[i] && main_state->children[i].previous_encoder_state->tqj_recon_done && !main_state->children[i].frame->is_idr_frame) {
+        main_state->children[i].tqj_recon_done =
+          kvz_threadqueue_job_create(encoder_state_worker_encode_children, &main_state->children[i]);
+        if (main_state->children[i].previous_encoder_state != &main_state->children[i] &&
+            main_state->children[i].previous_encoder_state->tqj_recon_done &&
+            !main_state->children[i].frame->is_idr_frame)
+        {
 #if 0
           // Disabled due to non-determinism.
           if (main_state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN)
@@ -947,7 +918,7 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
           }
         }
       }
-      kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done);
+      kvz_threadqueue_submit(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done);
     } else {
       //Wavefront rows have parallelism at LCU level, so we should not launch multiple threads here!
       //FIXME: add an assert: we can only have wavefront children
@@ -1256,39 +1227,22 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s
 
 void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame)
 {
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-    encoder_state_init_new_frame(state, frame);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=init_new_frame,frame=%d,poc=%d", state->frame->num, state->frame->poc);
-  }
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-    encoder_state_encode(state);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=encode,frame=%d", state->frame->num);
-  }
-  //kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
-  {
-    threadqueue_job_t *job;
-#ifdef KVZ_DEBUG
-    char job_description[256];
-    sprintf(job_description, "type=write_bitstream,frame=%d", state->frame->num);
-#else
-    char* job_description = NULL;
-#endif
+  encoder_state_init_new_frame(state, frame);
+  encoder_state_encode(state);
 
-    job = kvz_threadqueue_submit(state->encoder_control->threadqueue, kvz_encoder_state_worker_write_bitstream, (void*) state, 1, job_description);
-    _encode_one_frame_add_bitstream_deps(state, job);
-    if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_bitstream_written) {
-      //We need to depend on previous bitstream generation
-      kvz_threadqueue_job_dep_add(job, state->previous_encoder_state->tqj_bitstream_written);
-    }
-    kvz_threadqueue_job_unwait_job(state->encoder_control->threadqueue, job);
-    assert(!state->tqj_bitstream_written);
-    state->tqj_bitstream_written = job;
-  }
+  threadqueue_job_t *job =
+    kvz_threadqueue_job_create(kvz_encoder_state_worker_write_bitstream, state);
+
+  _encode_one_frame_add_bitstream_deps(state, job);
+  if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_bitstream_written) {
+    //We need to depend on previous bitstream generation
+    kvz_threadqueue_job_dep_add(job, state->previous_encoder_state->tqj_bitstream_written);
+  }
+  kvz_threadqueue_submit(state->encoder_control->threadqueue, job);
+  assert(!state->tqj_bitstream_written);
+  state->tqj_bitstream_written = job;
 
   state->frame->done = 0;
-  //kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
 }
 
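The handles stored in wf_jobs and the tqj_* fields follow a reference-counting pattern: the old job struct carried a refcount field, the new header keeps kvz_threadqueue_copy_ref, and the hunks above always call kvz_threadqueue_free_job on a slot before storing a fresh job in it. A sketch of that ownership pattern, with an invented worker and helper; that free_job also clears the pointer is inferred from its pointer-to-pointer signature, not stated in the commit.

#include "threadqueue.h"

static void write_bitstream_worker(void *arg) { (void)arg; /* ... */ }

// Hypothetical helper: replace whatever job a long-lived slot points at.
static void store_new_job(threadqueue_queue_t *tq, threadqueue_job_t **slot, void *arg)
{
  // Release the slot's old reference before overwriting it.
  kvz_threadqueue_free_job(slot);

  threadqueue_job_t *job = kvz_threadqueue_job_create(write_bitstream_worker, arg);

  // A second owner can keep its own reference past submission,
  // for example to use the job as a dependency of a later job.
  threadqueue_job_t *my_ref = kvz_threadqueue_copy_ref(job);

  kvz_threadqueue_submit(tq, job);
  *slot = job;

  // ... use my_ref here ...
  kvz_threadqueue_free_job(&my_ref);
}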
src/search.c (11 changed lines)
@@ -399,10 +399,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
 
   int x_local = SUB_SCU(x);
   int y_local = SUB_SCU(y);
-#ifdef KVZ_DEBUG
-  int debug_split = 0;
-#endif
-  PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHCU);
 
   // Stop recursion if the CU is completely outside the frame.
   if (x >= frame->width || y >= frame->height) {
@@ -713,13 +709,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     work_tree_copy_down(x_local, y_local, depth, work_tree);
   }
 
-  PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHCU, state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", state->frame->num, state->tile->id, state->slice->id,
-                          state->tile->offset_x + x,
-                          state->tile->offset_x + x + cu_width,
-                          state->tile->offset_y + y,
-                          state->tile->offset_y + y + cu_width,
-                          depth, debug_split, (cur_cu->type == CU_INTRA) ? 1 : 0);
-
   assert(cur_cu->type != CU_NOTSET);
 
   return cost;
File diff suppressed because it is too large.
@@ -30,151 +30,22 @@
 
 #include "global.h" // IWYU pragma: keep
 
-typedef enum {
-  THREADQUEUE_JOB_STATE_QUEUED = 0,
-  THREADQUEUE_JOB_STATE_RUNNING = 1,
-  THREADQUEUE_JOB_STATE_DONE = 2
-} threadqueue_job_state;
-
-typedef struct threadqueue_job_t {
-  pthread_mutex_t lock;
-
-  threadqueue_job_state state;
-
-  unsigned int ndepends; //Number of active dependencies that this job wait for
-
-  struct threadqueue_job_t **rdepends; //array of pointer to jobs that depend on this one. They have to exist when the thread finishes, because they cannot be run before.
-  unsigned int rdepends_count; //number of rdepends
-  unsigned int rdepends_size; //allocated size of rdepends
-
-  // Reference count
-  int refcount;
-
-  //Job function and state to use
-  void (*fptr)(void *arg);
-  void *arg;
-
-#ifdef KVZ_DEBUG
-  const char* debug_description;
-
-  int debug_worker_id;
-
-  KVZ_CLOCK_T debug_clock_enqueue;
-  KVZ_CLOCK_T debug_clock_start;
-  KVZ_CLOCK_T debug_clock_stop;
-  KVZ_CLOCK_T debug_clock_dequeue;
-#endif
-} threadqueue_job_t;
-
-
-
-typedef struct {
-  pthread_mutex_t lock;
-  pthread_cond_t cond;
-  pthread_cond_t cb_cond;
-
-  pthread_t *threads;
-  int threads_count;
-  int threads_running;
-
-  bool stop; // if true, threads should stop asap
-
-  int fifo;
-
-  threadqueue_job_t **queue;
-  unsigned int queue_start;
-  unsigned int queue_count;
-  unsigned int queue_size;
-  unsigned int queue_waiting_execution; //Number of jobs without any dependency which could be run
-  unsigned int queue_waiting_dependency; //Number of jobs waiting for a dependency to complete
-  unsigned int queue_running; //Number of jobs running
-
-#ifdef KVZ_DEBUG
-  //Format: pointer <tab> worker id <tab> time enqueued <tab> time started <tab> time stopped <tab> time dequeued <tab> job description
-  //For threads, pointer = "" and job description == "thread", time enqueued and time dequeued are equal to "-"
-  //For flush, pointer = "" and job description == "FLUSH", time enqueued, time dequeued and time started are equal to "-"
-  //Each time field, except the first one in the line be expressed in a relative way, by prepending the number of seconds by +.
-  //Dependencies: pointer -> pointer
-
-  FILE *debug_log;
-
-  KVZ_CLOCK_T *debug_clock_thread_start;
-  KVZ_CLOCK_T *debug_clock_thread_end;
-#endif
-} threadqueue_queue_t;
-
-//Init a threadqueue (if fifo, then behave as a FIFO with dependencies, otherwise as a LIFO with dependencies)
-int kvz_threadqueue_init(threadqueue_queue_t * threadqueue, int thread_count, int fifo);
-
-//Add a job to the queue, and returs a threadqueue_job handle. If wait == 1, one has to run kvz_threadqueue_job_unwait_job in order to have it run
-threadqueue_job_t * kvz_threadqueue_submit(threadqueue_queue_t * threadqueue, void (*fptr)(void *arg), void *arg, int wait, const char* debug_description);
-
-void kvz_threadqueue_free_job(threadqueue_job_t **job_ptr);
+typedef struct threadqueue_job_t threadqueue_job_t;
+typedef struct threadqueue_queue_t threadqueue_queue_t;
+
+threadqueue_queue_t * kvz_threadqueue_init(int thread_count);
+
+threadqueue_job_t * kvz_threadqueue_job_create(void (*fptr)(void *arg), void *arg);
+int kvz_threadqueue_submit(threadqueue_queue_t * threadqueue, threadqueue_job_t *job);
+int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *dependency);
 
 threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job);
 
-int kvz_threadqueue_job_unwait_job(threadqueue_queue_t * threadqueue, threadqueue_job_t *job);
-
-//Add a dependency between two jobs.
-int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *depends_on);
-
-/**
- * \brief Stop all threads after they finish the current jobs.
- *
- * Blocks until all threads have stopped.
- */
-int kvz_threadqueue_stop(threadqueue_queue_t * const threadqueue);
-
-//Blocking call until job is executed. Job handles submitted before job should not be used any more as they are removed from the queue.
+void kvz_threadqueue_free_job(threadqueue_job_t **job_ptr);
+
 int kvz_threadqueue_waitfor(threadqueue_queue_t * threadqueue, threadqueue_job_t * job);
+int kvz_threadqueue_stop(threadqueue_queue_t * threadqueue);
+void kvz_threadqueue_free(threadqueue_queue_t * threadqueue);
 
-//Free ressources in a threadqueue
-int kvz_threadqueue_finalize(threadqueue_queue_t * threadqueue);
-
-#ifdef KVZ_DEBUG
-int threadqueue_log(threadqueue_queue_t * threadqueue, const KVZ_CLOCK_T *start, const KVZ_CLOCK_T *stop, const char* debug_description);
-
-// Bitmasks for PERFORMANCE_MEASURE_START and PERFORMANCE_MEASURE_END.
-#define KVZ_PERF_FRAME (1 << 0)
-#define KVZ_PERF_JOB (1 << 1)
-#define KVZ_PERF_LCU (1 << 2)
-#define KVZ_PERF_SAOREC (1 << 3)
-#define KVZ_PERF_BSLEAF (1 << 4)
-#define KVZ_PERF_SEARCHCU (1 << 5)
-
-#define IMPL_PERFORMANCE_MEASURE_START(mask) KVZ_CLOCK_T start, stop; if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&start); }
-#define IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) { if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}} } \
-
-#ifdef _MSC_VER
-// Disable VS conditional expression warning from debug code.
-# define WITHOUT_CONSTANT_EXP_WARNING(macro) \
-    __pragma(warning(push)) \
-    __pragma(warning(disable:4127)) \
-    macro \
-    __pragma(warning(pop))
-# define PERFORMANCE_MEASURE_START(mask) \
-    WITHOUT_CONSTANT_EXP_WARNING(IMPL_PERFORMANCE_MEASURE_START(mask))
-# define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) \
-    WITHOUT_CONSTANT_EXP_WARNING(IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ##__VA_ARGS__))
-#else
-# define PERFORMANCE_MEASURE_START(mask) \
-    IMPL_PERFORMANCE_MEASURE_START(mask)
-# define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) \
-    IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ##__VA_ARGS__)
-#endif
-
-#else
-#define PERFORMANCE_MEASURE_START(mask)
-#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...)
-#endif
-
-/* Constraints:
- *
- * - Always first lock threadqueue, than a job inside it
- * - When job A depends on job B, always lock first job B and then job A
- * - Jobs should be submitted in an order which is compatible with serial execution.
- *
- * */
-
-#endif //THREADQUEUE_H_
+#endif // THREADQUEUE_H_
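Taken as a whole, the rewritten header exposes the queue and jobs only as opaque types behind a small set of functions. A self-contained sketch of how the pieces fit together follows, using only declarations from the new header; the worker function, the payloads, and the exact teardown order (stop before free) are illustrative assumptions rather than code from the commit.

#include <stdio.h>
#include "threadqueue.h"

static void print_worker(void *arg)
{
  printf("%s\n", (const char *)arg);
}

int main(void)
{
  threadqueue_queue_t *tq = kvz_threadqueue_init(4);
  if (!tq) return 1;

  threadqueue_job_t *first  = kvz_threadqueue_job_create(print_worker, "first");
  threadqueue_job_t *second = kvz_threadqueue_job_create(print_worker, "second");

  // "second" may not start before "first" has finished.
  kvz_threadqueue_job_dep_add(second, first);

  kvz_threadqueue_submit(tq, first);
  kvz_threadqueue_submit(tq, second);

  // Block until the dependent job has executed.
  kvz_threadqueue_waitfor(tq, second);

  // Release our handles to the jobs.
  kvz_threadqueue_free_job(&first);
  kvz_threadqueue_free_job(&second);

  // Stop the worker threads, then release the queue.
  kvz_threadqueue_stop(tq);
  kvz_threadqueue_free(tq);
  return 0;
}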
@@ -52,7 +52,7 @@ SUITE(coeff_sum_tests)
 {
   setup();
 
-  for (int i = 0; i < strategies.count; ++i) {
+  for (volatile int i = 0; i < strategies.count; ++i) {
     if (strcmp(strategies.strategies[i].type, "coeff_abs_sum") != 0) {
       continue;
     }
@@ -177,7 +177,7 @@ SUITE(intra_sad_tests)
 
   // Loop through all strategies picking out the intra sad ones and run
   // selectec strategies though all tests.
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const char * type = strategies.strategies[i].type;
 
     if (strcmp(type, "sad_4x4") == 0) {
@@ -378,7 +378,7 @@ SUITE(sad_tests)
       sad_test_env.tested_func = strategies.strategies[i].fptr;
       sad_test_env.strategy = &strategies.strategies[i];
       int num_dim_tests = sizeof(tested_dims) / sizeof(tested_dims[0]);
-      for (int dim_test = 0; dim_test < num_dim_tests; ++dim_test) {
+      for (volatile int dim_test = 0; dim_test < num_dim_tests; ++dim_test) {
        sad_test_env.width = tested_dims[dim_test].width;
        sad_test_env.height = tested_dims[dim_test].height;
        RUN_TEST(test_reg_sad);
@@ -167,7 +167,7 @@ SUITE(satd_tests)
 
   // Loop through all strategies picking out the intra sad ones and run
   // selectec strategies though all tests.
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const char * type = strategies.strategies[i].type;
 
     if (strcmp(type, "satd_4x4") == 0) {
@@ -405,7 +405,7 @@ SUITE(speed_tests)
   int num_tested_dims = sizeof(tested_dims) / sizeof(*tested_dims);
 
   // Call reg_sad with all the sizes it is actually called with.
-  for (int dim_i = 0; dim_i < num_tested_dims; ++dim_i) {
+  for (volatile int dim_i = 0; dim_i < num_tested_dims; ++dim_i) {
    test_env.width = tested_dims[dim_i].x;
    test_env.height = tested_dims[dim_i].y;
    RUN_TEST(inter_sad);