Merge branch 'threadqueue-rewrite'

This commit is contained in:
Arttu Ylä-Outinen 2017-08-11 14:23:31 +03:00
commit 4e2e0a76cc
11 changed files with 509 additions and 744 deletions

View file

@ -286,11 +286,8 @@ encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg)
}
}
encoder->threadqueue = MALLOC(threadqueue_queue_t, 1);
if (!encoder->threadqueue ||
!kvz_threadqueue_init(encoder->threadqueue,
encoder->cfg.threads,
encoder->cfg.owf > 0)) {
encoder->threadqueue = kvz_threadqueue_init(encoder->cfg.threads);
if (!encoder->threadqueue) {
fprintf(stderr, "Could not initialize threadqueue.\n");
goto init_failed;
}
@ -653,10 +650,8 @@ void kvz_encoder_control_free(encoder_control_t *const encoder)
kvz_scalinglist_destroy(&encoder->scaling_list);
if (encoder->threadqueue) {
kvz_threadqueue_finalize(encoder->threadqueue);
}
FREE_POINTER(encoder->threadqueue);
kvz_threadqueue_free(encoder->threadqueue);
encoder->threadqueue = NULL;
free(encoder);
}

View file

@ -1018,17 +1018,11 @@ static void encoder_state_write_bitstream_main(encoder_state_t * const state)
kvz_bitstream_add_rbsp_trailing_bits(stream);
}
{
PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
encoder_state_write_bitstream_children(state);
PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", state->frame->num, state->type);
}
if (state->encoder_control->cfg.hash != KVZ_HASH_NONE) {
PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
// Calculate checksum
add_checksum(state);
PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", state->frame->num, state->type);
}
//Get bitstream length for stats

View file

@ -725,16 +725,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
// frame is encoded. Deblocking and SAO search is done during LCU encoding.
for (int i = 0; i < state->lcu_order_count; ++i) {
PERFORMANCE_MEASURE_START(KVZ_PERF_LCU);
encoder_state_worker_encode_lcu(&state->lcu_order[i]);
#ifdef KVZ_DEBUG
{
const lcu_order_element_t * const lcu = &state->lcu_order[i];
PERFORMANCE_MEASURE_END(KVZ_PERF_LCU, ctrl->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
}
#endif //KVZ_DEBUG
}
} else {
// Add each LCU in the wavefront row as it's own job to the queue.
@ -769,14 +760,8 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
for (int i = 0; i < state->lcu_order_count; ++i) {
const lcu_order_element_t * const lcu = &state->lcu_order[i];
#ifdef KVZ_DEBUG
char job_description[256];
sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
#else
char* job_description = NULL;
#endif
kvz_threadqueue_free_job(&state->tile->wf_jobs[lcu->id]);
state->tile->wf_jobs[lcu->id] = kvz_threadqueue_submit(ctrl->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description);
state->tile->wf_jobs[lcu->id] = kvz_threadqueue_job_create(encoder_state_worker_encode_lcu, (void*)lcu);
threadqueue_job_t **job = &state->tile->wf_jobs[lcu->id];
// If job object was returned, add dependancies and allow it to run.
@ -814,7 +799,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
}
}
kvz_threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
kvz_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]);
// The wavefront row is done when the last LCU in the row is done.
if (i + 1 == state->lcu_order_count) {
@ -911,27 +896,13 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
for (int i = 0; main_state->children[i].encoder_control; ++i) {
//If we don't have wavefronts, parallelize encoding of children.
if (main_state->children[i].type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
#ifdef KVZ_DEBUG
char job_description[256];
switch (main_state->children[i].type) {
case ENCODER_STATE_TYPE_TILE:
sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].frame->num, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y,
main_state->children[i].lcu_order[0].position_px.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.x + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH - 1,
main_state->children[i].lcu_order[0].position_px.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.y + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH - 1);
break;
case ENCODER_STATE_TYPE_SLICE:
sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].frame->num, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts);
break;
default:
sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].frame->num);
break;
}
#else
char* job_description = NULL;
#endif
kvz_threadqueue_free_job(&main_state->children[i].tqj_recon_done);
main_state->children[i].tqj_recon_done = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->children[i]), 1, job_description);
if (main_state->children[i].previous_encoder_state != &main_state->children[i] && main_state->children[i].previous_encoder_state->tqj_recon_done && !main_state->children[i].frame->is_idr_frame) {
main_state->children[i].tqj_recon_done =
kvz_threadqueue_job_create(encoder_state_worker_encode_children, &main_state->children[i]);
if (main_state->children[i].previous_encoder_state != &main_state->children[i] &&
main_state->children[i].previous_encoder_state->tqj_recon_done &&
!main_state->children[i].frame->is_idr_frame)
{
#if 0
// Disabled due to non-determinism.
if (main_state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN)
@ -947,7 +918,7 @@ static void encoder_state_encode(encoder_state_t * const main_state) {
}
}
}
kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done);
kvz_threadqueue_submit(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done);
} else {
//Wavefront rows have parallelism at LCU level, so we should not launch multiple threads here!
//FIXME: add an assert: we can only have wavefront children
@ -1256,39 +1227,22 @@ static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const s
void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame)
{
{
PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
encoder_state_init_new_frame(state, frame);
PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=init_new_frame,frame=%d,poc=%d", state->frame->num, state->frame->poc);
}
{
PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
encoder_state_encode(state);
PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=encode,frame=%d", state->frame->num);
}
//kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
{
threadqueue_job_t *job;
#ifdef KVZ_DEBUG
char job_description[256];
sprintf(job_description, "type=write_bitstream,frame=%d", state->frame->num);
#else
char* job_description = NULL;
#endif
job = kvz_threadqueue_submit(state->encoder_control->threadqueue, kvz_encoder_state_worker_write_bitstream, (void*) state, 1, job_description);
threadqueue_job_t *job =
kvz_threadqueue_job_create(kvz_encoder_state_worker_write_bitstream, state);
_encode_one_frame_add_bitstream_deps(state, job);
if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_bitstream_written) {
//We need to depend on previous bitstream generation
kvz_threadqueue_job_dep_add(job, state->previous_encoder_state->tqj_bitstream_written);
}
kvz_threadqueue_job_unwait_job(state->encoder_control->threadqueue, job);
kvz_threadqueue_submit(state->encoder_control->threadqueue, job);
assert(!state->tqj_bitstream_written);
state->tqj_bitstream_written = job;
}
state->frame->done = 0;
//kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
}

View file

@ -399,10 +399,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
int x_local = SUB_SCU(x);
int y_local = SUB_SCU(y);
#ifdef KVZ_DEBUG
int debug_split = 0;
#endif
PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHCU);
// Stop recursion if the CU is completely outside the frame.
if (x >= frame->width || y >= frame->height) {
@ -713,13 +709,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
work_tree_copy_down(x_local, y_local, depth, work_tree);
}
PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHCU, state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", state->frame->num, state->tile->id, state->slice->id,
state->tile->offset_x + x,
state->tile->offset_x + x + cu_width,
state->tile->offset_y + y,
state->tile->offset_y + y + cu_width,
depth, debug_split, (cur_cu->type == CU_INTRA) ? 1 : 0);
assert(cur_cu->type != CU_NOTSET);
return cost;

File diff suppressed because it is too large Load diff

View file

@ -30,151 +30,22 @@
#include "global.h" // IWYU pragma: keep
typedef enum {
THREADQUEUE_JOB_STATE_QUEUED = 0,
THREADQUEUE_JOB_STATE_RUNNING = 1,
THREADQUEUE_JOB_STATE_DONE = 2
} threadqueue_job_state;
typedef struct threadqueue_job_t threadqueue_job_t;
typedef struct threadqueue_queue_t threadqueue_queue_t;
typedef struct threadqueue_job_t {
pthread_mutex_t lock;
threadqueue_queue_t * kvz_threadqueue_init(int thread_count);
threadqueue_job_state state;
threadqueue_job_t * kvz_threadqueue_job_create(void (*fptr)(void *arg), void *arg);
int kvz_threadqueue_submit(threadqueue_queue_t * threadqueue, threadqueue_job_t *job);
unsigned int ndepends; //Number of active dependencies that this job wait for
struct threadqueue_job_t **rdepends; //array of pointer to jobs that depend on this one. They have to exist when the thread finishes, because they cannot be run before.
unsigned int rdepends_count; //number of rdepends
unsigned int rdepends_size; //allocated size of rdepends
// Reference count
int refcount;
//Job function and state to use
void (*fptr)(void *arg);
void *arg;
#ifdef KVZ_DEBUG
const char* debug_description;
int debug_worker_id;
KVZ_CLOCK_T debug_clock_enqueue;
KVZ_CLOCK_T debug_clock_start;
KVZ_CLOCK_T debug_clock_stop;
KVZ_CLOCK_T debug_clock_dequeue;
#endif
} threadqueue_job_t;
typedef struct {
pthread_mutex_t lock;
pthread_cond_t cond;
pthread_cond_t cb_cond;
pthread_t *threads;
int threads_count;
int threads_running;
bool stop; // if true, threads should stop asap
int fifo;
threadqueue_job_t **queue;
unsigned int queue_start;
unsigned int queue_count;
unsigned int queue_size;
unsigned int queue_waiting_execution; //Number of jobs without any dependency which could be run
unsigned int queue_waiting_dependency; //Number of jobs waiting for a dependency to complete
unsigned int queue_running; //Number of jobs running
#ifdef KVZ_DEBUG
//Format: pointer <tab> worker id <tab> time enqueued <tab> time started <tab> time stopped <tab> time dequeued <tab> job description
//For threads, pointer = "" and job description == "thread", time enqueued and time dequeued are equal to "-"
//For flush, pointer = "" and job description == "FLUSH", time enqueued, time dequeued and time started are equal to "-"
//Each time field, except the first one in the line be expressed in a relative way, by prepending the number of seconds by +.
//Dependencies: pointer -> pointer
FILE *debug_log;
KVZ_CLOCK_T *debug_clock_thread_start;
KVZ_CLOCK_T *debug_clock_thread_end;
#endif
} threadqueue_queue_t;
//Init a threadqueue (if fifo, then behave as a FIFO with dependencies, otherwise as a LIFO with dependencies)
int kvz_threadqueue_init(threadqueue_queue_t * threadqueue, int thread_count, int fifo);
//Add a job to the queue, and returs a threadqueue_job handle. If wait == 1, one has to run kvz_threadqueue_job_unwait_job in order to have it run
threadqueue_job_t * kvz_threadqueue_submit(threadqueue_queue_t * threadqueue, void (*fptr)(void *arg), void *arg, int wait, const char* debug_description);
void kvz_threadqueue_free_job(threadqueue_job_t **job_ptr);
int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *dependency);
threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job);
int kvz_threadqueue_job_unwait_job(threadqueue_queue_t * threadqueue, threadqueue_job_t *job);
void kvz_threadqueue_free_job(threadqueue_job_t **job_ptr);
//Add a dependency between two jobs.
int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *depends_on);
/**
* \brief Stop all threads after they finish the current jobs.
*
* Blocks until all threads have stopped.
*/
int kvz_threadqueue_stop(threadqueue_queue_t * const threadqueue);
//Blocking call until job is executed. Job handles submitted before job should not be used any more as they are removed from the queue.
int kvz_threadqueue_waitfor(threadqueue_queue_t * threadqueue, threadqueue_job_t * job);
//Free ressources in a threadqueue
int kvz_threadqueue_finalize(threadqueue_queue_t * threadqueue);
#ifdef KVZ_DEBUG
int threadqueue_log(threadqueue_queue_t * threadqueue, const KVZ_CLOCK_T *start, const KVZ_CLOCK_T *stop, const char* debug_description);
// Bitmasks for PERFORMANCE_MEASURE_START and PERFORMANCE_MEASURE_END.
#define KVZ_PERF_FRAME (1 << 0)
#define KVZ_PERF_JOB (1 << 1)
#define KVZ_PERF_LCU (1 << 2)
#define KVZ_PERF_SAOREC (1 << 3)
#define KVZ_PERF_BSLEAF (1 << 4)
#define KVZ_PERF_SEARCHCU (1 << 5)
#define IMPL_PERFORMANCE_MEASURE_START(mask) KVZ_CLOCK_T start, stop; if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&start); }
#define IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) { if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}} } \
#ifdef _MSC_VER
// Disable VS conditional expression warning from debug code.
# define WITHOUT_CONSTANT_EXP_WARNING(macro) \
__pragma(warning(push)) \
__pragma(warning(disable:4127)) \
macro \
__pragma(warning(pop))
# define PERFORMANCE_MEASURE_START(mask) \
WITHOUT_CONSTANT_EXP_WARNING(IMPL_PERFORMANCE_MEASURE_START(mask))
# define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) \
WITHOUT_CONSTANT_EXP_WARNING(IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ##__VA_ARGS__))
#else
# define PERFORMANCE_MEASURE_START(mask) \
IMPL_PERFORMANCE_MEASURE_START(mask)
# define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) \
IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ##__VA_ARGS__)
#endif
#else
#define PERFORMANCE_MEASURE_START(mask)
#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...)
#endif
/* Constraints:
*
* - Always first lock threadqueue, than a job inside it
* - When job A depends on job B, always lock first job B and then job A
* - Jobs should be submitted in an order which is compatible with serial execution.
*
* */
int kvz_threadqueue_stop(threadqueue_queue_t * threadqueue);
void kvz_threadqueue_free(threadqueue_queue_t * threadqueue);
#endif // THREADQUEUE_H_

View file

@ -52,7 +52,7 @@ SUITE(coeff_sum_tests)
{
setup();
for (int i = 0; i < strategies.count; ++i) {
for (volatile int i = 0; i < strategies.count; ++i) {
if (strcmp(strategies.strategies[i].type, "coeff_abs_sum") != 0) {
continue;
}

View file

@ -177,7 +177,7 @@ SUITE(intra_sad_tests)
// Loop through all strategies picking out the intra sad ones and run
// selectec strategies though all tests.
for (unsigned i = 0; i < strategies.count; ++i) {
for (volatile unsigned i = 0; i < strategies.count; ++i) {
const char * type = strategies.strategies[i].type;
if (strcmp(type, "sad_4x4") == 0) {

View file

@ -378,7 +378,7 @@ SUITE(sad_tests)
sad_test_env.tested_func = strategies.strategies[i].fptr;
sad_test_env.strategy = &strategies.strategies[i];
int num_dim_tests = sizeof(tested_dims) / sizeof(tested_dims[0]);
for (int dim_test = 0; dim_test < num_dim_tests; ++dim_test) {
for (volatile int dim_test = 0; dim_test < num_dim_tests; ++dim_test) {
sad_test_env.width = tested_dims[dim_test].width;
sad_test_env.height = tested_dims[dim_test].height;
RUN_TEST(test_reg_sad);

View file

@ -167,7 +167,7 @@ SUITE(satd_tests)
// Loop through all strategies picking out the intra sad ones and run
// selectec strategies though all tests.
for (unsigned i = 0; i < strategies.count; ++i) {
for (volatile unsigned i = 0; i < strategies.count; ++i) {
const char * type = strategies.strategies[i].type;
if (strcmp(type, "satd_4x4") == 0) {

View file

@ -405,7 +405,7 @@ SUITE(speed_tests)
int num_tested_dims = sizeof(tested_dims) / sizeof(*tested_dims);
// Call reg_sad with all the sizes it is actually called with.
for (int dim_i = 0; dim_i < num_tested_dims; ++dim_i) {
for (volatile int dim_i = 0; dim_i < num_tested_dims; ++dim_i) {
test_env.width = tested_dims[dim_i].x;
test_env.height = tested_dims[dim_i].y;
RUN_TEST(inter_sad);