diff --git a/src/encoderstate.c b/src/encoderstate.c index d1902b3a..75846dce 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -333,16 +333,18 @@ static void encoder_state_worker_encode_lcu(void * opaque) { } static void encoder_state_encode_leaf(encoder_state_t * const state) { - const encoder_control_t * const encoder = state->encoder_control; - - int i = 0; - assert(state->is_leaf); assert(state->lcu_order_count > 0); - //If we're not using wavefronts, or we have a WAVEFRONT_ROW which is the single child of its parent, than we should not use parallelism - if (state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW || (state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && !state->parent->children[1].encoder_control)) { - for (i = 0; i < state->lcu_order_count; ++i) { + // Select whether to encode the frame/tile in current thread or to define + // wavefront jobs for other threads to handle. + bool wavefront = state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW; + bool use_parallel_encoding = (wavefront && state->parent->children[1].encoder_control); + if (!use_parallel_encoding) { + // Encode every LCU in order and perform SAO reconstruction after every + // frame is encoded. Deblocking and SAO search is done during LCU encoding. + + for (int i = 0; i < state->lcu_order_count; ++i) { PERFORMANCE_MEASURE_START(_DEBUG_PERF_ENCODE_LCU); encoder_state_worker_encode_lcu(&state->lcu_order[i]); @@ -355,7 +357,7 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { #endif //_DEBUG } - if (encoder->sao_enable) { + if (state->encoder_control->sao_enable) { PERFORMANCE_MEASURE_START(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME); sao_reconstruct_frame(state); PERFORMANCE_MEASURE_END(_DEBUG_PERF_SAO_RECONSTRUCT_FRAME, state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", state->global->frame, state->tile->id, state->slice->id, state->lcu_order[0].position.y + state->tile->lcu_offset_y, state->lcu_order[state->lcu_order_count-1].position.y + state->tile->lcu_offset_y, @@ -364,7 +366,10 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { ); } } else { - for (i = 0; i < state->lcu_order_count; ++i) { + // Add every LCU in the frame as a job to a queue, along with + // their dependancies, so they can be processed in parallel. + + for (int i = 0; i < state->lcu_order_count; ++i) { const lcu_order_element_t * const lcu = &state->lcu_order[i]; #ifdef _DEBUG char job_description[256]; @@ -373,40 +378,41 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { char* job_description = NULL; #endif state->tile->wf_jobs[lcu->id] = threadqueue_submit(state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description); + assert(state->tile->wf_jobs[lcu->id] != NULL); + + // Add dependancy for inter frames to the reconstruction of the row + // below current row in the previous frame. This ensures that we can + // search for motion vectors in the previous frame as long as we don't + // go more than one LCU below current row. if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_recon_done && !state->global->is_radl_frame) { - - //Only for the first in the row (we reconstruct row-wise) + // Only add the dependancy to the first LCU in the row. if (!lcu->left) { - //If we have a row below, then we wait till it's completed if (lcu->below) { threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->below->encoder_state->previous_encoder_state->tqj_recon_done); - } - //Also add always a dep on current line - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done); - if (lcu->above) { - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->above->encoder_state->previous_encoder_state->tqj_recon_done); - } - } - } - if (state->tile->wf_jobs[lcu->id]) { - if (lcu->position.x > 0) { - // Wait for the LCU on the left. - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]); - } - if (lcu->position.y > 0) { - if (lcu->position.x < state->tile->frame->width_in_lcu - 1) { - // Wait for the LCU to the top-right of this one. - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]); } else { - // If there is no top-right LCU, wait for the one above. - threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]); + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done); } } - threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); } + + // Add local WPP dependancy to the LCU on the left. + if (lcu->left) { + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]); + } + // Add local WPP dependancy to the LCU on the top right. + if (lcu->above) { + if (lcu->above->right) { + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]); + } else { + threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]); + } + } + + threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobs[lcu->id]); + if (lcu->position.x == state->tile->frame->width_in_lcu - 1) { - if (!encoder->sao_enable) { - //No SAO + last LCU: the row is reconstructed + if (!state->encoder_control->sao_enable) { + // No SAO + last LCU: the row is reconstructed assert(!state->tqj_recon_done); state->tqj_recon_done = state->tile->wf_jobs[lcu->id]; }