Merge branch 'threading_fixes'

This commit is contained in:
Marko Viitanen 2021-09-14 11:00:02 +03:00
commit 4e5482817f
4 changed files with 99 additions and 37 deletions

View file

@ -136,21 +136,24 @@ uint64_t kvz_bitstream_tell(const bitstream_t *const stream)
*/ */
void kvz_bitstream_writebyte(bitstream_t *const stream, const uint8_t byte) void kvz_bitstream_writebyte(bitstream_t *const stream, const uint8_t byte)
{ {
assert(stream->cur_bit == 0); assert(stream->cur_bit == 0 || stream->simulation);
if (stream->last == NULL || stream->last->len == KVZ_DATA_CHUNK_SIZE) { if (!stream->simulation) {
// Need to allocate a new chunk.
kvz_data_chunk *new_chunk = kvz_bitstream_alloc_chunk();
assert(new_chunk);
if (!stream->first) stream->first = new_chunk; if (stream->last == NULL || stream->last->len == KVZ_DATA_CHUNK_SIZE) {
if (stream->last) stream->last->next = new_chunk; // Need to allocate a new chunk.
stream->last = new_chunk; kvz_data_chunk* new_chunk = kvz_bitstream_alloc_chunk();
assert(new_chunk);
if (!stream->first) stream->first = new_chunk;
if (stream->last) stream->last->next = new_chunk;
stream->last = new_chunk;
}
assert(stream->last->len < KVZ_DATA_CHUNK_SIZE);
stream->last->data[stream->last->len] = byte;
stream->last->len += 1;
} }
assert(stream->last->len < KVZ_DATA_CHUNK_SIZE);
stream->last->data[stream->last->len] = byte;
stream->last->len += 1;
stream->len += 1; stream->len += 1;
} }
@ -161,9 +164,9 @@ void kvz_bitstream_writebyte(bitstream_t *const stream, const uint8_t byte)
*/ */
void kvz_bitstream_move(bitstream_t *const dst, bitstream_t *const src) void kvz_bitstream_move(bitstream_t *const dst, bitstream_t *const src)
{ {
assert(dst->cur_bit == 0); assert(dst->cur_bit == 0 || src->simulation);
if (src->len > 0) { if (src->len > 0 && !src->simulation) {
if (dst->first == NULL) { if (dst->first == NULL) {
dst->first = src->first; dst->first = src->first;
dst->last = src->last; dst->last = src->last;
@ -179,6 +182,7 @@ void kvz_bitstream_move(bitstream_t *const dst, bitstream_t *const src)
dst->data = src->data; dst->data = src->data;
dst->cur_bit = src->cur_bit; dst->cur_bit = src->cur_bit;
dst->zerocount = src->zerocount; dst->zerocount = src->zerocount;
dst->simulation = src->simulation;
src->first = src->last = NULL; src->first = src->last = NULL;
kvz_bitstream_clear(src); kvz_bitstream_clear(src);
@ -200,7 +204,7 @@ void kvz_bitstream_clear(bitstream_t *const stream)
*/ */
void kvz_bitstream_put_byte(bitstream_t *const stream, uint32_t data) void kvz_bitstream_put_byte(bitstream_t *const stream, uint32_t data)
{ {
assert(stream->cur_bit == 0); assert(stream->cur_bit == 0 || stream->simulation);
const uint8_t emulation_prevention_three_byte = 0x03; const uint8_t emulation_prevention_three_byte = 0x03;
if ((stream->zerocount == 2) && (data < 4)) { if ((stream->zerocount == 2) && (data < 4)) {
@ -220,6 +224,10 @@ void kvz_bitstream_put_byte(bitstream_t *const stream, uint32_t data)
*/ */
void kvz_bitstream_put(bitstream_t *const stream, const uint32_t data, uint8_t bits) void kvz_bitstream_put(bitstream_t *const stream, const uint32_t data, uint8_t bits)
{ {
if (stream->simulation) {
stream->cur_bit += bits;
return;
}
while (bits--) { while (bits--) {
stream->data <<= 1; stream->data <<= 1;

View file

@ -52,6 +52,8 @@ typedef struct bitstream_t
uint8_t cur_bit; uint8_t cur_bit;
uint8_t zerocount; uint8_t zerocount;
bool simulation;
} bitstream_t; } bitstream_t;
typedef struct typedef struct

View file

@ -612,6 +612,8 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
} }
} }
static void encoder_state_worker_encode_lcu_bitstream(void* opaque);
static void encoder_state_worker_encode_lcu_search(void * opaque) static void encoder_state_worker_encode_lcu_search(void * opaque)
{ {
lcu_order_element_t * const lcu = opaque; lcu_order_element_t * const lcu = opaque;
@ -671,6 +673,12 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
kvz_sao_search_lcu(state, lcu->position.x, lcu->position.y); kvz_sao_search_lcu(state, lcu->position.x, lcu->position.y);
encoder_sao_reconstruct(state, lcu); encoder_sao_reconstruct(state, lcu);
} }
// Do simulated bitstream writing to update the cabac contexts
if (encoder->cfg.alf_type) {
state->stream.simulation = true;
encoder_state_worker_encode_lcu_bitstream(opaque);
}
} }
static void encoder_state_worker_encode_lcu_bitstream(void * opaque) static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
@ -695,9 +703,11 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
//Encode coding tree //Encode coding tree
kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff); kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff);
// Coeffs are not needed anymore. if (!state->stream.simulation) {
free(lcu->coeff); // Coeffs are not needed anymore.
lcu->coeff = NULL; free(lcu->coeff);
lcu->coeff = NULL;
}
bool end_of_slice_segment_flag; bool end_of_slice_segment_flag;
if (state->encoder_control->cfg.slices & KVZ_SLICES_WPP) { if (state->encoder_control->cfg.slices & KVZ_SLICES_WPP) {
@ -772,9 +782,30 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
} }
} }
/**
 * \brief Re-initialize an encoder state tree after simulated bitstream writing.
 *
 * Calls kvz_bitstream_clear() on the stream of \p state. For a leaf state it
 * also restarts CABAC and re-initializes the contexts. It then does the same,
 * recursively, for every child of \p state.
 *
 * \param state  Root of the encoder state (sub)tree to re-initialize.
 */
static void encoder_state_init_children_after_simulation(encoder_state_t* const state) {
  kvz_bitstream_clear(&state->stream);

  if (state->is_leaf) {
    // Leaf states have cabac and context
    kvz_cabac_start(&state->cabac);
    // Contexts are initialized with QP 26 when set_qp_in_cu is enabled,
    // otherwise with the QP of the frame.
    kvz_init_contexts(
      state,
      state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP,
      state->frame->slicetype);
  }

  // Walk the children until the first entry whose encoder_control is unset.
  for (encoder_state_t *child = state->children; child->encoder_control; ++child) {
    encoder_state_init_children_after_simulation(child);
  }
}
void kvz_alf_enc_process_job(void* opaque) { void kvz_alf_enc_process_job(void* opaque) {
kvz_alf_enc_process((encoder_state_t* const)opaque); encoder_state_t* const state = (encoder_state_t* const)opaque;
kvz_alf_enc_process(state);
encoder_state_t* parent = state;
while (parent->parent) parent = parent->parent;
// If ALF was used the bitstream coding was simulated in search, reset the cabac/stream
encoder_state_init_children_after_simulation(parent);
} }
static void encoder_state_encode_leaf(encoder_state_t * const state) static void encoder_state_encode_leaf(encoder_state_t * const state)
@ -799,16 +830,24 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
// frame is encoded. Deblocking and SAO search is done during LCU encoding. // frame is encoded. Deblocking and SAO search is done during LCU encoding.
for (int i = 0; i < state->lcu_order_count; ++i) { for (int i = 0; i < state->lcu_order_count; ++i) {
encoder_state_worker_encode_lcu_search(&state->lcu_order[i]); encoder_state_worker_encode_lcu_search(&state->lcu_order[i]);
// Without alf we can code the bitstream right after each LCU to update cabac contexts
if (encoder->cfg.alf_type == 0) {
encoder_state_worker_encode_lcu_bitstream(&state->lcu_order[i]);
}
} }
//Encode ALF //Encode ALF
if (encoder->cfg.alf_type) { if (encoder->cfg.alf_type) {
kvz_alf_enc_process(state); kvz_alf_enc_process(state);
// If ALF was used the bitstream coding was simulated in search, reset the cabac/stream
// And write the actual bitstream
encoder_state_init_children_after_simulation(state);
for (int i = 0; i < state->lcu_order_count; ++i) {
encoder_state_worker_encode_lcu_bitstream(&state->lcu_order[i]);
}
} }
for (int i = 0; i < state->lcu_order_count; ++i) {
encoder_state_worker_encode_lcu_bitstream(&state->lcu_order[i]);
}
} else { } else {
// Add each LCU in the wavefront row as its own job to the queue. // Add each LCU in the wavefront row as its own job to the queue.
@ -892,26 +931,39 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_recon_jobs[dep_lcu->id]); kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_recon_jobs[dep_lcu->id]);
} }
} }
// Add local WPP dependency to the LCU on the left.
if (lcu->left) {
kvz_threadqueue_job_dep_add(job[0], job[-1]);
kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-1]);
}
// Add local WPP dependency to the LCU on the top.
if (lcu->above) {
kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu]);
kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-state->tile->frame->width_in_lcu]);
}
kvz_threadqueue_submit(state->encoder_control->threadqueue, job[0]);
if (state->encoder_control->cfg.alf_type) { if (state->encoder_control->cfg.alf_type) {
encoder_state_t* parent = state; encoder_state_t* parent = state;
while (parent->parent) parent = parent->parent; while (parent->parent) parent = parent->parent;
// Add local WPP dependency to the LCU on the left.
if (lcu->left) {
kvz_threadqueue_job_dep_add(job[0], job[-1]);
kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-1]);
}
// Add local WPP dependency to the LCU on the top.
if (lcu->above) {
kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu]);
kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-state->tile->frame->width_in_lcu]);
}
kvz_threadqueue_submit(state->encoder_control->threadqueue, job[0]);
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], parent->tqj_alf_process); kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], parent->tqj_alf_process);
kvz_threadqueue_job_dep_add(parent->tqj_alf_process, state->tile->wf_recon_jobs[lcu->id]); kvz_threadqueue_job_dep_add(parent->tqj_alf_process, state->tile->wf_recon_jobs[lcu->id]);
} else { } else {
// Add local WPP dependency to the LCU on the left.
if (lcu->left) {
kvz_threadqueue_job_dep_add(job[0], bitstream_job[-1]);
}
// Add local WPP dependency to the LCU on the top.
if (lcu->above) {
kvz_threadqueue_job_dep_add(job[0], bitstream_job[-state->tile->frame->width_in_lcu]);
}
kvz_threadqueue_submit(state->encoder_control->threadqueue, job[0]);
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]); kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]);
} }

View file

@ -6,7 +6,7 @@ set -eu
. "${0%/*}/util.sh" . "${0%/*}/util.sh"
common_args='256x128 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-wpp --no-tmvp --no-deblock --sao=0 --alf=full --pu-depth-intra 0-4' common_args='256x128 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-wpp --no-tmvp --no-deblock --sao=0 --pu-depth-intra 0-4'
valgrind_test $common_args --rd=1 valgrind_test $common_args --rd=1
valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37
valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq