diff --git a/src/bitstream.c b/src/bitstream.c index a835116e..22dd383d 100644 --- a/src/bitstream.c +++ b/src/bitstream.c @@ -136,21 +136,24 @@ uint64_t kvz_bitstream_tell(const bitstream_t *const stream) */ void kvz_bitstream_writebyte(bitstream_t *const stream, const uint8_t byte) { - assert(stream->cur_bit == 0); + assert(stream->cur_bit == 0 || stream->simulation); - if (stream->last == NULL || stream->last->len == KVZ_DATA_CHUNK_SIZE) { - // Need to allocate a new chunk. - kvz_data_chunk *new_chunk = kvz_bitstream_alloc_chunk(); - assert(new_chunk); + if (!stream->simulation) { - if (!stream->first) stream->first = new_chunk; - if (stream->last) stream->last->next = new_chunk; - stream->last = new_chunk; + if (stream->last == NULL || stream->last->len == KVZ_DATA_CHUNK_SIZE) { + // Need to allocate a new chunk. + kvz_data_chunk* new_chunk = kvz_bitstream_alloc_chunk(); + assert(new_chunk); + + if (!stream->first) stream->first = new_chunk; + if (stream->last) stream->last->next = new_chunk; + stream->last = new_chunk; + } + assert(stream->last->len < KVZ_DATA_CHUNK_SIZE); + + stream->last->data[stream->last->len] = byte; + stream->last->len += 1; } - assert(stream->last->len < KVZ_DATA_CHUNK_SIZE); - - stream->last->data[stream->last->len] = byte; - stream->last->len += 1; stream->len += 1; } @@ -161,9 +164,9 @@ void kvz_bitstream_writebyte(bitstream_t *const stream, const uint8_t byte) */ void kvz_bitstream_move(bitstream_t *const dst, bitstream_t *const src) { - assert(dst->cur_bit == 0); + assert(dst->cur_bit == 0 || src->simulation); - if (src->len > 0) { + if (src->len > 0 && !src->simulation) { if (dst->first == NULL) { dst->first = src->first; dst->last = src->last; @@ -179,6 +182,7 @@ void kvz_bitstream_move(bitstream_t *const dst, bitstream_t *const src) dst->data = src->data; dst->cur_bit = src->cur_bit; dst->zerocount = src->zerocount; + dst->simulation = src->simulation; src->first = src->last = NULL; kvz_bitstream_clear(src); @@ -200,7 +204,7 @@ 
void kvz_bitstream_clear(bitstream_t *const stream) */ void kvz_bitstream_put_byte(bitstream_t *const stream, uint32_t data) { - assert(stream->cur_bit == 0); + assert(stream->cur_bit == 0 || stream->simulation); const uint8_t emulation_prevention_three_byte = 0x03; if ((stream->zerocount == 2) && (data < 4)) { @@ -220,6 +224,10 @@ void kvz_bitstream_put_byte(bitstream_t *const stream, uint32_t data) */ void kvz_bitstream_put(bitstream_t *const stream, const uint32_t data, uint8_t bits) { + if (stream->simulation) { + stream->cur_bit += bits; + return; + } while (bits--) { stream->data <<= 1; diff --git a/src/bitstream.h b/src/bitstream.h index 54db4a7a..86541711 100644 --- a/src/bitstream.h +++ b/src/bitstream.h @@ -52,6 +52,8 @@ typedef struct bitstream_t uint8_t cur_bit; uint8_t zerocount; + + bool simulation; } bitstream_t; typedef struct diff --git a/src/encoderstate.c b/src/encoderstate.c index 7f2133d7..637c2d6d 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -612,6 +612,8 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las } } +static void encoder_state_worker_encode_lcu_bitstream(void* opaque); + static void encoder_state_worker_encode_lcu_search(void * opaque) { lcu_order_element_t * const lcu = opaque; @@ -671,6 +673,12 @@ static void encoder_state_worker_encode_lcu_search(void * opaque) kvz_sao_search_lcu(state, lcu->position.x, lcu->position.y); encoder_sao_reconstruct(state, lcu); } + + // Do simulated bitstream writing to update the cabac contexts + if (encoder->cfg.alf_type) { + state->stream.simulation = true; + encoder_state_worker_encode_lcu_bitstream(opaque); + } } static void encoder_state_worker_encode_lcu_bitstream(void * opaque) @@ -695,9 +703,11 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) //Encode coding tree kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0, lcu->coeff); - // Coeffs are not needed anymore. 
- free(lcu->coeff); - lcu->coeff = NULL; + if (!state->stream.simulation) { + // Coeffs are not needed anymore. + free(lcu->coeff); + lcu->coeff = NULL; + } bool end_of_slice_segment_flag; if (state->encoder_control->cfg.slices & KVZ_SLICES_WPP) { @@ -772,9 +782,30 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque) } } +static void encoder_state_init_children_after_simulation(encoder_state_t* const state) { + kvz_bitstream_clear(&state->stream); + + if (state->is_leaf) { + //Leaf states have cabac and context + kvz_cabac_start(&state->cabac); + kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP, state->frame->slicetype); + } + + for (int i = 0; state->children[i].encoder_control; ++i) { + encoder_state_init_children_after_simulation(&state->children[i]); + } +} void kvz_alf_enc_process_job(void* opaque) { - kvz_alf_enc_process((encoder_state_t* const)opaque); + encoder_state_t* const state = (encoder_state_t* const)opaque; + + kvz_alf_enc_process(state); + + encoder_state_t* parent = state; + while (parent->parent) parent = parent->parent; + + // If ALF was used the bitstream coding was simulated in search, reset the cabac/stream + encoder_state_init_children_after_simulation(parent); } static void encoder_state_encode_leaf(encoder_state_t * const state) @@ -799,16 +830,24 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) // frame is encoded. Deblocking and SAO search is done during LCU encoding. 
for (int i = 0; i < state->lcu_order_count; ++i) { encoder_state_worker_encode_lcu_search(&state->lcu_order[i]); + // Without alf we can code the bitstream right after each LCU to update cabac contexts + if (encoder->cfg.alf_type == 0) { + encoder_state_worker_encode_lcu_bitstream(&state->lcu_order[i]); + } } //Encode ALF if (encoder->cfg.alf_type) { kvz_alf_enc_process(state); + // If ALF was used the bitstream coding was simulated in search, reset the cabac/stream + // And write the actual bitstream + encoder_state_init_children_after_simulation(state); + for (int i = 0; i < state->lcu_order_count; ++i) { + encoder_state_worker_encode_lcu_bitstream(&state->lcu_order[i]); + } } - for (int i = 0; i < state->lcu_order_count; ++i) { - encoder_state_worker_encode_lcu_bitstream(&state->lcu_order[i]); - } + } else { // Add each LCU in the wavefront row as it's own job to the queue. @@ -892,26 +931,39 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_recon_jobs[dep_lcu->id]); } } - - // Add local WPP dependancy to the LCU on the left. - if (lcu->left) { - kvz_threadqueue_job_dep_add(job[0], job[-1]); - kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-1]); - } - // Add local WPP dependancy to the LCU on the top. - if (lcu->above) { - kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu]); - kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-state->tile->frame->width_in_lcu]); - } - - kvz_threadqueue_submit(state->encoder_control->threadqueue, job[0]); - + if (state->encoder_control->cfg.alf_type) { encoder_state_t* parent = state; while (parent->parent) parent = parent->parent; + + // Add local WPP dependency to the LCU on the left. + if (lcu->left) { + kvz_threadqueue_job_dep_add(job[0], job[-1]); + kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-1]); + } + // Add local WPP dependency to the LCU on the top. 
+ if (lcu->above) { + kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu]); + kvz_threadqueue_job_dep_add(bitstream_job[0], bitstream_job[-state->tile->frame->width_in_lcu]); + } + + kvz_threadqueue_submit(state->encoder_control->threadqueue, job[0]); + kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], parent->tqj_alf_process); kvz_threadqueue_job_dep_add(parent->tqj_alf_process, state->tile->wf_recon_jobs[lcu->id]); } else { + + // Add local WPP dependency to the LCU on the left. + if (lcu->left) { + kvz_threadqueue_job_dep_add(job[0], bitstream_job[-1]); + } + // Add local WPP dependency to the LCU on the top. + if (lcu->above) { + kvz_threadqueue_job_dep_add(job[0], bitstream_job[-state->tile->frame->width_in_lcu]); + } + + kvz_threadqueue_submit(state->encoder_control->threadqueue, job[0]); + kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_recon_jobs[lcu->id]); } diff --git a/tests/test_intra.sh b/tests/test_intra.sh index 24acc3c5..4c2e93bd 100755 --- a/tests/test_intra.sh +++ b/tests/test_intra.sh @@ -6,7 +6,7 @@ set -eu . "${0%/*}/util.sh" -common_args='256x128 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-wpp --no-tmvp --no-deblock --sao=0 --alf=full --pu-depth-intra 0-4' +common_args='256x128 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-wpp --no-tmvp --no-deblock --sao=0 --pu-depth-intra 0-4' valgrind_test $common_args --rd=1 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq