diff --git a/src/encoder.c b/src/encoder.c index 39e38f85..48410ba9 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -562,6 +562,12 @@ static int encoder_state_config_tile_init(encoder_state * const encoder_state, //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index) encoder_state->tile->ver_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->height_in_lcu * encoder_state->tile->cur_pic->width_in_lcu); + if (encoder->sao_enable) { + encoder_state->tile->hor_buf_before_sao = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu); + } else { + encoder_state->tile->hor_buf_before_sao = NULL; + } + if (encoder->wpp) { encoder_state->tile->wf_jobs = MALLOC(threadqueue_job*, encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu); if (!encoder_state->tile->wf_jobs) { @@ -577,6 +583,7 @@ static int encoder_state_config_tile_init(encoder_state * const encoder_state, } static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) { + if (encoder_state->tile->hor_buf_before_sao) yuv_t_free(encoder_state->tile->hor_buf_before_sao); yuv_t_free(encoder_state->tile->hor_buf_search); yuv_t_free(encoder_state->tile->ver_buf_search); @@ -1233,27 +1240,40 @@ static void write_aud(encoder_state * const encoder_state) static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, const lcu_order_element * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) { picture* const cur_pic = encoder_state->tile->cur_pic; - //Copy the bottom row of this LCU to the horizontal buffer - picture_blit_pixels(&cur_pic->y_recdata[(lcu->position_next_px.y - 1) * cur_pic->width + lcu->position_px.x], - &hor_buf->y[lcu->position_px.x + lcu->position.y * cur_pic->width], - lcu->size.x, 1, cur_pic->width, cur_pic->width); - picture_blit_pixels(&cur_pic->u_recdata[(lcu->position_next_px.y / 2 - 1) * cur_pic->width / 2 + lcu->position_px.x / 2], - &hor_buf->u[lcu->position_px.x / 2 + lcu->position.y * cur_pic->width / 2], - lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2); - picture_blit_pixels(&cur_pic->v_recdata[(lcu->position_next_px.y / 2 - 1) * cur_pic->width / 2 + lcu->position_px.x / 2], - &hor_buf->v[lcu->position_px.x / 2 + lcu->position.y * cur_pic->width / 2], - lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2); + if (hor_buf) { + const int rdpx = lcu->position_px.x; + const int rdpy = lcu->position_px.y + lcu->size.y - 1; + const int by = lcu->position.y; + + //Copy the bottom row of this LCU to the horizontal buffer + picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx], + &hor_buf->y[lcu->position_px.x + by * cur_pic->width], + lcu->size.x, 1, cur_pic->width, cur_pic->width); + picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)], + &hor_buf->u[lcu->position_px.x / 2 + by * cur_pic->width / 2], + lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2); + picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)], + &hor_buf->v[lcu->position_px.x / 2 + by * cur_pic->width / 2], + lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2); + } - //Copy the right row of this LCU to the vertical buffer. - picture_blit_pixels(&cur_pic->y_recdata[lcu->position_px.y * cur_pic->width + lcu->position_next_px.x - 1], - &ver_buf->y[lcu->position_px.y + lcu->position.x * cur_pic->height], - 1, lcu->size.y, cur_pic->width, 1); - picture_blit_pixels(&cur_pic->u_recdata[lcu->position_px.y * cur_pic->width / 4 + (lcu->position_next_px.x / 2) - 1], - &ver_buf->u[lcu->position_px.y / 2 + lcu->position.x * cur_pic->height / 2], - 1, lcu->size.y / 2, cur_pic->width / 2, 1); - picture_blit_pixels(&cur_pic->v_recdata[lcu->position_px.y * cur_pic->width / 4 + (lcu->position_next_px.x / 2) - 1], - &ver_buf->v[lcu->position_px.y / 2 + lcu->position.x * cur_pic->height / 2], - 1, lcu->size.y / 2, cur_pic->width / 2, 1); + if (ver_buf) { + const int rdpx = lcu->position_px.x + lcu->size.x - 1; + const int rdpy = lcu->position_px.y; + const int bx = lcu->position.x; + + + //Copy the right row of this LCU to the vertical buffer. + picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx], + &ver_buf->y[lcu->position_px.y + bx * cur_pic->height], + 1, lcu->size.y, cur_pic->width, 1); + picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)], + &ver_buf->u[lcu->position_px.y / 2 + bx * cur_pic->height / 2], + 1, lcu->size.y / 2, cur_pic->width / 2, 1); + picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)], + &ver_buf->v[lcu->position_px.y / 2 + bx * cur_pic->height / 2], + 1, lcu->size.y / 2, cur_pic->width / 2, 1); + } } @@ -1334,7 +1354,16 @@ static void worker_encoder_state_encode_lcu(void * opaque) { } } - + if (encoder->sao_enable && lcu->above) { + //If we're not the first in the row + if (lcu->above->left) { + encoder_state_recdata_to_bufs(encoder_state, lcu->above->left, encoder_state->tile->hor_buf_before_sao, NULL); + } + //Latest LCU in the row, copy the data from the one above also + if (!lcu->right) { + encoder_state_recdata_to_bufs(encoder_state, lcu->above, encoder_state->tile->hor_buf_before_sao, NULL); + } + } } static void encoder_state_encode_leaf(encoder_state * const encoder_state) { @@ -1429,6 +1458,66 @@ static void worker_encoder_state_encode_children(void * opaque) { } } +typedef struct { + int y; + const encoder_state * encoder_state; +} worker_sao_reconstruct_lcu_data; + +// ./kvazaar -i /scratch/h265-encode/pedestrian_area_1080p25.yuv --input-res 1920x1080 -o /tmp/out.h265 --qp 23 -p 60 --frames 10 +// Processed 10 frames, 5063552 bits AVG PSNR: 42.9771 46.0609 48.0985 +// Total time: 19.440 s. +void worker_sao_reconstruct_lcu(void *opaque) { + worker_sao_reconstruct_lcu_data *data = opaque; + picture * const cur_pic = data->encoder_state->tile->cur_pic; + unsigned stride = cur_pic->width_in_lcu; + int x; + + //TODO: copy only needed data + pixel *new_y_data = MALLOC(pixel, cur_pic->width * cur_pic->height); + pixel *new_u_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2); + pixel *new_v_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2); + + const int offset = cur_pic->width * (data->y*LCU_WIDTH); + const int offset_c = cur_pic->width/2 * (data->y*LCU_WIDTH_C); + int num_pixels = cur_pic->width * (LCU_WIDTH + 2); + + if (num_pixels + offset > cur_pic->width * cur_pic->height) { + num_pixels = cur_pic->width * cur_pic->height - offset; + } + + memcpy(&new_y_data[offset], &cur_pic->y_recdata[offset], sizeof(pixel) * num_pixels); + memcpy(&new_u_data[offset_c], &cur_pic->u_recdata[offset_c], sizeof(pixel) * num_pixels >> 2); + memcpy(&new_v_data[offset_c], &cur_pic->v_recdata[offset_c], sizeof(pixel) * num_pixels >> 2); + + if (data->y>0) { + //copy first row from buffer + memcpy(&new_y_data[cur_pic->width * (data->y*LCU_WIDTH-1)], &data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y-1)], cur_pic->width * sizeof(pixel)); + memcpy(&new_u_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel)); + memcpy(&new_v_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel)); + } + //assertions to be sure everything's ok for the next line (don't bother with last one) + /* These assertions may not be true if the row are not processed in order. To avoid having an artificial dependency between rows, it's better to remove them. + assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y)], &cur_pic->y_recdata[cur_pic->width * ((data->y + 1)*LCU_WIDTH-1)], cur_pic->width * sizeof(pixel))==0); + assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y)], &cur_pic->u_recdata[cur_pic->width/2 * ((data->y + 1)*LCU_WIDTH_C-1)], cur_pic->width/2 * sizeof(pixel))==0); + assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y)], &cur_pic->v_recdata[cur_pic->width/2 * ((data->y + 1)*LCU_WIDTH_C-1)], cur_pic->width/2 * sizeof(pixel))==0);*/ + + for (x = 0; x < cur_pic->width_in_lcu; x++) { + // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma); + sao_info *sao_luma = &cur_pic->sao_luma[data->y * stride + x]; + sao_info *sao_chroma = &cur_pic->sao_chroma[data->y * stride + x]; + sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_y_data, x, data->y, sao_luma, COLOR_Y); + sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_u_data, x, data->y, sao_chroma, COLOR_U); + sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_v_data, x, data->y, sao_chroma, COLOR_V); + } + + free(new_y_data); + free(new_u_data); + free(new_v_data); + + free(opaque); +} + + static int tree_is_a_chain(const encoder_state * const encoder_state) { if (!encoder_state->children[0].encoder_control) return 1; if (encoder_state->children[1].encoder_control) return 0; @@ -1482,15 +1571,38 @@ static void encoder_state_encode(encoder_state * const main_state) { worker_encoder_state_encode_children(&(main_state->children[i])); } } - threadqueue_flush(main_state->encoder_control->threadqueue); //If children are wavefront, we need to reconstruct SAO if (main_state->encoder_control->sao_enable && main_state->children[0].type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) { - PERFORMANCE_MEASURE_START(); - sao_reconstruct_frame(main_state); - PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", main_state->global->frame, main_state->tile->id, main_state->slice->id,0,main_state->encoder_control->in.height_in_lcu - 1); + int y; + picture * const cur_pic = main_state->tile->cur_pic; + + for (y = 0; y < cur_pic->height_in_lcu; ++y) { + worker_sao_reconstruct_lcu_data *data = MALLOC(worker_sao_reconstruct_lcu_data, 1); + threadqueue_job *job; +#ifdef _DEBUG + char job_description[256]; + sprintf(job_description, "frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y); +#else + char* job_description = NULL; +#endif + data->y = y; + data->encoder_state = main_state; + + job = threadqueue_submit(main_state->encoder_control->threadqueue, worker_sao_reconstruct_lcu, data, 1, job_description); + + if (y < cur_pic->height_in_lcu - 1) { + //Not last row: depend on the last LCU of the row below + threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 1) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]); + } else { + //Last row: depend on the last LCU of the row + threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 0) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]); + } + threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, job); + + } } - + threadqueue_flush(main_state->encoder_control->threadqueue); } else { for (i=0; main_state->children[i].encoder_control; ++i) { worker_encoder_state_encode_children(&(main_state->children[i])); diff --git a/src/encoder.h b/src/encoder.h index 78c5277f..a74e20b2 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -180,6 +180,9 @@ typedef struct { //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index) yuv_t *ver_buf_search; + yuv_t *hor_buf_before_sao; + yuv_t *ver_buf_before_sao; + //Job pointers for wavefronts threadqueue_job **wf_jobs; } encoder_state_config_tile; @@ -208,7 +211,6 @@ typedef struct lcu_order_element { struct encoder_state *encoder_state; vector2d position; vector2d position_px; //Top-left - vector2d position_next_px; //Right-bottom vector2d size; int first_column; int first_row;