Mirror of https://github.com/ultravideo/uvg266.git, synced 2024-11-24 02:24:07 +00:00
Parallel SAO
This commit is contained in:
parent a3fcb141ed
commit f4f9093cb5
src/encoder.c | 148
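In short, the commit keeps a pre-SAO copy of each LCU row's bottom reconstructed pixel row in a new hor_buf_before_sao buffer and, in encoder_state_encode, schedules one SAO reconstruction job per LCU row, each depending on the wavefront job of the last LCU of the row below it. A standalone sketch of that dependency rule follows (not part of the commit; the tile size is hypothetical, and the real code uses the tile's wf_jobs array with threadqueue_job_dep_add as shown in the diff below):

/* Standalone sketch (not part of the commit): which wf_jobs entry each per-row
 * SAO reconstruction job waits on, following the dependency rule added to
 * encoder_state_encode. Tile dimensions are hypothetical. */
#include <stdio.h>

int main(void) {
  const int width_in_lcu = 5, height_in_lcu = 4;   /* example tile size in LCUs */
  for (int y = 0; y < height_in_lcu; ++y) {
    /* every row but the last waits for the last LCU of the row below;
       the last row waits for its own last LCU */
    const int dep_row = (y < height_in_lcu - 1) ? y + 1 : y;
    const int dep_index = dep_row * width_in_lcu + width_in_lcu - 1;
    printf("SAO job for LCU row %d depends on wf_jobs[%d] (last LCU of row %d)\n",
           y, dep_index, dep_row);
  }
  return 0;
}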
@@ -562,6 +562,12 @@ static int encoder_state_config_tile_init(encoder_state * const encoder_state,
  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
  encoder_state->tile->ver_buf_search = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->height_in_lcu * encoder_state->tile->cur_pic->width_in_lcu);

  if (encoder->sao_enable) {
    encoder_state->tile->hor_buf_before_sao = yuv_t_alloc(LCU_WIDTH * encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
  } else {
    encoder_state->tile->hor_buf_before_sao = NULL;
  }

  if (encoder->wpp) {
    encoder_state->tile->wf_jobs = MALLOC(threadqueue_job*, encoder_state->tile->cur_pic->width_in_lcu * encoder_state->tile->cur_pic->height_in_lcu);
    if (!encoder_state->tile->wf_jobs) {
@@ -577,6 +583,7 @@ static int encoder_state_config_tile_init(encoder_state * const encoder_state,
}

static void encoder_state_config_tile_finalize(encoder_state * const encoder_state) {
  if (encoder_state->tile->hor_buf_before_sao) yuv_t_free(encoder_state->tile->hor_buf_before_sao);

  yuv_t_free(encoder_state->tile->hor_buf_search);
  yuv_t_free(encoder_state->tile->ver_buf_search);
@@ -1233,27 +1240,40 @@ static void write_aud(encoder_state * const encoder_state)
static void encoder_state_recdata_to_bufs(encoder_state * const encoder_state, const lcu_order_element * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) {
  picture* const cur_pic = encoder_state->tile->cur_pic;

  if (hor_buf) {
    const int rdpx = lcu->position_px.x;
    const int rdpy = lcu->position_px.y + lcu->size.y - 1;
    const int by = lcu->position.y;

    //Copy the bottom row of this LCU to the horizontal buffer
    picture_blit_pixels(&cur_pic->y_recdata[(lcu->position_next_px.y - 1) * cur_pic->width + lcu->position_px.x],
                        &hor_buf->y[lcu->position_px.x + lcu->position.y * cur_pic->width],
    picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx],
                        &hor_buf->y[lcu->position_px.x + by * cur_pic->width],
                        lcu->size.x, 1, cur_pic->width, cur_pic->width);
    picture_blit_pixels(&cur_pic->u_recdata[(lcu->position_next_px.y / 2 - 1) * cur_pic->width / 2 + lcu->position_px.x / 2],
                        &hor_buf->u[lcu->position_px.x / 2 + lcu->position.y * cur_pic->width / 2],
    picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
                        &hor_buf->u[lcu->position_px.x / 2 + by * cur_pic->width / 2],
                        lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2);
    picture_blit_pixels(&cur_pic->v_recdata[(lcu->position_next_px.y / 2 - 1) * cur_pic->width / 2 + lcu->position_px.x / 2],
                        &hor_buf->v[lcu->position_px.x / 2 + lcu->position.y * cur_pic->width / 2],
    picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
                        &hor_buf->v[lcu->position_px.x / 2 + by * cur_pic->width / 2],
                        lcu->size.x / 2, 1, cur_pic->width / 2, cur_pic->width / 2);
  }

  if (ver_buf) {
    const int rdpx = lcu->position_px.x + lcu->size.x - 1;
    const int rdpy = lcu->position_px.y;
    const int bx = lcu->position.x;

    //Copy the right row of this LCU to the vertical buffer.
    picture_blit_pixels(&cur_pic->y_recdata[lcu->position_px.y * cur_pic->width + lcu->position_next_px.x - 1],
                        &ver_buf->y[lcu->position_px.y + lcu->position.x * cur_pic->height],
    picture_blit_pixels(&cur_pic->y_recdata[rdpy * cur_pic->width + rdpx],
                        &ver_buf->y[lcu->position_px.y + bx * cur_pic->height],
                        1, lcu->size.y, cur_pic->width, 1);
    picture_blit_pixels(&cur_pic->u_recdata[lcu->position_px.y * cur_pic->width / 4 + (lcu->position_next_px.x / 2) - 1],
                        &ver_buf->u[lcu->position_px.y / 2 + lcu->position.x * cur_pic->height / 2],
    picture_blit_pixels(&cur_pic->u_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
                        &ver_buf->u[lcu->position_px.y / 2 + bx * cur_pic->height / 2],
                        1, lcu->size.y / 2, cur_pic->width / 2, 1);
    picture_blit_pixels(&cur_pic->v_recdata[lcu->position_px.y * cur_pic->width / 4 + (lcu->position_next_px.x / 2) - 1],
                        &ver_buf->v[lcu->position_px.y / 2 + lcu->position.x * cur_pic->height / 2],
    picture_blit_pixels(&cur_pic->v_recdata[(rdpy/2) * cur_pic->width/2 + (rdpx/2)],
                        &ver_buf->v[lcu->position_px.y / 2 + bx * cur_pic->height / 2],
                        1, lcu->size.y / 2, cur_pic->width / 2, 1);
  }

}
@@ -1334,7 +1354,16 @@ static void worker_encoder_state_encode_lcu(void * opaque) {
    }
  }

  if (encoder->sao_enable && lcu->above) {
    //If we're not the first in the row
    if (lcu->above->left) {
      encoder_state_recdata_to_bufs(encoder_state, lcu->above->left, encoder_state->tile->hor_buf_before_sao, NULL);
    }
    //Last LCU in the row, also copy the data from the one above
    if (!lcu->right) {
      encoder_state_recdata_to_bufs(encoder_state, lcu->above, encoder_state->tile->hor_buf_before_sao, NULL);
    }
  }
}

static void encoder_state_encode_leaf(encoder_state * const encoder_state) {
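The block added in the hunk above feeds hor_buf_before_sao: when an LCU of a wavefront row finishes, the bottom reconstructed row of the LCU above-left is copied out, and the last LCU of a row also copies the one directly above it. A standalone sketch (hypothetical tile size) that only enumerates those triggers, showing each LCU of the row above is saved exactly once:

/* Standalone sketch (not part of the commit): which LCU of the row above is
 * saved to hor_buf_before_sao when LCU (x, y) finishes, mirroring the
 * lcu->above->left / !lcu->right checks in the hunk above. Tile size is hypothetical. */
#include <stdio.h>

int main(void) {
  const int w = 4, h = 3;                /* example tile size in LCUs */
  for (int y = 1; y < h; ++y) {          /* only LCUs with a row above trigger a copy */
    for (int x = 0; x < w; ++x) {
      if (x > 0)                         /* lcu->above->left exists */
        printf("LCU (%d,%d) done -> save bottom row of LCU (%d,%d)\n", x, y, x - 1, y - 1);
      if (x == w - 1)                    /* last LCU of the row: also save the one directly above */
        printf("LCU (%d,%d) done -> save bottom row of LCU (%d,%d)\n", x, y, x, y - 1);
    }
  }
  return 0;
}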
@@ -1429,6 +1458,66 @@ static void worker_encoder_state_encode_children(void * opaque) {
  }
}

typedef struct {
  int y;
  const encoder_state * encoder_state;
} worker_sao_reconstruct_lcu_data;

// ./kvazaar -i /scratch/h265-encode/pedestrian_area_1080p25.yuv --input-res 1920x1080 -o /tmp/out.h265 --qp 23 -p 60 --frames 10
// Processed 10 frames, 5063552 bits AVG PSNR: 42.9771 46.0609 48.0985
// Total time: 19.440 s.
void worker_sao_reconstruct_lcu(void *opaque) {
  worker_sao_reconstruct_lcu_data *data = opaque;
  picture * const cur_pic = data->encoder_state->tile->cur_pic;
  unsigned stride = cur_pic->width_in_lcu;
  int x;

  //TODO: copy only needed data
  pixel *new_y_data = MALLOC(pixel, cur_pic->width * cur_pic->height);
  pixel *new_u_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2);
  pixel *new_v_data = MALLOC(pixel, (cur_pic->width * cur_pic->height) >> 2);

  const int offset = cur_pic->width * (data->y*LCU_WIDTH);
  const int offset_c = cur_pic->width/2 * (data->y*LCU_WIDTH_C);
  int num_pixels = cur_pic->width * (LCU_WIDTH + 2);

  if (num_pixels + offset > cur_pic->width * cur_pic->height) {
    num_pixels = cur_pic->width * cur_pic->height - offset;
  }

  memcpy(&new_y_data[offset], &cur_pic->y_recdata[offset], sizeof(pixel) * num_pixels);
  memcpy(&new_u_data[offset_c], &cur_pic->u_recdata[offset_c], sizeof(pixel) * num_pixels >> 2);
  memcpy(&new_v_data[offset_c], &cur_pic->v_recdata[offset_c], sizeof(pixel) * num_pixels >> 2);

  if (data->y>0) {
    //copy first row from buffer
    memcpy(&new_y_data[cur_pic->width * (data->y*LCU_WIDTH-1)], &data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y-1)], cur_pic->width * sizeof(pixel));
    memcpy(&new_u_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel));
    memcpy(&new_v_data[cur_pic->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y-1)], cur_pic->width/2 * sizeof(pixel));
  }
  //assertions to be sure everything's ok for the next line (don't bother with the last one)
  /* These assertions may not be true if the rows are not processed in order. To avoid having an artificial dependency between rows, it's better to remove them.
  assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->y[cur_pic->width * (data->y)], &cur_pic->y_recdata[cur_pic->width * ((data->y + 1)*LCU_WIDTH-1)], cur_pic->width * sizeof(pixel))==0);
  assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->u[cur_pic->width/2 * (data->y)], &cur_pic->u_recdata[cur_pic->width/2 * ((data->y + 1)*LCU_WIDTH_C-1)], cur_pic->width/2 * sizeof(pixel))==0);
  assert((data->y >= cur_pic->height_in_lcu - 1) || memcmp(&data->encoder_state->tile->hor_buf_before_sao->v[cur_pic->width/2 * (data->y)], &cur_pic->v_recdata[cur_pic->width/2 * ((data->y + 1)*LCU_WIDTH_C-1)], cur_pic->width/2 * sizeof(pixel))==0);*/

  for (x = 0; x < cur_pic->width_in_lcu; x++) {
    // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
    sao_info *sao_luma = &cur_pic->sao_luma[data->y * stride + x];
    sao_info *sao_chroma = &cur_pic->sao_chroma[data->y * stride + x];
    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_y_data, x, data->y, sao_luma, COLOR_Y);
    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_u_data, x, data->y, sao_chroma, COLOR_U);
    sao_reconstruct(data->encoder_state->encoder_control, cur_pic, new_v_data, x, data->y, sao_chroma, COLOR_V);
  }

  free(new_y_data);
  free(new_u_data);
  free(new_v_data);

  free(opaque);
}


static int tree_is_a_chain(const encoder_state * const encoder_state) {
  if (!encoder_state->children[0].encoder_control) return 1;
  if (encoder_state->children[1].encoder_control) return 0;
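In the worker above, row r of hor_buf_before_sao holds the pre-SAO bottom pixel row of LCU row r, so the job for LCU row y copies buffer row y - 1 back in as picture row y*LCU_WIDTH - 1 before filtering. A standalone sketch of that index mapping, assuming the usual 64-pixel LCU and an example 1920-pixel-wide picture:

/* Standalone sketch (not part of the commit): the luma offsets used when the
 * SAO worker restores the row above from hor_buf_before_sao. LCU_WIDTH = 64 and
 * the picture width are assumptions for illustration. */
#include <stdio.h>

#define LCU_WIDTH 64

int main(void) {
  const int width = 1920;                        /* example picture width in pixels */
  for (int y = 1; y < 4; ++y) {                  /* rows that have a row above them */
    const int src = width * (y - 1);             /* start of hor_buf_before_sao row y-1 */
    const int dst = width * (y * LCU_WIDTH - 1); /* picture row just above LCU row y */
    printf("LCU row %d: copy buffer offset %d -> picture offset %d (pixel row %d)\n",
           y, src, dst, y * LCU_WIDTH - 1);
  }
  return 0;
}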
@@ -1482,15 +1571,38 @@ static void encoder_state_encode(encoder_state * const main_state) {
        worker_encoder_state_encode_children(&(main_state->children[i]));
      }
    }
    threadqueue_flush(main_state->encoder_control->threadqueue);

    //If children are wavefront, we need to reconstruct SAO
    if (main_state->encoder_control->sao_enable && main_state->children[0].type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
      PERFORMANCE_MEASURE_START();
      sao_reconstruct_frame(main_state);
      PERFORMANCE_MEASURE_END(main_state->encoder_control->threadqueue, "type=sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d", main_state->global->frame, main_state->tile->id, main_state->slice->id,0,main_state->encoder_control->in.height_in_lcu - 1);
    }
      int y;
      picture * const cur_pic = main_state->tile->cur_pic;

      for (y = 0; y < cur_pic->height_in_lcu; ++y) {
        worker_sao_reconstruct_lcu_data *data = MALLOC(worker_sao_reconstruct_lcu_data, 1);
        threadqueue_job *job;
#ifdef _DEBUG
        char job_description[256];
        sprintf(job_description, "frame=%d,tile=%d,position_y=%d", main_state->global->frame, main_state->tile->id, y + main_state->tile->lcu_offset_y);
#else
        char* job_description = NULL;
#endif
        data->y = y;
        data->encoder_state = main_state;

        job = threadqueue_submit(main_state->encoder_control->threadqueue, worker_sao_reconstruct_lcu, data, 1, job_description);

        if (y < cur_pic->height_in_lcu - 1) {
          //Not last row: depend on the last LCU of the row below
          threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 1) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]);
        } else {
          //Last row: depend on the last LCU of the row
          threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 0) * cur_pic->width_in_lcu + cur_pic->width_in_lcu - 1]);
        }
        threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, job);
      }
    }
    threadqueue_flush(main_state->encoder_control->threadqueue);
  } else {
    for (i=0; main_state->children[i].encoder_control; ++i) {
      worker_encoder_state_encode_children(&(main_state->children[i]));
@@ -180,6 +180,9 @@ typedef struct {
  //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index)
  yuv_t *ver_buf_search;

  yuv_t *hor_buf_before_sao;
  yuv_t *ver_buf_before_sao;

  //Job pointers for wavefronts
  threadqueue_job **wf_jobs;
} encoder_state_config_tile;
@@ -208,7 +211,6 @@ typedef struct lcu_order_element {
  struct encoder_state *encoder_state;
  vector2d position;
  vector2d position_px; //Top-left
  vector2d position_next_px; //Right-bottom
  vector2d size;
  int first_column;
  int first_row;