diff --git a/src/encoder.c b/src/encoder.c index db312234..003387a9 100644 --- a/src/encoder.c +++ b/src/encoder.c @@ -50,6 +50,9 @@ int8_t g_convert_to_bit[LCU_WIDTH + 1]; /* Local functions. */ static void add_checksum(encoder_control* encoder); static void encode_VUI(encoder_control* encoder); +static void encode_sao(encoder_control *encoder, + unsigned x_lcu, uint16_t y_lcu, + sao_info *sao_luma, sao_info *sao_chroma); /** * Initialize g_sig_last_scan with scan positions for a transform block of @@ -383,11 +386,18 @@ static void write_aud(encoder_control* encoder) void encode_one_frame(encoder_control* encoder) { + yuv_t *hor_buf = alloc_yuv_t(encoder->in.width); + // Allocate 2 extra luma pixels so we get 1 extra chroma pixel for the + // for the extra pixel on the top right. + yuv_t *ver_buf = alloc_yuv_t(LCU_WIDTH + 2); + const int is_first_frame = (encoder->frame == 0); const int is_i_radl = (encoder->cfg->intra_period == 1 && encoder->frame % 2 == 0); const int is_p_radl = (encoder->cfg->intra_period > 1 && (encoder->frame % encoder->cfg->intra_period) == 0); const int is_radl_frame = is_first_frame || is_i_radl || is_p_radl; + picture *pic = encoder->in.cur_pic; + // Initialize lambda value(s) to use in search init_lambda(encoder); @@ -464,17 +474,91 @@ void encode_one_frame(encoder_control* encoder) { vector2d lcu; + const vector2d size = { encoder->in.width, encoder->in.height }; + const vector2d size_lcu = { encoder->in.width_in_lcu, encoder->in.height_in_lcu }; for (lcu.y = 0; lcu.y < encoder->in.height_in_lcu; lcu.y++) { for (lcu.x = 0; lcu.x < encoder->in.width_in_lcu; lcu.x++) { const vector2d px = { lcu.x * LCU_WIDTH, lcu.y * LCU_WIDTH }; - search_lcu(encoder, px.x, px.y); + // Handle partial LCUs on the right and bottom. 
+ const vector2d lcu_dim = { + MIN(LCU_WIDTH, size.x - px.x), MIN(LCU_WIDTH, size.y - px.y) + }; + const int right = px.x + lcu_dim.x; + const int bottom = px.y + lcu_dim.y; + + search_lcu(encoder, px.x, px.y, hor_buf, ver_buf); + + // Take the bottom right pixel from the LCU above and put it as the + // first pixel in this LCUs rightmost pixels. + if (lcu.y > 0) { + ver_buf->y[0] = hor_buf->y[right - 1]; + ver_buf->u[0] = hor_buf->u[right / 2 - 1]; + ver_buf->v[0] = hor_buf->v[right / 2 - 1]; + } + + // Take bottom and right pixels from this LCU to be used on the search of next LCU. + picture_blit_pixels(&pic->y_recdata[(bottom - 1) * size.x + px.x], + &hor_buf->y[px.x], + lcu_dim.x, 1, size.x, size.x); + picture_blit_pixels(&pic->u_recdata[(bottom / 2 - 1) * size.x / 2 + px.x / 2], + &hor_buf->u[px.x / 2], + lcu_dim.x / 2, 1, size.x / 2, size.x / 2); + picture_blit_pixels(&pic->v_recdata[(bottom / 2 - 1) * size.x / 2 + px.x / 2], + &hor_buf->v[px.x / 2], + lcu_dim.x / 2, 1, size.x / 2, size.x / 2); + + picture_blit_pixels(&pic->y_recdata[px.y * size.x + right - 1], + &ver_buf->y[1], + 1, lcu_dim.y, size.x, 1); + picture_blit_pixels(&pic->u_recdata[px.y * size.x / 4 + (right / 2) - 1], + &ver_buf->u[1], + 1, lcu_dim.y / 2, size.x / 2, 1); + picture_blit_pixels(&pic->v_recdata[px.y * size.x / 4 + (right / 2) - 1], + &ver_buf->v[1], + 1, lcu_dim.y / 2, size.x / 2, 1); + + if (encoder->deblock_enable) { + filter_deblock_lcu(encoder, px.x, px.y); + } + + if (encoder->sao_enable) { + const int stride = encoder->in.width_in_lcu; + sao_info *sao_luma = &pic->sao_luma[lcu.y * stride + lcu.x]; + sao_info *sao_chroma = &pic->sao_chroma[lcu.y * stride + lcu.x]; + init_sao_info(sao_luma); + init_sao_info(sao_chroma); + + { + sao_info *sao_top = lcu. y != 0 ? &pic->sao_luma[(lcu.y - 1) * stride + lcu.x] : NULL; + sao_info *sao_left = lcu.x != 0 ? 
&pic->sao_luma[lcu.y * stride + lcu.x -1] : NULL; + sao_search_luma(encoder->in.cur_pic, lcu.x, lcu.y, sao_luma, sao_top, sao_left); + } + + { + sao_info *sao_top = lcu.y != 0 ? &pic->sao_chroma[(lcu.y - 1) * stride + lcu.x] : NULL; + sao_info *sao_left = lcu.x != 0 ? &pic->sao_chroma[lcu.y * stride + lcu.x - 1] : NULL; + sao_search_chroma(encoder->in.cur_pic, lcu.x, lcu.y, sao_chroma, sao_top, sao_left); + } + + // Merge only if both luma and chroma can be merged + sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag; + sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag; + + encode_sao(encoder, lcu.x, lcu.y, sao_luma, sao_chroma); + } + + encode_coding_tree(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0); + + { + const int last_lcu = (lcu.x == size_lcu.x - 1 && lcu.y == size_lcu.y - 1); + cabac_encode_bin_trm(&cabac, last_lcu ? 1 : 0); // end_of_slice_segment_flag + } } } } - encode_slice_data(encoder); cabac_flush(&cabac); bitstream_align(encoder->stream); bitstream_flush(encoder->stream); @@ -491,10 +575,17 @@ void encode_one_frame(encoder_control* encoder) bitstream_clear_buffer(encoder->stream); + if (encoder->sao_enable) { + sao_reconstruct_frame(encoder); + } + // Calculate checksum add_checksum(encoder); encoder->in.cur_pic->poc = encoder->poc; + + dealloc_yuv_t(hor_buf); + dealloc_yuv_t(ver_buf); } static void fill_after_frame(unsigned height, unsigned array_width, @@ -1190,7 +1281,7 @@ static void encode_sao_merge_flags(sao_info *sao, } /** - * \brief Stub that encodes all LCU's as none type. + * \brief Encode SAO information. 
*/ static void encode_sao(encoder_control *encoder, unsigned x_lcu, uint16_t y_lcu, @@ -1207,82 +1298,6 @@ static void encode_sao(encoder_control *encoder, } } -void encode_slice_data(encoder_control* encoder) -{ - uint16_t x_ctb, y_ctb; - picture *pic = encoder->in.cur_pic; - const vector2d size_lcu = { encoder->in.width_in_lcu, encoder->in.height_in_lcu }; - - // Filtering - if(encoder->deblock_enable) { - filter_deblock(encoder); - } - - if (encoder->sao_enable) { - pixel *new_y_data = MALLOC(pixel, pic->width * pic->height); - pixel *new_u_data = MALLOC(pixel, (pic->width * pic->height) >> 2); - pixel *new_v_data = MALLOC(pixel, (pic->width * pic->height) >> 2); - memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height); - memcpy(new_u_data, pic->u_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2); - memcpy(new_v_data, pic->v_recdata, sizeof(pixel) * (pic->width * pic->height) >> 2); - - for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) { - for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) { - unsigned stride = encoder->in.width_in_lcu; - - //Fetch luma top and left merge candidate - sao_info *sao_top = y_ctb!=0?&pic->sao_luma[(y_ctb-1) * stride + x_ctb]:NULL; - sao_info *sao_left = x_ctb!=0?&pic->sao_luma[y_ctb * stride + x_ctb -1]:NULL; - - sao_info *sao_luma = &pic->sao_luma[y_ctb * stride + x_ctb]; - sao_info *sao_chroma = &pic->sao_chroma[y_ctb * stride + x_ctb]; - init_sao_info(sao_luma); - init_sao_info(sao_chroma); - - sao_search_luma(encoder->in.cur_pic, x_ctb, y_ctb, sao_luma, sao_top, sao_left); - // Chroma top and left merge candidate - sao_top = y_ctb!=0?&pic->sao_chroma[(y_ctb-1) * stride + x_ctb]:NULL; - sao_left = x_ctb!=0?&pic->sao_chroma[y_ctb * stride + x_ctb -1]:NULL; - sao_search_chroma(encoder->in.cur_pic, x_ctb, y_ctb, sao_chroma, sao_top, sao_left); - - // Merge only if both luma and chroma can be merged - sao_luma->merge_left_flag = sao_luma->merge_left_flag & sao_chroma->merge_left_flag; - 
sao_luma->merge_up_flag = sao_luma->merge_up_flag & sao_chroma->merge_up_flag; - - // sao_do_rdo(encoder, x_ctb, y_ctb, sao_luma, sao_chroma); - sao_reconstruct(pic, new_y_data, x_ctb, y_ctb, sao_luma, COLOR_Y); - sao_reconstruct(pic, new_u_data, x_ctb, y_ctb, sao_chroma, COLOR_U); - sao_reconstruct(pic, new_v_data, x_ctb, y_ctb, sao_chroma, COLOR_V); - } - } - - free(new_y_data); - free(new_u_data); - free(new_v_data); - } - - // Loop through every LCU in the slice - for (y_ctb = 0; y_ctb < encoder->in.height_in_lcu; y_ctb++) { - for (x_ctb = 0; x_ctb < encoder->in.width_in_lcu; x_ctb++) { - uint8_t depth = 0; - const int last_lcu = (x_ctb == size_lcu.x - 1 && y_ctb == size_lcu.y - 1); - - if (encoder->sao_enable) { - picture *pic = encoder->in.cur_pic; - unsigned stride = encoder->in.width_in_lcu; - sao_info sao_luma = pic->sao_luma[y_ctb * stride + x_ctb]; - sao_info sao_chroma = pic->sao_chroma[y_ctb * stride + x_ctb]; - - encode_sao(encoder, x_ctb, y_ctb, &sao_luma, &sao_chroma); - } - - // Recursive function for looping through all the sub-blocks - encode_coding_tree(encoder, x_ctb << MAX_DEPTH, y_ctb << MAX_DEPTH, depth); - - cabac_encode_bin_trm(&cabac, last_lcu ? 
1 : 0); // end_of_slice_segment_flag - } - } -} void encode_coding_tree(encoder_control *encoder, uint16_t x_ctb, uint16_t y_ctb, uint8_t depth) diff --git a/src/encoder.h b/src/encoder.h index b4f6dd22..8fda103c 100644 --- a/src/encoder.h +++ b/src/encoder.h @@ -109,7 +109,6 @@ int read_one_frame(FILE *file, encoder_control *encoder); void encode_seq_parameter_set(encoder_control *encoder); void encode_pic_parameter_set(encoder_control *encoder); void encode_vid_parameter_set(encoder_control *encoder); -void encode_slice_data(encoder_control *encoder); void encode_slice_header(encoder_control *encoder); void encode_access_unit_delimiter(encoder_control* encoder); void encode_prefix_sei_version(encoder_control* encoder); diff --git a/src/filter.c b/src/filter.c index 23796227..c1ea9675 100644 --- a/src/filter.c +++ b/src/filter.c @@ -166,26 +166,30 @@ void filter_deblock_edge_luma(encoder_control *encoder, int32_t xpos, int32_t ypos, int8_t depth, int8_t dir) { - int32_t stride = encoder->in.cur_pic->width; - int32_t offset = stride; - int32_t beta_offset_div2 = encoder->beta_offset_div2; - int32_t tc_offset_div2 = encoder->tc_offset_div2; - // TODO: support 10+bits - pixel *orig_src = &encoder->in.cur_pic->y_recdata[xpos + ypos*stride]; - pixel *src = orig_src; - int32_t step = 1; cu_info *cu_q = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(xpos>>MIN_SIZE) + (ypos>>MIN_SIZE) * (encoder->in.width_in_lcu << MAX_DEPTH)]; - cu_info *cu_p = NULL; - int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE; - int8_t strength = 0; - - if(dir == EDGE_VER) { - offset = 1; - step = stride; + { + // Return if called with a coordinate which is not at CU or TU boundary. + // TODO: Add handling for asymmetric inter CU boundaries which do not coincide + // with transform boundaries. 
+ const int tu_width = LCU_WIDTH >> cu_q->tr_depth; + if (dir == EDGE_HOR && (ypos & (tu_width - 1))) return; + if (dir == EDGE_VER && (xpos & (tu_width - 1))) return; } { + int32_t stride = encoder->in.cur_pic->width; + int32_t offset = stride; + int32_t beta_offset_div2 = encoder->beta_offset_div2; + int32_t tc_offset_div2 = encoder->tc_offset_div2; + // TODO: support 10+bits + pixel *orig_src = &encoder->in.cur_pic->y_recdata[xpos + ypos*stride]; + pixel *src = orig_src; + int32_t step = 1; + cu_info *cu_p = NULL; + int16_t x_cu = xpos>>MIN_SIZE,y_cu = ypos>>MIN_SIZE; + int8_t strength = 0; + int32_t qp = encoder->QP; int32_t bitdepth_scale = 1 << (g_bitdepth - 8); int32_t b_index = CLIP(0, 51, qp + (beta_offset_div2 << 1)); @@ -194,13 +198,30 @@ void filter_deblock_edge_luma(encoder_control *encoder, uint32_t blocks_in_part = (LCU_WIDTH >> depth) / 4; uint32_t block_idx; int32_t tc_index,tc,thr_cut; + + if (dir == EDGE_VER) { + offset = 1; + step = stride; + } + // TODO: add CU based QP calculation // For each 4-pixel part in the edge for (block_idx = 0; block_idx < blocks_in_part; ++block_idx) { int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d; - if((block_idx & 1) == 0) + { + vector2d px = { + (dir == EDGE_HOR ? xpos + block_idx * 4 : xpos), + (dir == EDGE_VER ? ypos + block_idx * 4 : ypos) + }; + + // Don't deblock the last 4x4 block of the LCU. This will be deblocked + // when processing the next LCU. + if (block_idx > 0 && dir == EDGE_HOR && (px.x + 4) % 64 == 0 && (px.x + 4 != encoder->in.width)) { + continue; + } + // CU in the side we are filtering, update every 8-pixels cu_p = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0)) + (y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? 
block_idx>>1 : 0)) @@ -269,45 +290,68 @@ void filter_deblock_edge_chroma(encoder_control *encoder, int32_t x, int32_t y, int8_t depth, int8_t dir) { - int32_t stride = encoder->in.cur_pic->width >> 1; - int32_t tc_offset_div2 = encoder->tc_offset_div2; - // TODO: support 10+bits - pixel *src_u = &encoder->in.cur_pic->u_recdata[x + y*stride]; - pixel *src_v = &encoder->in.cur_pic->v_recdata[x + y*stride]; - // Init offset and step to EDGE_HOR - int32_t offset = stride; - int32_t step = 1; cu_info *cu_q = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x>>(MIN_SIZE-1)) + (y>>(MIN_SIZE-1)) * (encoder->in.width_in_lcu << MAX_DEPTH)]; - cu_info *cu_p = NULL; - int16_t x_cu = x>>(MIN_SIZE-1),y_cu = y>>(MIN_SIZE-1); - int8_t strength = 2; - // We cannot filter edges not on 8x8 grid - if((depth == MAX_DEPTH && (( (y & 0x7) && dir == EDGE_HOR ) || ( (x & 0x7) && dir == EDGE_VER ) ) )) - { - return; + // Chroma edges that do not lay on a 8x8 grid are not deblocked. + if (depth >= MAX_DEPTH) { + if (dir == EDGE_HOR && (y & (8 - 1))) return; + if (dir == EDGE_VER && (x & (8 - 1))) return; } - if(dir == EDGE_VER) { - offset = 1; - step = stride; + // Return if called with a coordinate which is not at CU or TU boundary. + // TODO: Add handling for asymmetric inter CU boundaries which do not coincide + // with transform boundaries. 
+ const int tu_width = (LCU_WIDTH / 2) >> cu_q->tr_depth; + if (dir == EDGE_HOR && (y & (tu_width - 1))) return; + if (dir == EDGE_VER && (x & (tu_width - 1))) return; } // For each subpart { + int32_t stride = encoder->in.cur_pic->width >> 1; + int32_t tc_offset_div2 = encoder->tc_offset_div2; + // TODO: support 10+bits + pixel *src_u = &encoder->in.cur_pic->u_recdata[x + y*stride]; + pixel *src_v = &encoder->in.cur_pic->v_recdata[x + y*stride]; + // Init offset and step to EDGE_HOR + int32_t offset = stride; + int32_t step = 1; + cu_info *cu_p = NULL; + int16_t x_cu = x>>(MIN_SIZE-1),y_cu = y>>(MIN_SIZE-1); + int8_t strength = 2; + int32_t QP = g_chroma_scale[encoder->QP]; int32_t bitdepth_scale = 1 << (g_bitdepth-8); int32_t TC_index = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1))); int32_t Tc = g_tc_table_8x8[TC_index]*bitdepth_scale; - uint32_t blocks_in_part= (LCU_WIDTH>>(depth+1)) / 4; + + // Special handling for depth 4. It's meaning is that we want to bypass + // last block in LCU check in order to deblock just that block. + uint32_t blocks_in_part= (LCU_WIDTH>>(depth == 4 ? depth : depth + 1)) / 4; uint32_t blk_idx; + if(dir == EDGE_VER) { + offset = 1; + step = stride; + } + for (blk_idx = 0; blk_idx < blocks_in_part; ++blk_idx) { + vector2d px = { + (dir == EDGE_HOR ? x + blk_idx * 4 : x), + (dir == EDGE_VER ? y + blk_idx * 4 : y) + }; cu_p = &encoder->in.cur_pic->cu_array[MAX_DEPTH][(x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? blk_idx : 0)) + (y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? blk_idx : 0)) * (encoder->in.width_in_lcu << MAX_DEPTH)]; + + // Don't deblock the last 4x4 block of the LCU. This will be deblocked + // when processing the next LCU. 
+ if (depth != 4 && dir == EDGE_HOR && (px.x + 4) % 32 == 0 && (px.x + 4 != encoder->in.width / 2)) { + continue; + } + // Only filter when strenght == 2 (one of the blocks is intra coded) if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) { // Chroma U @@ -406,6 +450,40 @@ void filter_deblock(encoder_control* encoder) } +/** + * \brief Deblock a single LCU without using data from right or down. + * + * Filter all the following edges: + * - All edges within the LCU, except for the last 4 pixels on the right when + * using horizontal filtering. + * - Left edge and top edge. + * - After vertical filtering the left edge, filter the last 4 pixels of + * horizontal edges in the LCU to the left. + */ +void filter_deblock_lcu(encoder_control *encoder, int x_px, int y_px) +{ + const vector2d lcu = { x_px / LCU_WIDTH, y_px / LCU_WIDTH }; + + filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_VER); + + // Filter rightmost 4 pixels from last LCU now that they have been + // finally deblocked vertically. 
+ if (lcu.x > 0) { + int y; + for (y = 0; y < LCU_WIDTH; y += 8) { + if (lcu.y + y == 0) continue; + filter_deblock_edge_luma(encoder, lcu.x * LCU_WIDTH - 4, lcu.y * LCU_WIDTH + y, 4, EDGE_HOR); + } + for (y = 0; y < LCU_WIDTH / 2; y += 8) { + if (lcu.y + y == 0) continue; + filter_deblock_edge_chroma(encoder, lcu.x * (LCU_WIDTH / 2) - 4, lcu.y * (LCU_WIDTH / 2) + y, 4, EDGE_HOR); + } + } + + filter_deblock_cu(encoder, lcu.x << MAX_DEPTH, lcu.y << MAX_DEPTH, 0, EDGE_HOR); +} + + /** * \brief Interpolation for chroma half-pixel * \param src source image in integer pels (-2..width+3, -2..height+3) diff --git a/src/filter.h b/src/filter.h index f0bdc3ff..fd33d9e6 100644 --- a/src/filter.h +++ b/src/filter.h @@ -32,7 +32,7 @@ ////////////////////////////////////////////////////////////////////////// // FUNCTIONS // Deblocking -void filter_deblock_cu(encoder_control *encoder, int32_t x_cu, int32_t y_cu, +void filter_deblock_cu(encoder_control *encoder, int32_t x_px, int32_t y_px, int8_t depth, int32_t edge); void filter_deblock_edge_luma(encoder_control *encoder, int32_t x_pos, int32_t y_pos, @@ -41,6 +41,7 @@ void filter_deblock_edge_chroma(encoder_control *encoder, int32_t xpos, int32_t ypos, int8_t depth, int8_t dir); void filter_deblock(encoder_control *encoder); +void filter_deblock_lcu(encoder_control *encoder, int x_px, int y_px); void filter_deblock_luma(pixel *src, int32_t offset, int32_t tc , int8_t sw, int8_t part_p_nofilter, int8_t part_q_nofilter, int32_t thr_cut, diff --git a/src/picture.c b/src/picture.c index 230f870d..14e60322 100644 --- a/src/picture.c +++ b/src/picture.c @@ -33,6 +33,50 @@ #define PSNRMAX (255.0 * 255.0) + +/** + * \brief Allocate a yuv_t with luma_size luma pixels and luma_size / 2 + * pixels in each chroma plane. + * \return The new yuv_t, or NULL if any allocation fails. + */ +yuv_t * alloc_yuv_t(int luma_size) +{ + // Get buffers with separate mallocs in order to take advantage of + // automatic buffer overrun checks.
+ yuv_t *yuv = (yuv_t *)malloc(sizeof(*yuv)); + if (yuv == NULL) return NULL; + + yuv->y = (pixel *)malloc(luma_size * sizeof(*yuv->y)); + yuv->u = (pixel *)malloc(luma_size / 2 * sizeof(*yuv->u)); + yuv->v = (pixel *)malloc(luma_size / 2 * sizeof(*yuv->v)); + yuv->size = luma_size; + + // Release everything if any plane could not be allocated; free(NULL) is a no-op. + if (yuv->y == NULL || yuv->u == NULL || yuv->v == NULL) { + free(yuv->y); + free(yuv->u); + free(yuv->v); + free(yuv); + return NULL; + } + + return yuv; +} + +/** + * \brief Free a yuv_t created by alloc_yuv_t. Passing NULL is a no-op. + */ +void dealloc_yuv_t(yuv_t * yuv) +{ + if (yuv == NULL) return; + + free(yuv->y); + free(yuv->u); + free(yuv->v); + free(yuv); +} + + /** * \brief BLock Image Transfer from one buffer to another. * diff --git a/src/picture.h b/src/picture.h index a7977137..90477617 100644 --- a/src/picture.h +++ b/src/picture.h @@ -177,6 +177,13 @@ typedef struct { pixel v[LCU_CHROMA_SIZE]; } lcu_yuv_t; +typedef struct { + int size; + pixel *y; + pixel *u; + pixel *v; +} yuv_t; + typedef struct { lcu_ref_px_t top_ref; //!< Reference pixels from adjacent LCUs. lcu_ref_px_t left_ref; //!< Reference pixels from adjacent LCUs. @@ -202,6 +209,9 @@ typedef struct { ////////////////////////////////////////////////////////////////////////// // FUNCTIONS +yuv_t * alloc_yuv_t(int luma_size); +void dealloc_yuv_t(yuv_t * yuv); + picture * picture_init(int32_t width, int32_t height, int32_t width_in_lcu, int32_t height_in_lcu); int picture_destroy(picture *pic); diff --git a/src/sao.c b/src/sao.c index 3685df66..8edab2db 100644 --- a/src/sao.c +++ b/src/sao.c @@ -634,3 +634,36 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf rec_list[0] = rec; sao_search_best_mode(orig_list, rec_list, block_width, block_height, 1, sao, sao_top, sao_left); } + +void sao_reconstruct_frame(encoder_control *encoder) +{ + vector2d lcu; + picture *pic = encoder->in.cur_pic; + + // These are needed because SAO needs the pre-SAO pixels from left and + // top LCUs. Single pixel wide buffers, like what search_lcu takes, would + // be enough though.
+ pixel *new_y_data = MALLOC(pixel, pic->width * pic->height); + pixel *new_u_data = MALLOC(pixel, (pic->width * pic->height) >> 2); + pixel *new_v_data = MALLOC(pixel, (pic->width * pic->height) >> 2); + memcpy(new_y_data, pic->y_recdata, sizeof(pixel) * pic->width * pic->height); + memcpy(new_u_data, pic->u_recdata, sizeof(pixel) * ((pic->width * pic->height) >> 2)); + memcpy(new_v_data, pic->v_recdata, sizeof(pixel) * ((pic->width * pic->height) >> 2)); + + for (lcu.y = 0; lcu.y < encoder->in.height_in_lcu; lcu.y++) { + for (lcu.x = 0; lcu.x < encoder->in.width_in_lcu; lcu.x++) { + unsigned stride = encoder->in.width_in_lcu; + sao_info *sao_luma = &pic->sao_luma[lcu.y * stride + lcu.x]; + sao_info *sao_chroma = &pic->sao_chroma[lcu.y * stride + lcu.x]; + + // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma); + sao_reconstruct(pic, new_y_data, lcu.x, lcu.y, sao_luma, COLOR_Y); + sao_reconstruct(pic, new_u_data, lcu.x, lcu.y, sao_chroma, COLOR_U); + sao_reconstruct(pic, new_v_data, lcu.x, lcu.y, sao_chroma, COLOR_V); + } + } + + free(new_y_data); + free(new_u_data); + free(new_v_data); +} diff --git a/src/sao.h b/src/sao.h index deaa3a71..45733ceb 100644 --- a/src/sao.h +++ b/src/sao.h @@ -51,5 +51,6 @@ void sao_search_luma(const picture *pic, unsigned x_ctb, unsigned y_ctb, sao_inf void sao_reconstruct(picture *pic, const pixel *old_rec, unsigned x_ctb, unsigned y_ctb, const sao_info *sao, color_index color_i); +void sao_reconstruct_frame(encoder_control *encoder); #endif diff --git a/src/search.c b/src/search.c index 0a220254..75ac55a3 100644 --- a/src/search.c +++ b/src/search.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "config.h" #include "bitstream.h" @@ -882,7 +883,7 @@ static int search_cu(encoder_control *encoder, int x, int y, int depth, lcu_t wo * - Copy reference pixels from neighbouring LCUs. * - Copy reference pixels from this LCU.
*/ -static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t *lcu) +static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t *lcu, const yuv_t *hor_buf, const yuv_t *ver_buf) { // Copy reference cu_info structs from neighbouring LCUs. { @@ -930,58 +931,33 @@ static void init_lcu_t(encoder_control *encoder, const int x, const int y, lcu_t // Copy reference pixels. { - const picture *pic = encoder->in.cur_pic; - const int pic_width = encoder->in.width; - const int pic_height = encoder->in.height; - const int ref_size = LCU_REF_PX_WIDTH; - - const int pic_width_c = encoder->in.width / 2; - const int pic_height_c = encoder->in.height / 2; - const int ref_size_c = LCU_REF_PX_WIDTH / 2; - const int x_c = x / 2; - const int y_c = y / 2; // Copy top reference pixels. if (y > 0) { - int x_max = MIN(ref_size, pic_width - x); - int x_max_c = x_max / 2; - picture_blit_pixels(&pic->y_recdata[x + (y - 1) * pic_width], - &lcu->top_ref.y[1], - x_max, 1, pic_width, ref_size); - - picture_blit_pixels(&pic->u_recdata[x_c + (y_c - 1) * pic_width_c], - &lcu->top_ref.u[1], - x_max, 1, pic_width_c, ref_size_c); - picture_blit_pixels(&pic->v_recdata[x_c + (y_c - 1) * pic_width_c], - &lcu->top_ref.v[1], - x_max, 1, pic_width_c, ref_size_c); + // hor_buf holds one luma pixel per picture column (encoder->in.width), so + // there might not be LCU_REF_PX_WIDTH allocated pixels left. + int x_max = MIN(LCU_REF_PX_WIDTH, encoder->in.width - x); + memcpy(&lcu->top_ref.y[1], &hor_buf->y[x], x_max); + memcpy(&lcu->top_ref.u[1], &hor_buf->u[x / 2], x_max / 2); + memcpy(&lcu->top_ref.v[1], &hor_buf->v[x / 2], x_max / 2); } // Copy left reference pixels.
if (x > 0) { - int y_max = MIN(LCU_REF_PX_WIDTH, pic_height - y); - int y_max_c = y_max / 2; - picture_blit_pixels(&pic->y_recdata[(x - 1) + y * pic_width], - &lcu->left_ref.y[1], - 1, y_max, pic_width, 1); - - picture_blit_pixels(&pic->u_recdata[(x_c - 1) + (y_c) * pic_width_c], - &lcu->left_ref.u[1], - 1, y_max_c, pic_width_c, 1); - picture_blit_pixels(&pic->v_recdata[(x_c - 1) + (y_c) * pic_width_c], - &lcu->left_ref.v[1], - 1, y_max_c, pic_width_c, 1); + memcpy(&lcu->left_ref.y[1], &ver_buf->y[1], LCU_WIDTH); + memcpy(&lcu->left_ref.u[1], &ver_buf->u[1], LCU_WIDTH / 2); /* chroma planes hold half as many pixels as luma */ + memcpy(&lcu->left_ref.v[1], &ver_buf->v[1], LCU_WIDTH / 2); } // Copy top-left reference pixel. if (x > 0 && y > 0) { - lcu->top_ref.y[0] = pic->y_recdata[(x - 1) + (y - 1) * pic_width]; - lcu->left_ref.y[0] = pic->y_recdata[(x - 1) + (y - 1) * pic_width]; + lcu->top_ref.y[0] = ver_buf->y[0]; + lcu->left_ref.y[0] = ver_buf->y[0]; - lcu->top_ref.u[0] = pic->u_recdata[(x_c - 1) + (y_c - 1) * pic_width_c]; - lcu->left_ref.u[0] = pic->u_recdata[(x_c - 1) + (y_c - 1) * pic_width_c]; + lcu->top_ref.u[0] = ver_buf->u[0]; + lcu->left_ref.u[0] = ver_buf->u[0]; - lcu->top_ref.v[0] = pic->v_recdata[(x_c - 1) + (y_c - 1) * pic_width_c]; - lcu->left_ref.v[0] = pic->v_recdata[(x_c - 1) + (y_c - 1) * pic_width_c]; + lcu->top_ref.v[0] = ver_buf->v[0]; + lcu->left_ref.v[0] = ver_buf->v[0]; } } @@ -1065,14 +1041,14 @@ static void copy_lcu_to_cu_data(encoder_control *encoder, int x_px, int y_px, co * Search LCU for modes. * - Best mode gets copied to current picture. */ -void search_lcu(encoder_control *encoder, int x, int y) +void search_lcu(encoder_control *encoder, int x, int y, yuv_t *hor_buf, yuv_t *ver_buf) { lcu_t work_tree[MAX_PU_DEPTH + 1]; int depth; // Initialize work tree.
for (depth = 0; depth <= MAX_PU_DEPTH; ++depth) { memset(&work_tree[depth], 0, sizeof(work_tree[depth])); - init_lcu_t(encoder, x, y, &work_tree[depth]); + init_lcu_t(encoder, x, y, &work_tree[depth], hor_buf, ver_buf); } // Start search from depth 0. @@ -1080,17 +1056,3 @@ void search_lcu(encoder_control *encoder, int x, int y) copy_lcu_to_cu_data(encoder, x, y, &work_tree[0]); } - - -/** - * Perform mode search for every LCU in the current picture. - */ -static void search_frame(encoder_control *encoder) -{ - int y_lcu, x_lcu; - for (y_lcu = 0; y_lcu < encoder->in.height_in_lcu; y_lcu++) { - for (x_lcu = 0; x_lcu < encoder->in.width_in_lcu; x_lcu++) { - search_lcu(encoder, x_lcu * LCU_WIDTH, y_lcu * LCU_WIDTH); - } - } -} diff --git a/src/search.h b/src/search.h index d774acc9..4734c448 100644 --- a/src/search.h +++ b/src/search.h @@ -27,8 +27,9 @@ #include "global.h" #include "encoder.h" +#include "picture.h" -void search_lcu(encoder_control *encoder, int x, int y); +void search_lcu(encoder_control *encoder, int x, int y, yuv_t *hor_buf, yuv_t *ver_buf); #endif