From e591e89ade3b6b9c39d671fe58a7c4e4f13fb842 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Mon, 15 Sep 2014 15:46:00 +0300 Subject: [PATCH 01/28] Add prediction mode to chroma reconstruction parameters. - Just like in luma. --- src/intra.c | 14 +++++++------- src/intra.h | 2 +- src/search.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/intra.c b/src/intra.c index 84833a58..b4e90b56 100644 --- a/src/intra.c +++ b/src/intra.c @@ -703,7 +703,7 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int } } -void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, lcu_t *lcu) +void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu) { const encoder_control * const encoder = encoder_state->encoder_control; const vector2d lcu_px = { x & 0x3f, y & 0x3f }; @@ -714,10 +714,10 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i if (depth == 0 || cur_cu->tr_depth > depth) { int offset = width / 2; - intra_recon_lcu_chroma(encoder_state, x, y, depth+1, lcu); - intra_recon_lcu_chroma(encoder_state, x + offset, y, depth+1, lcu); - intra_recon_lcu_chroma(encoder_state, x, y + offset, depth+1, lcu); - intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, lcu); + intra_recon_lcu_chroma(encoder_state, x, y, depth+1, intra_mode, lcu); + intra_recon_lcu_chroma(encoder_state, x + offset, y, depth+1, intra_mode, lcu); + intra_recon_lcu_chroma(encoder_state, x, y + offset, depth+1, intra_mode, lcu); + intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu); if (depth < MAX_DEPTH) { cu_info *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) + (lcu_px.y>>3) *LCU_T_CU_WIDTH]; @@ -757,7 +757,7 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i width_c, recbase_u, rec_stride >> 1, - cur_cu->intra[0].mode_chroma, + 
intra_mode, 1); intra_build_reference_border(encoder, x, y,(int16_t)width_c * 2 + 8, rec, (int16_t)width_c * 2 + 8, 2, @@ -768,7 +768,7 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i width_c, recbase_v, rec_stride >> 1, - cur_cu->intra[0].mode_chroma, + intra_mode, 2); } diff --git a/src/intra.h b/src/intra.h index b6d4de42..e01733ae 100644 --- a/src/intra.h +++ b/src/intra.h @@ -49,6 +49,6 @@ void intra_get_angular_pred(const encoder_control *encoder, pixel* src, int32_t void intra_recon(const encoder_control *encoder, pixel* rec, int32_t rec_stride, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma); void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu); -void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, lcu_t *lcu); +void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu); #endif diff --git a/src/search.c b/src/search.c index 01d7084f..98bccbeb 100644 --- a/src/search.c +++ b/src/search.c @@ -1385,7 +1385,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept intra_mode_chroma, cur_cu->part_size); intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); - intra_recon_lcu_chroma(encoder_state, x, y, depth, &work_tree[depth]); + intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); } else if (cur_cu->type == CU_INTER) { int cbf; inter_recon_lcu(encoder_state, encoder_state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]); From 549ac96438e90d3009601fc699a3cd90a31e578a Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 17 Sep 2014 11:52:22 +0300 Subject: [PATCH 02/28] Change costs to doubles to avoid rounding intermediate results. - Helps with debugging. 
--- src/rdo.c | 18 +++---- src/search.c | 137 ++++++++++++++++++++++++--------------------------- 2 files changed, 74 insertions(+), 81 deletions(-) diff --git a/src/rdo.c b/src/rdo.c index 7bf664f3..ff493990 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -196,8 +196,6 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel int16_t block[LCU_WIDTH*LCU_WIDTH>>2]; int16_t temp_block[LCU_WIDTH*LCU_WIDTH>>2]; coefficient temp_coeff[LCU_WIDTH*LCU_WIDTH>>2]; - uint32_t cost = 0; - uint32_t coeffcost = 0; int8_t luma_scan_mode = SCAN_DIAG; int i = 0,x,y; @@ -225,26 +223,28 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel dequant(encoder_state, temp_coeff, pre_quant_coeff, width, width, 0, CU_INTRA); itransform2d(encoder, temp_block,pre_quant_coeff,width,0); + unsigned ssd = 0; // SSD between original and reconstructed for (i = 0; i < width*width; i++) { int diff = temp_block[i]-block[i]; - cost += diff*diff; + ssd += diff*diff; } + double coeff_bits = 0; // Simple RDO if(encoder->rdo == 1) { // SSD between reconstruction and original + sum of coeffs + int coeff_abs = 0; for (i = 0; i < width*width; i++) { - coeffcost += abs((int)temp_coeff[i]); + coeff_abs += abs((int)temp_coeff[i]); } - cost += (1 + coeffcost + (coeffcost>>1))*((int)encoder_state->global->cur_lambda_cost+0.5); + coeff_bits += 1 + 1.5 * coeff_abs; // Full RDO } else if(encoder->rdo >= 2) { - coeffcost = get_coeff_cost(encoder_state, temp_coeff, width, 0, luma_scan_mode); - - cost += coeffcost*((int)encoder_state->global->cur_lambda_cost+0.5); + coeff_bits = get_coeff_cost(encoder_state, temp_coeff, width, 0, luma_scan_mode); } - return cost; + + return (uint32_t)(0.5 + ssd + coeff_bits * encoder_state->global->cur_lambda_cost); } diff --git a/src/search.c b/src/search.c index 98bccbeb..a0fb37c7 100644 --- a/src/search.c +++ b/src/search.c @@ -742,7 +742,7 @@ static void lcu_set_coeff(lcu_t *lcu, int x_px, int y_px, int depth, cu_info *cu * Takes into 
account SSD of reconstruction and the cost of encoding whatever * prediction unit data needs to be coded. */ -static int cu_rd_cost_luma(const encoder_state *const encoder_state, +static double cu_rd_cost_luma(const encoder_state *const encoder_state, const int x_px, const int y_px, const int depth, const cu_info *const pred_cu, lcu_t *const lcu) @@ -753,76 +753,70 @@ static int cu_rd_cost_luma(const encoder_state *const encoder_state, // cur_cu is used for TU parameters. cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH]; - int x, y; - int cost = 0; + double coeff_bits = 0; + double trtree_bits = 0; // Check that lcu is not in assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); - { - int trtree_bits = 0; - if (width <= TR_MAX_WIDTH + if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH - && pred_cu->part_size != SIZE_NxN) { - trtree_bits += 1; // split_transform_flag - } - cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); + && pred_cu->part_size != SIZE_NxN) + { + trtree_bits += 1; // split_transform_flag } + if (tr_cu->tr_depth > depth) { int offset = width / 2; + double sum = 0; - cost += cu_rd_cost_luma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu); - cost += cu_rd_cost_luma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - cost += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - cost += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_luma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_luma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); - return cost; + return sum + trtree_bits * encoder_state->global->cur_lambda_cost; } if (pred_cu->type == 
CU_INTRA || depth > pred_cu->depth) { - int trtree_bits = 1; // cbf_luma - cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); + trtree_bits += 1; // cbf_luma } + unsigned ssd = 0; // SSD between reconstruction and original - for (y = y_px; y < y_px + width; ++y) { - for (x = x_px; x < x_px + width; ++x) { + for (int y = y_px; y < y_px + width; ++y) { + for (int x = x_px; x < x_px + width; ++x) { int diff = (int)lcu->rec.y[y * LCU_WIDTH + x] - (int)lcu->ref.y[y * LCU_WIDTH + x]; - cost += diff*diff; + ssd += diff*diff; } } if (rdo == 1) { - int coeff_cost = 0; + int coeff_abs = 0; // Estimate coding cost to be 1.5 * summ of abs coeffs. - for (y = y_px; y < y_px + width; ++y) { - for (x = x_px; x < x_px + width; ++x) { - coeff_cost += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]); + for (int y = y_px; y < y_px + width; ++y) { + for (int x = x_px; x < x_px + width; ++x) { + coeff_abs += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]); } } - cost += (coeff_cost + (coeff_cost >> 1)) * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); - + coeff_bits += 1.5 * coeff_abs; } else if (rdo >= 2) { - int coeff_cost = 0; - coefficient coeff_temp[32 * 32]; int8_t luma_scan_mode = get_scan_order(pred_cu->type, pred_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth); // Code coeffs using cabac to get a better estimate of real coding costs. 
coefficients_blit(&lcu->coeff.y[(y_px*LCU_WIDTH) + x_px], coeff_temp, width, width, LCU_WIDTH, width); - coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode); - - // Multiply bit count with lambda to get RD-cost - cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); + coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode); } - return cost; + double bits = trtree_bits + coeff_bits; + return ssd + bits * encoder_state->global->cur_lambda_cost; } -static int cu_rd_cost_chroma(const encoder_state *const encoder_state, +static double cu_rd_cost_chroma(const encoder_state *const encoder_state, const int x_px, const int y_px, const int depth, const cu_info *const pred_cu, lcu_t *const lcu) @@ -832,15 +826,13 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state, const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x / 4) + (lcu_px.y / 4)*LCU_T_CU_WIDTH]; - int x, y; - - int cost = 0; + double trtree_bits = 0; + double coeff_bits = 0; assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); if (depth < MAX_PU_DEPTH) { - int trtree_bits = 0; // cbf_c bits are present only when log2TrafoSize > 2 if (tr_cu->tr_depth == depth) { // cbf_c bits are always present at transform depth 0. @@ -850,7 +842,6 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state, trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1); trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1); } - cost += trtree_bits * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); } else if (PU_INDEX(x_px / 4, y_px / 4) != 0) { // For MAX_PU_DEPTH calculate chroma for previous depth for the first // block and return 0 cost for all others. 
@@ -859,57 +850,58 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); + double sum = 0; /* double: an int accumulator would truncate the children's costs */ - cost += cu_rd_cost_chroma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu); - cost += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu); - cost += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu); - cost += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_chroma(encoder_state, x_px, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu); + sum += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); - return cost; + return sum + trtree_bits * encoder_state->global->cur_lambda_cost; } // Chroma SSD - for (y = lcu_px.y; y < lcu_px.y + width; ++y) { - for (x = lcu_px.x; x < lcu_px.x + width; ++x) { + int ssd = 0; + for (int y = lcu_px.y; y < lcu_px.y + width; ++y) { + for (int x = lcu_px.x; x < lcu_px.x + width; ++x) { int diff = (int)lcu->rec.u[y * LCU_WIDTH_C + x] - (int)lcu->ref.u[y * LCU_WIDTH_C + x]; - cost += diff * diff; - diff = (int)lcu->rec.v[y * LCU_WIDTH_C + x] - (int)lcu->ref.v[y * LCU_WIDTH_C + x]; - cost += diff * diff; + ssd += diff * diff; + } + } + for (int y = lcu_px.y; y < lcu_px.y + width; ++y) { + for (int x = lcu_px.x; x < lcu_px.x + width; ++x) { + int diff = (int)lcu->rec.v[y * LCU_WIDTH_C + x] - (int)lcu->ref.v[y * LCU_WIDTH_C + x]; + ssd += diff * diff; } } if (rdo == 1) { - int coeff_cost = 0; + int coeff_abs = 0; // Estimate coding cost to be 1.5 * summ of abs coeffs. 
- for (y = lcu_px.y; y < lcu_px.y + width; ++y) { - for (x = lcu_px.x; x < lcu_px.x + width; ++x) { - coeff_cost += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]); - coeff_cost += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]); + for (int y = lcu_px.y; y < lcu_px.y + width; ++y) { + for (int x = lcu_px.x; x < lcu_px.x + width; ++x) { + coeff_abs += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]); + coeff_abs += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]); } } - cost += (coeff_cost + (coeff_cost >> 1)) * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); + coeff_bits = 1.5 * coeff_abs; } else if (rdo >= 2) { coefficient coeff_temp[16 * 16]; int8_t scan_order = get_scan_order(pred_cu->type, pred_cu->intra[0].mode_chroma, depth); - - int coeff_cost = 0; - + coefficients_blit(&lcu->coeff.u[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x], - coeff_temp, width, width, LCU_WIDTH_C, width); - coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order); - + coeff_temp, width, width, LCU_WIDTH_C, width); + coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order); coefficients_blit(&lcu->coeff.v[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x], - coeff_temp, width, width, LCU_WIDTH_C, width); - coeff_cost += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order); - - // Multiply bit count with lambda to get RD-cost - cost += coeff_cost * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); + coeff_temp, width, width, LCU_WIDTH_C, width); + coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order); } - return cost; + double bits = trtree_bits + coeff_bits; + return ssd + bits * encoder_state->global->cur_lambda_cost; } @@ -925,7 +917,7 @@ static int cu_rd_cost_chroma(const encoder_state *const encoder_state, * \param intra_mode Intra prediction mode. * \param cost_treshold RD cost at which search can be stopped. 
*/ -static int32_t search_intra_trdepth(encoder_state * const encoder_state, +static double search_intra_trdepth(encoder_state * const encoder_state, int x_px, int y_px, int depth, int max_depth, int intra_mode, int cost_treshold, const cu_info *const pred_cu, @@ -938,8 +930,8 @@ static int32_t search_intra_trdepth(encoder_state * const encoder_state, pixel nosplit_pixels[TR_MAX_WIDTH*TR_MAX_WIDTH]; - int32_t split_cost = INT32_MAX; - int32_t nosplit_cost = INT32_MAX; + double split_cost = INT32_MAX; + double nosplit_cost = INT32_MAX; assert(width >= TR_MIN_WIDTH); @@ -962,7 +954,7 @@ static int32_t search_intra_trdepth(encoder_state * const encoder_state, } if (depth < max_depth && depth < MAX_PU_DEPTH) { - split_cost = 3 * (int32_t)(encoder_state->global->cur_lambda_cost + 0.5); + split_cost = 3 * encoder_state->global->cur_lambda_cost; split_cost += search_intra_trdepth(encoder_state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); if (split_cost < nosplit_cost) { @@ -1200,7 +1192,8 @@ static void search_intra_rdo(encoder_state * const encoder_state, // Reset transform split data in lcu.cu for this area. lcu_set_trdepth(lcu, x_px, y_px, depth, depth); - costs[rdo_mode] += search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); + double mode_cost = search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); + costs[rdo_mode] += (uint32_t)(0.5 + mode_cost); } } From c164978e210ec7090b40df919ad232a9382c160e Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 17 Sep 2014 12:09:15 +0300 Subject: [PATCH 03/28] Add FULL_CU_SPLIT_SEARCH macro for disabling cu split optimization. 
--- src/search.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index a0fb37c7..efdbeae6 100644 --- a/src/search.c +++ b/src/search.c @@ -47,6 +47,13 @@ && (x) + (block_width) <= (width) \ && (y) + (block_height) <= (height)) +#ifndef CU_SPLIT_COST +# define CU_SPLIT_COST 9 +#endif +#ifndef FULL_CU_SPLIT_SEARCH +# define FULL_CU_SPLIT_SEARCH false +#endif + /** * This is used in the hexagon_search to select 3 points to search. * @@ -1404,11 +1411,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept // Bitcost cost += (cur_cu->type == CU_INTER ? cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5); } - -#ifndef CU_SPLIT_COST -# define CU_SPLIT_COST 9 -#endif - + // Recursively split all the way to max search depth. if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) { int half_cu = cu_width / 2; @@ -1419,7 +1422,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting // might not give any better results but takes more time to do. - if(cur_cu->type == CU_NOTSET || cbf) { + if (cur_cu->type == CU_NOTSET || cbf || FULL_CU_SPLIT_SEARCH) { split_cost += search_cu(encoder_state, x, y, depth + 1, work_tree); split_cost += search_cu(encoder_state, x + half_cu, y, depth + 1, work_tree); split_cost += search_cu(encoder_state, x, y + half_cu, depth + 1, work_tree); From a0ab469c89fe9973066ef8db7da3b7a69ee65750 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 17 Sep 2014 12:26:26 +0300 Subject: [PATCH 04/28] Disable rdo_cost_intra. 
--- src/search.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search.c b/src/search.c index efdbeae6..a9765351 100644 --- a/src/search.c +++ b/src/search.c @@ -1184,7 +1184,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, int rdo_bitcost = intra_pred_ratecost(modes[rdo_mode], intra_preds); costs[rdo_mode] = rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5); - if (tr_depth == depth) { + if (0 && tr_depth == depth) { // The reconstruction is calculated again here, it could be saved from before.. intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[rdo_mode], 0); costs[rdo_mode] += rdo_cost_intra(encoder_state, pred, orig_block, width, modes[rdo_mode], width == 4 ? 1 : 0); From ccc575e2c6e0814199b39523c3e442d36cb576fe Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Tue, 23 Sep 2014 09:47:59 +0300 Subject: [PATCH 05/28] Disable transform tree bits. --- src/search.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/search.c b/src/search.c index a9765351..2416b252 100644 --- a/src/search.c +++ b/src/search.c @@ -771,7 +771,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, && width > TR_MIN_WIDTH && pred_cu->part_size != SIZE_NxN) { - trtree_bits += 1; // split_transform_flag + //trtree_bits += 1; // split_transform_flag } if (tr_cu->tr_depth > depth) { @@ -787,7 +787,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, } if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) { - trtree_bits += 1; // cbf_luma + //trtree_bits += 1; // cbf_luma } unsigned ssd = 0; @@ -843,11 +843,11 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, // cbf_c bits are present only when log2TrafoSize > 2 if (tr_cu->tr_depth == depth) { // cbf_c bits are always present at transform depth 0. 
- trtree_bits += 2; + //trtree_bits += 2; } else { // cbf_c bits are not present if cbf has already been set to 0. - trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1); - trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1); + //trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1); + //trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1); } } else if (PU_INDEX(x_px / 4, y_px / 4) != 0) { // For MAX_PU_DEPTH calculate chroma for previous depth for the first From bc7d7d5cb6a68d541fbadf17ea48a2b4b796e840 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Tue, 23 Sep 2014 14:41:25 +0300 Subject: [PATCH 06/28] Add cu_info* as parameter to reconstruction functions. - This is required so these functions can be used for searching. When NULL is given they take the CU from LCU struct as they did previously. Conflicts: src/search.c --- src/intra.c | 33 +++++++++++++++++++-------------- src/intra.h | 4 ++-- src/search.c | 10 +++++----- src/transform.c | 28 ++++++++++++++++------------ src/transform.h | 4 ++-- 5 files changed, 44 insertions(+), 35 deletions(-) diff --git a/src/intra.c b/src/intra.c index b4e90b56..49236100 100644 --- a/src/intra.c +++ b/src/intra.c @@ -656,20 +656,22 @@ void intra_get_planar_pred(pixel* src, int32_t srcstride, uint32_t width, pixel* } } -void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu) +void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu) { const encoder_control * const encoder = encoder_state->encoder_control; const vector2d lcu_px = { x & 0x3f, y & 0x3f }; - cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH]; + if (cur_cu == NULL) { + cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; + } const int8_t width = LCU_WIDTH >> depth; if (depth == 0 || cur_cu->tr_depth > depth) { int offset = width / 2; - 
intra_recon_lcu_luma(encoder_state, x, y, depth+1, intra_mode, lcu); - intra_recon_lcu_luma(encoder_state, x + offset, y, depth+1, intra_mode, lcu); - intra_recon_lcu_luma(encoder_state, x, y + offset, depth+1, intra_mode, lcu); - intra_recon_lcu_luma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu); + intra_recon_lcu_luma(encoder_state, x, y, depth+1, intra_mode, NULL, lcu); + intra_recon_lcu_luma(encoder_state, x + offset, y, depth+1, intra_mode, NULL, lcu); + intra_recon_lcu_luma(encoder_state, x, y + offset, depth+1, intra_mode, NULL, lcu); + intra_recon_lcu_luma(encoder_state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu); if (depth < MAX_DEPTH) { cu_info *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) + (lcu_px.y>>3) *LCU_T_CU_WIDTH]; @@ -699,25 +701,28 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int intra_recon(encoder, rec_shift, width * 2 + 8, width, recbase_y, rec_stride, intra_mode, 0); - quantize_lcu_luma_residual(encoder_state, x, y, depth, lcu); + quantize_lcu_luma_residual(encoder_state, x, y, depth, NULL, lcu); } } -void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu) +void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu) { const encoder_control * const encoder = encoder_state->encoder_control; const vector2d lcu_px = { x & 0x3f, y & 0x3f }; - cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH]; const int8_t width = LCU_WIDTH >> depth; const int8_t width_c = (depth == MAX_PU_DEPTH ? 
width : width / 2); + if (cur_cu == NULL) { + cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; + } + if (depth == 0 || cur_cu->tr_depth > depth) { int offset = width / 2; - intra_recon_lcu_chroma(encoder_state, x, y, depth+1, intra_mode, lcu); - intra_recon_lcu_chroma(encoder_state, x + offset, y, depth+1, intra_mode, lcu); - intra_recon_lcu_chroma(encoder_state, x, y + offset, depth+1, intra_mode, lcu); - intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, intra_mode, lcu); + intra_recon_lcu_chroma(encoder_state, x, y, depth+1, intra_mode, NULL, lcu); + intra_recon_lcu_chroma(encoder_state, x + offset, y, depth+1, intra_mode, NULL, lcu); + intra_recon_lcu_chroma(encoder_state, x, y + offset, depth+1, intra_mode, NULL, lcu); + intra_recon_lcu_chroma(encoder_state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu); if (depth < MAX_DEPTH) { cu_info *cu_a = &lcu->cu[LCU_CU_OFFSET + ((lcu_px.x + offset)>>3) + (lcu_px.y>>3) *LCU_T_CU_WIDTH]; @@ -772,6 +777,6 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i 2); } - quantize_lcu_chroma_residual(encoder_state, x, y, depth, lcu); + quantize_lcu_chroma_residual(encoder_state, x, y, depth, NULL, lcu); } } diff --git a/src/intra.h b/src/intra.h index e01733ae..3df46c9f 100644 --- a/src/intra.h +++ b/src/intra.h @@ -48,7 +48,7 @@ void intra_get_angular_pred(const encoder_control *encoder, pixel* src, int32_t void intra_recon(const encoder_control *encoder, pixel* rec, int32_t rec_stride, uint32_t width, pixel* dst, int32_t dst_stride, int8_t mode, int8_t chroma); -void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu); -void intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, lcu_t *lcu); +void intra_recon_lcu_luma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu); +void 
intra_recon_lcu_chroma(encoder_state *encoder_state, int x, int y, int depth, int8_t intra_mode, cu_info *cur_cu, lcu_t *lcu); #endif diff --git a/src/search.c b/src/search.c index 2416b252..f993750e 100644 --- a/src/search.c +++ b/src/search.c @@ -944,7 +944,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state, if (depth > 0) { tr_cu->tr_depth = depth; - intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, lcu); + intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); // Clear cbf bits because they have been set by the reconstruction. @@ -1384,13 +1384,13 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept intra_mode, intra_mode_chroma, cur_cu->part_size); - intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); - intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); + intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]); + intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]); } else if (cur_cu->type == CU_INTER) { int cbf; inter_recon_lcu(encoder_state, encoder_state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]); - quantize_lcu_luma_residual(encoder_state, x, y, depth, &work_tree[depth]); - quantize_lcu_chroma_residual(encoder_state, x, y, depth, &work_tree[depth]); + quantize_lcu_luma_residual(encoder_state, x, y, depth, NULL, &work_tree[depth]); + quantize_lcu_chroma_residual(encoder_state, x, y, depth, NULL, &work_tree[depth]); cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); diff --git a/src/transform.c b/src/transform.c index daa6a11d..2b9042e2 100644 --- a/src/transform.c +++ b/src/transform.c @@ -523,12 +523,14 @@ int 
quantize_residual_trskip( * - lcu->cbf coded block flags for the area * - lcu->cu.intra[].tr_skip for the area */ -void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu) +void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, cu_info *cur_cu, lcu_t* lcu) { // we have 64>>depth transform size const vector2d lcu_px = {x & 0x3f, y & 0x3f}; const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4); - cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH]; + if (cur_cu == NULL) { + cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; + } const int8_t width = LCU_WIDTH>>depth; // Tell clang-analyzer what is up. For some reason it can't figure out from @@ -538,10 +540,10 @@ void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x, // Split transform and increase depth if (depth == 0 || cur_cu->tr_depth > depth) { int offset = width / 2; - quantize_lcu_luma_residual(encoder_state, x, y, depth+1, lcu); - quantize_lcu_luma_residual(encoder_state, x + offset, y, depth+1, lcu); - quantize_lcu_luma_residual(encoder_state, x, y + offset, depth+1, lcu); - quantize_lcu_luma_residual(encoder_state, x + offset, y + offset, depth+1, lcu); + quantize_lcu_luma_residual(encoder_state, x, y, depth+1, NULL, lcu); + quantize_lcu_luma_residual(encoder_state, x + offset, y, depth+1, NULL, lcu); + quantize_lcu_luma_residual(encoder_state, x, y + offset, depth+1, NULL, lcu); + quantize_lcu_luma_residual(encoder_state, x + offset, y + offset, depth+1, NULL, lcu); // Propagate coded block flags from child CUs to parent CU. 
if (depth < MAX_DEPTH) { @@ -605,13 +607,15 @@ void quantize_lcu_luma_residual(encoder_state * const encoder_state, int32_t x, } -void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, lcu_t* lcu) +void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x, int32_t y, const uint8_t depth, cu_info *cur_cu, lcu_t* lcu) { // we have 64>>depth transform size const vector2d lcu_px = {x & 0x3f, y & 0x3f}; const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4); - cu_info *cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x>>3) + (lcu_px.y>>3)*LCU_T_CU_WIDTH]; const int8_t width = LCU_WIDTH>>depth; + if (cur_cu == NULL) { + cur_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; + } // Tell clang-analyzer what is up. For some reason it can't figure out from // asserting just depth. @@ -620,10 +624,10 @@ void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x // Split transform and increase depth if (depth == 0 || cur_cu->tr_depth > depth) { int offset = width / 2; - quantize_lcu_chroma_residual(encoder_state, x, y, depth+1, lcu); - quantize_lcu_chroma_residual(encoder_state, x + offset, y, depth+1, lcu); - quantize_lcu_chroma_residual(encoder_state, x, y + offset, depth+1, lcu); - quantize_lcu_chroma_residual(encoder_state, x + offset, y + offset, depth+1, lcu); + quantize_lcu_chroma_residual(encoder_state, x, y, depth+1, NULL, lcu); + quantize_lcu_chroma_residual(encoder_state, x + offset, y, depth+1, NULL, lcu); + quantize_lcu_chroma_residual(encoder_state, x, y + offset, depth+1, NULL, lcu); + quantize_lcu_chroma_residual(encoder_state, x + offset, y + offset, depth+1, NULL, lcu); // Propagate coded block flags from child CUs to parent CU. 
if (depth < MAX_DEPTH) { diff --git a/src/transform.h b/src/transform.h index 1479e90e..e45b1fe6 100644 --- a/src/transform.h +++ b/src/transform.h @@ -46,7 +46,7 @@ void itransform2d(const encoder_control *encoder, int16_t *block,int16_t *coeff, int32_t get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset); -void quantize_lcu_luma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, lcu_t* lcu); -void quantize_lcu_chroma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, lcu_t* lcu); +void quantize_lcu_luma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, cu_info *cur_cu, lcu_t* lcu); +void quantize_lcu_chroma_residual(encoder_state *encoder_state, int32_t x, int32_t y, uint8_t depth, cu_info *cur_cu, lcu_t* lcu); #endif From 51662e1081b69d206d953432681fd66e29d98091 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Tue, 23 Sep 2014 15:17:56 +0300 Subject: [PATCH 07/28] Fix differences between cu_rd_cost_luma and rdo_cost_intra. 
--- src/intra.c | 2 +- src/rdo.c | 4 +++- src/search.c | 13 +++++++++++-- src/transform.c | 4 +++- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/intra.c b/src/intra.c index 49236100..3ccf0fb6 100644 --- a/src/intra.c +++ b/src/intra.c @@ -701,7 +701,7 @@ void intra_recon_lcu_luma(encoder_state * const encoder_state, int x, int y, int intra_recon(encoder, rec_shift, width * 2 + 8, width, recbase_y, rec_stride, intra_mode, 0); - quantize_lcu_luma_residual(encoder_state, x, y, depth, NULL, lcu); + quantize_lcu_luma_residual(encoder_state, x, y, depth, cur_cu, lcu); } } diff --git a/src/rdo.c b/src/rdo.c index ff493990..d3635f37 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -226,7 +226,9 @@ uint32_t rdo_cost_intra(encoder_state * const encoder_state, pixel *pred, pixel unsigned ssd = 0; // SSD between original and reconstructed for (i = 0; i < width*width; i++) { - int diff = temp_block[i]-block[i]; + //int diff = temp_block[i]-block[i]; + int diff = orig_block[i] - CLIP(0, 255, pred[i] + temp_block[i]); + ssd += diff*diff; } diff --git a/src/search.c b/src/search.c index f993750e..d1eb5b2d 100644 --- a/src/search.c +++ b/src/search.c @@ -927,7 +927,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, static double search_intra_trdepth(encoder_state * const encoder_state, int x_px, int y_px, int depth, int max_depth, int intra_mode, int cost_treshold, - const cu_info *const pred_cu, + cu_info *const pred_cu, lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; @@ -944,6 +944,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state, if (depth > 0) { tr_cu->tr_depth = depth; + pred_cu->tr_depth = depth; intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); @@ -1195,6 +1196,10 @@ static void search_intra_rdo(encoder_state * const encoder_state, pred_cu.type = CU_INTRA; pred_cu.part_size = 
((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); pred_cu.intra[0].mode = modes[rdo_mode]; + pred_cu.intra[1].mode = modes[rdo_mode]; + pred_cu.intra[2].mode = modes[rdo_mode]; + pred_cu.intra[3].mode = modes[rdo_mode]; + pred_cu.intra[0].mode_chroma = modes[rdo_mode]; // Reset transform split data in lcu.cu for this area. lcu_set_trdepth(lcu, x_px, y_px, depth, depth); @@ -1212,6 +1217,10 @@ static void search_intra_rdo(encoder_state * const encoder_state, pred_cu.type = CU_INTRA; pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); pred_cu.intra[0].mode = modes[0]; + pred_cu.intra[1].mode = modes[0]; + pred_cu.intra[2].mode = modes[0]; + pred_cu.intra[3].mode = modes[0]; + pred_cu.intra[0].mode_chroma = modes[0]; search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu); } } @@ -1294,7 +1303,7 @@ static int search_cu_intra(encoder_state * const encoder_state, } int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search); search_intra_rdo(encoder_state, - lcu_px.x, lcu_px.y, depth, + x_px, y_px, depth, ref_pixels, LCU_WIDTH, cu_in_rec_buffer, cu_width * 2 + 8, candidate_modes, diff --git a/src/transform.c b/src/transform.c index 2b9042e2..ee16d7b2 100644 --- a/src/transform.c +++ b/src/transform.c @@ -361,8 +361,10 @@ int quantize_residual(encoder_state *const encoder_state, // Quantize coeffs. (coeff -> quant_coeff) if (encoder_state->encoder_control->rdoq_enable) { + int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; + tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); rdoq(encoder_state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2), - scan_order, cur_cu->type, cur_cu->tr_depth-cur_cu->depth); + scan_order, cur_cu->type, tr_depth); } else { quant(encoder_state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 
0 : 2), scan_order, cur_cu->type); From 8a80845b914d89e60100240968c4c9abbd425f43 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 1 Oct 2014 12:31:10 +0300 Subject: [PATCH 08/28] Add chroma to transform split search. --- src/intra.c | 2 +- src/search.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/intra.c b/src/intra.c index 3ccf0fb6..b50eb457 100644 --- a/src/intra.c +++ b/src/intra.c @@ -777,6 +777,6 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i 2); } - quantize_lcu_chroma_residual(encoder_state, x, y, depth, NULL, lcu); + quantize_lcu_chroma_residual(encoder_state, x, y, depth, cur_cu, lcu); } } diff --git a/src/search.c b/src/search.c index d1eb5b2d..69b7b603 100644 --- a/src/search.c +++ b/src/search.c @@ -931,11 +931,17 @@ static double search_intra_trdepth(encoder_state * const encoder_state, lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; + const int width_c = width > TR_MIN_WIDTH ? width / 2 : width; + const int offset = width / 2; const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f }; cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; - pixel nosplit_pixels[TR_MAX_WIDTH*TR_MAX_WIDTH]; + struct { + pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; + pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH]; + pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH]; + } nosplit_pixels; double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; @@ -945,11 +951,16 @@ static double search_intra_trdepth(encoder_state * const encoder_state, if (depth > 0) { tr_cu->tr_depth = depth; pred_cu->tr_depth = depth; - intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); - nosplit_cost = cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); - // Clear cbf bits because they have been set by the reconstruction. 
- cbf_clear(&tr_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4)); + nosplit_cost = 0.0; + + intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); + nosplit_cost += cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + + if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) { + intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); + nosplit_cost += cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); + } // Early stop codition for the recursive search. // If the cost of any 1/4th of the transform is already larger than the @@ -958,7 +969,11 @@ static double search_intra_trdepth(encoder_state * const encoder_state, return nosplit_cost; } - pixels_blit(lcu->rec.y, nosplit_pixels, width, width, LCU_WIDTH, width); + pixels_blit(lcu->rec.y, nosplit_pixels.y, width, width, LCU_WIDTH, width); + if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) { + pixels_blit(lcu->rec.u, nosplit_pixels.u, width_c, width_c, LCU_WIDTH_C, width_c); + pixels_blit(lcu->rec.v, nosplit_pixels.v, width_c, width_c, LCU_WIDTH_C, width_c); + } } if (depth < max_depth && depth < MAX_PU_DEPTH) { @@ -985,7 +1000,11 @@ static double search_intra_trdepth(encoder_state * const encoder_state, // We only restore the pixel data and not coefficients or cbf data. // The only thing we really need are the border pixels. - pixels_blit(nosplit_pixels, lcu->rec.y, width, width, width, LCU_WIDTH); + pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); + if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) { + pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C); + pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C); + } return nosplit_cost; } From e1b801eb6ffe2f9b67ce79f5ab6915a0aa6693e4 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 1 Oct 2014 12:32:29 +0300 Subject: [PATCH 09/28] Add transform tree chroma cbf bits. 
--- src/search.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/src/search.c b/src/search.c index 69b7b603..68f8d561 100644 --- a/src/search.c +++ b/src/search.c @@ -767,14 +767,37 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); + bool split_transform_flag = tr_cu->tr_depth > depth; + + // Add cost of intra split flag on transform tree. + bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; + double tr_tree_bits = 0.0; if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH - && pred_cu->part_size != SIZE_NxN) + && !intra_split_flag) { - //trtree_bits += 1; // split_transform_flag + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, split_transform_flag); } - if (tr_cu->tr_depth > depth) { + // Add cost of cbf chroma bits on transform tree. + // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true + // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0 + // if this and any previous transform block has no chroma coefficients. + // When searching the first block we don't actually know the real values, + // so this will code cbf as 0 and not code the cbf at all for descendants. 
+ int tr_depth = depth - pred_cu->depth; + if (depth < MAX_PU_DEPTH) { // log2TrafoSize > 2 + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) { + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth)); + } + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) { + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth)); + } + } + + if (split_transform_flag) { int offset = width / 2; double sum = 0; @@ -989,6 +1012,35 @@ static double search_intra_trdepth(encoder_state * const encoder_state, if (split_cost < nosplit_cost) { split_cost += search_intra_trdepth(encoder_state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu); } + + double tr_split_bit = 0.0; + double cbf_bits = 0.0; + + bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; + if (depth >= 1 && depth <= 3 && !intra_split_flag) { + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); + tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); + } + + // Add cost of cbf chroma bits on transform tree. + // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true + // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0 + // if this and any previous transform block has no chroma coefficients. + // When searching the first block we don't actually know the real values, + // so this will code cbf as 0 and not code the cbf at all for descendants. 
+ int tr_depth = depth - pred_cu->depth; + if (depth < MAX_PU_DEPTH) { // log2TrafoSize > 2 + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) { + cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth)); + } + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) { + cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth)); + } + } + + double bits = tr_split_bit + cbf_bits; + split_cost += bits * encoder_state->global->cur_lambda_cost; } else { assert(width <= TR_MAX_WIDTH); } From 85dea10f3f56a90f534f619e29296db1162874b1 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 1 Oct 2014 16:51:49 +0300 Subject: [PATCH 10/28] Clean up transform split search. - Remove unnecessary checks and comment. --- src/search.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/search.c b/src/search.c index 68f8d561..10b3769a 100644 --- a/src/search.c +++ b/src/search.c @@ -960,6 +960,8 @@ static double search_intra_trdepth(encoder_state * const encoder_state, const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f }; cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; + const bool reconstruct_chroma = !(x_px & 4 || y_px & 4); + struct { pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH]; pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH]; @@ -980,7 +982,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state, intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); nosplit_cost += cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); - if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) { + if (reconstruct_chroma) { intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); nosplit_cost += cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); } @@ -993,12 +995,17 @@ static double 
search_intra_trdepth(encoder_state * const encoder_state, } pixels_blit(lcu->rec.y, nosplit_pixels.y, width, width, LCU_WIDTH, width); - if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) { + if (reconstruct_chroma) { pixels_blit(lcu->rec.u, nosplit_pixels.u, width_c, width_c, LCU_WIDTH_C, width_c); pixels_blit(lcu->rec.v, nosplit_pixels.v, width_c, width_c, LCU_WIDTH_C, width_c); } } + // Recurse further if all of the following: + // - Current depth is less than maximum depth of the search (max_depth). + // - Maximum transform hierarchy depth is constrained by clipping + // max_depth. + // - Min transform size hasn't been reached (MAX_PU_DEPTH). if (depth < max_depth && depth < MAX_PU_DEPTH) { split_cost = 3 * encoder_state->global->cur_lambda_cost; @@ -1016,8 +1023,9 @@ static double search_intra_trdepth(encoder_state * const encoder_state, double tr_split_bit = 0.0; double cbf_bits = 0.0; - bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; - if (depth >= 1 && depth <= 3 && !intra_split_flag) { + // Add bits for split_transform_flag = 1, because transform depth search bypasses + // the normal recursion in the cost functions. + if (depth >= 1 && depth <= 3) { const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1); } @@ -1028,8 +1036,9 @@ static double search_intra_trdepth(encoder_state * const encoder_state, // if this and any previous transform block has no chroma coefficients. // When searching the first block we don't actually know the real values, // so this will code cbf as 0 and not code the cbf at all for descendants. 
- int tr_depth = depth - pred_cu->depth; - if (depth < MAX_PU_DEPTH) { // log2TrafoSize > 2 + { + const uint8_t tr_depth = depth - pred_cu->depth; + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) { cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth)); @@ -1053,7 +1062,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state, // We only restore the pixel data and not coefficients or cbf data. // The only thing we really need are the border pixels. pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); - if (PU_INDEX(x_px >> 2, y_px >> 2) == 0) { + if (reconstruct_chroma) { pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C); pixels_blit(nosplit_pixels.v, lcu->rec.v, width_c, width_c, width_c, LCU_WIDTH_C); } From 296f142d9e258697ad14dff7bb7f0f22125325d0 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 1 Oct 2014 18:06:28 +0300 Subject: [PATCH 11/28] Retain coded block flag data during transform split search. 
--- src/search.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/search.c b/src/search.c index 10b3769a..7e5e0d9c 100644 --- a/src/search.c +++ b/src/search.c @@ -967,6 +967,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state, pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH]; pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH]; } nosplit_pixels; + cu_cbf_t nosplit_cbf; double split_cost = INT32_MAX; double nosplit_cost = INT32_MAX; @@ -979,10 +980,15 @@ static double search_intra_trdepth(encoder_state * const encoder_state, nosplit_cost = 0.0; + cbf_clear(&pred_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4)); + intra_recon_lcu_luma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); nosplit_cost += cu_rd_cost_luma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); if (reconstruct_chroma) { + cbf_clear(&pred_cu->cbf.u, depth); + cbf_clear(&pred_cu->cbf.v, depth); + intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, intra_mode, pred_cu, lcu); nosplit_cost += cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu); } @@ -994,6 +1000,8 @@ static double search_intra_trdepth(encoder_state * const encoder_state, return nosplit_cost; } + nosplit_cbf = pred_cu->cbf; + pixels_blit(lcu->rec.y, nosplit_pixels.y, width, width, LCU_WIDTH, width); if (reconstruct_chroma) { pixels_blit(lcu->rec.u, nosplit_pixels.u, width_c, width_c, LCU_WIDTH_C, width_c); @@ -1059,6 +1067,8 @@ static double search_intra_trdepth(encoder_state * const encoder_state, } else { lcu_set_trdepth(lcu, x_px, y_px, depth, depth); + pred_cu->cbf = nosplit_cbf; + // We only restore the pixel data and not coefficients or cbf data. // The only thing we really need are the border pixels. 
pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); @@ -1280,6 +1290,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, pred_cu.intra[2].mode = modes[rdo_mode]; pred_cu.intra[3].mode = modes[rdo_mode]; pred_cu.intra[0].mode_chroma = modes[rdo_mode]; + memset(&pred_cu.cbf, 0, sizeof(pred_cu.cbf)); // Reset transform split data in lcu.cu for this area. lcu_set_trdepth(lcu, x_px, y_px, depth, depth); @@ -1301,6 +1312,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, pred_cu.intra[2].mode = modes[0]; pred_cu.intra[3].mode = modes[0]; pred_cu.intra[0].mode_chroma = modes[0]; + memset(&pred_cu.cbf, 0, sizeof(pred_cu.cbf)); search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu); } } From 3b04d39db40a1302df6986aa834c35e387e7dbf4 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Thu, 2 Oct 2014 11:51:34 +0300 Subject: [PATCH 12/28] Take cabac bits into account on transform tree. --- src/search.c | 53 ++++++++++++++++++---------------------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/src/search.c b/src/search.c index 7e5e0d9c..f952a277 100644 --- a/src/search.c +++ b/src/search.c @@ -761,7 +761,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH]; double coeff_bits = 0; - double trtree_bits = 0; + double tr_tree_bits = 0; // Check that lcu is not in assert(x_px >= 0 && x_px < LCU_WIDTH); @@ -771,7 +771,6 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, // Add cost of intra split flag on transform tree. 
bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; - double tr_tree_bits = 0.0; if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag) @@ -780,23 +779,6 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, tr_tree_bits += CTX_ENTROPY_FBITS(ctx, split_transform_flag); } - // Add cost of cbf chroma bits on transform tree. - // All cbf bits are accumulated to pred_cu.cbf and cbf_is_set returns true - // if cbf is set at any level >= depth, so cbf chroma is assumed to be 0 - // if this and any previous transform block has no chroma coefficients. - // When searching the first block we don't actually know the real values, - // so this will code cbf as 0 and not code the cbf at all for descendants. - int tr_depth = depth - pred_cu->depth; - if (depth < MAX_PU_DEPTH) { // log2TrafoSize > 2 - const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth)); - } - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth)); - } - } - if (split_transform_flag) { int offset = width / 2; double sum = 0; @@ -806,7 +788,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, sum += cu_rd_cost_luma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu); sum += cu_rd_cost_luma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); - return sum + trtree_bits * encoder_state->global->cur_lambda_cost; + return sum + tr_tree_bits * encoder_state->global->cur_lambda_cost; } if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) { @@ -841,7 +823,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 0, luma_scan_mode); } - double bits = 
trtree_bits + coeff_bits; + double bits = tr_tree_bits + coeff_bits; return ssd + bits * encoder_state->global->cur_lambda_cost; } @@ -856,28 +838,29 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x / 4) + (lcu_px.y / 4)*LCU_T_CU_WIDTH]; - double trtree_bits = 0; + double tr_tree_bits = 0; double coeff_bits = 0; assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); - if (depth < MAX_PU_DEPTH) { - // cbf_c bits are present only when log2TrafoSize > 2 - if (tr_cu->tr_depth == depth) { - // cbf_c bits are always present at transform depth 0. - //trtree_bits += 2; - } else { - // cbf_c bits are not present if cbf has already been set to 0. - //trtree_bits += cbf_is_set(tr_cu->cbf.u, depth - 1); - //trtree_bits += cbf_is_set(tr_cu->cbf.v, depth - 1); - } - } else if (PU_INDEX(x_px / 4, y_px / 4) != 0) { + if (PU_INDEX(x_px / 4, y_px / 4) != 0) { // For MAX_PU_DEPTH calculate chroma for previous depth for the first // block and return 0 cost for all others. 
return 0; } + if (depth < MAX_PU_DEPTH) { + const int tr_depth = depth - pred_cu->depth; + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) { + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth)); + } + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) { + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth)); + } + } + if (tr_cu->tr_depth > depth) { int offset = LCU_WIDTH >> (depth + 1); int sum = 0; @@ -887,7 +870,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, sum += cu_rd_cost_chroma(encoder_state, x_px, y_px + offset, depth + 1, pred_cu, lcu); sum += cu_rd_cost_chroma(encoder_state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); - return sum + trtree_bits * encoder_state->global->cur_lambda_cost; + return sum + tr_tree_bits * encoder_state->global->cur_lambda_cost; } // Chroma SSD @@ -930,7 +913,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, coeff_bits += get_coeff_cost(encoder_state, coeff_temp, width, 2, scan_order); } - double bits = trtree_bits + coeff_bits; + double bits = tr_tree_bits + coeff_bits; return ssd + bits * encoder_state->global->cur_lambda_cost; } From 17473624d3c61c8aa29ff9836fc4f9018e60c483 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Thu, 2 Oct 2014 12:33:17 +0300 Subject: [PATCH 13/28] Add transform tree bit costs for cbf_luma. --- src/search.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index f952a277..7ffce895 100644 --- a/src/search.c +++ b/src/search.c @@ -756,6 +756,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, { const int rdo = encoder_state->encoder_control->rdo; const int width = LCU_WIDTH >> depth; + const uint8_t pu_index = PU_INDEX(x_px / 4, y_px / 4); // cur_cu is used for TU parameters. 
cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (x_px / 8) + (y_px / 8) * LCU_T_CU_WIDTH]; @@ -767,19 +768,19 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); - bool split_transform_flag = tr_cu->tr_depth > depth; + const uint8_t tr_depth = tr_cu->tr_depth - depth; - // Add cost of intra split flag on transform tree. + // Add transform_tree split_transform_flag bit cost. bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3; if (width <= TR_MAX_WIDTH && width > TR_MIN_WIDTH && !intra_split_flag) { const cabac_ctx *ctx = &(encoder_state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, split_transform_flag); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0); } - if (split_transform_flag) { + if (tr_depth > 0) { int offset = width / 2; double sum = 0; @@ -791,8 +792,14 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, return sum + tr_tree_bits * encoder_state->global->cur_lambda_cost; } - if (pred_cu->type == CU_INTRA || depth > pred_cu->depth) { - //trtree_bits += 1; // cbf_luma + // Add transform_tree cbf_luma bit cost. + if (pred_cu->type == CU_INTRA || + tr_depth > 0 || + cbf_is_set(tr_cu->cbf.u, depth) || + cbf_is_set(tr_cu->cbf.v, depth)) + { + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.qt_cbf_model_luma[!tr_depth]); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.y, depth + pu_index)); } unsigned ssd = 0; From 38b224cf693b68b026e1d6c8f098e67e34fb1dfb Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Mon, 6 Oct 2014 17:44:15 +0300 Subject: [PATCH 14/28] Change rest of cu split search costs to double. 
--- src/cu.h | 4 ++-- src/search.c | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/cu.h b/src/cu.h index 3518743e..2acdf163 100644 --- a/src/cu.h +++ b/src/cu.h @@ -46,7 +46,7 @@ typedef struct { */ typedef struct { - uint32_t cost; + double cost; uint32_t bitcost; int8_t mode; int8_t mode_chroma; @@ -58,7 +58,7 @@ typedef struct */ typedef struct { - uint32_t cost; + double cost; uint32_t bitcost; int16_t mv[2]; int16_t mvd[2]; diff --git a/src/search.c b/src/search.c index 7ffce895..13b051dc 100644 --- a/src/search.c +++ b/src/search.c @@ -1072,13 +1072,13 @@ static double search_intra_trdepth(encoder_state * const encoder_state, } -static void sort_modes(int8_t *modes, uint32_t *costs, int length) +static void sort_modes(int8_t *modes, double *costs, int length) { int i, j; for (i = 0; i < length; ++i) { j = i; while (j > 0 && costs[j] < costs[j - 1]) { - SWAP(costs[j], costs[j - 1], uint32_t); + SWAP(costs[j], costs[j - 1], double); SWAP(modes[j], modes[j - 1], int8_t); --j; } @@ -1089,7 +1089,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, pixel *orig, int32_t origstride, pixel *rec, int16_t recstride, int width, int8_t *intra_preds, - int8_t modes[35], uint32_t costs[35]) + int8_t modes[35], double costs[35]) { cost_pixel_nxn_func *cost_func = pixels_get_sad_func(width); @@ -1217,7 +1217,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, pixel *rec, int16_t recstride, int8_t *intra_preds, int modes_to_check, - int8_t modes[35], uint32_t costs[35], + int8_t modes[35], double costs[35], lcu_t *lcu) { const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + encoder_state->encoder_control->tr_depth_intra); @@ -1286,7 +1286,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, lcu_set_trdepth(lcu, x_px, y_px, depth, depth); double mode_cost = search_intra_trdepth(encoder_state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); - 
costs[rdo_mode] += (uint32_t)(0.5 + mode_cost); + costs[rdo_mode] += mode_cost; } } @@ -1312,7 +1312,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, * Update lcu to have best modes at this depth. * \return Cost of best mode. */ -static int search_cu_intra(encoder_state * const encoder_state, +static double search_cu_intra(encoder_state * const encoder_state, const int x_px, const int y_px, const int depth, lcu_t *lcu) { @@ -1358,7 +1358,7 @@ static int search_cu_intra(encoder_state * const encoder_state, unsigned pu_index = PU_INDEX(x_px >> 2, y_px >> 2); int8_t modes[35]; - uint32_t costs[35]; + double costs[35]; int8_t number_of_modes; bool skip_rough_search = (depth == 0 || encoder_state->encoder_control->rdo >= 3); if (!skip_rough_search) { @@ -1415,11 +1415,11 @@ static int search_cu_intra(encoder_state * const encoder_state, * - All the final data for the LCU gets eventually copied to depth 0, which * will be the final output of the recursion. */ -static int search_cu(encoder_state * const encoder_state, int x, int y, int depth, lcu_t work_tree[MAX_PU_DEPTH]) +static double search_cu(encoder_state * const encoder_state, int x, int y, int depth, lcu_t work_tree[MAX_PU_DEPTH]) { const videoframe * const frame = encoder_state->tile->frame; int cu_width = LCU_WIDTH >> depth; - int cost = MAX_INT; + double cost = MAX_INT; cu_info *cur_cu; int x_local = (x&0x3f), y_local = (y&0x3f); #ifdef _DEBUG @@ -1459,7 +1459,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept if (depth >= MIN_INTRA_SEARCH_DEPTH && depth <= MAX_INTRA_SEARCH_DEPTH) { - int mode_cost = search_cu_intra(encoder_state, x, y, depth, &work_tree[depth]); + double mode_cost = search_cu_intra(encoder_state, x, y, depth, &work_tree[depth]); if (mode_cost < cost) { cost = mode_cost; cur_cu->type = CU_INTRA; @@ -1507,7 +1507,7 @@ static int search_cu(encoder_state * const encoder_state, int x, int y, int dept if (depth < MAX_INTRA_SEARCH_DEPTH || 
(depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) { int half_cu = cu_width / 2; // Using Cost = lambda * 9 to compensate on the price of the split - int split_cost = (int)(encoder_state->global->cur_lambda_cost + 0.5) * CU_SPLIT_COST; + double split_cost = encoder_state->global->cur_lambda_cost * CU_SPLIT_COST; int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); // If skip mode was selected for the block, skip further search. From b6710e78934b6d8349514456c69f84c3c7583fc1 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Mon, 6 Oct 2014 16:48:50 +0300 Subject: [PATCH 15/28] Add cabac bits for cu split flag. --- src/search.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/search.c b/src/search.c index 13b051dc..b0a6ba1a 100644 --- a/src/search.c +++ b/src/search.c @@ -1509,6 +1509,18 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d // Using Cost = lambda * 9 to compensate on the price of the split double split_cost = encoder_state->global->cur_lambda_cost * CU_SPLIT_COST; int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); + + if (depth < MAX_DEPTH) { + vector2d lcu_cu = { x_local / 8, y_local / 8 }; + cu_info *cu_array = &(&work_tree[depth])->cu[LCU_CU_OFFSET]; + bool condA = x >= 8 && cu_array[(lcu_cu.x - 1) * lcu_cu.y * LCU_T_CU_WIDTH].depth > depth; + bool condL = y >= 8 && cu_array[lcu_cu.x * (lcu_cu.y - 1) * LCU_T_CU_WIDTH].depth > depth; + uint8_t split_model = condA + condL; + + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.split_flag_model[split_model]); + cost += CTX_ENTROPY_FBITS(ctx, 0); + split_cost += CTX_ENTROPY_FBITS(ctx, 1); + } // If skip mode was selected for the block, skip further search. 
// Skip mode means there's no coefficients in the block, so splitting From 49ad845c3356e6f96010f81359b9dc191cc92246 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Mon, 6 Oct 2014 19:19:51 +0300 Subject: [PATCH 16/28] Add cabac bits for part_mode. --- src/search.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/search.c b/src/search.c index b0a6ba1a..822b1563 100644 --- a/src/search.c +++ b/src/search.c @@ -1522,6 +1522,12 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d split_cost += CTX_ENTROPY_FBITS(ctx, 1); } + if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.part_size_model[0]); + cost += CTX_ENTROPY_FBITS(ctx, 1); // 2Nx2N + split_cost += CTX_ENTROPY_FBITS(ctx, 0); // NxN + } + // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting // might not give any better results but takes more time to do. From cbb2aa75b74c910e170283234a072f9a1dce8872 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Mon, 6 Oct 2014 21:46:12 +0300 Subject: [PATCH 17/28] Add macros for adjusting weight of distortion between luma and chroma. - Everything needs to have a short name because windows has a maximum path length limitation that is breaking my testing framework. --- src/encoderstate.c | 5 +++++ src/search.c | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index 2faf81ea..0a348bd6 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -43,6 +43,9 @@ #include "sao.h" #include "rdo.h" +#ifndef LMBD +# define LMBD 1.0 +#endif /*! 
\brief Initializes lambda-value for current QP @@ -70,6 +73,8 @@ void encoder_state_init_lambda(encoder_state * const encoder_state) lambda *= 0.95; } + lambda *= LMBD; + encoder_state->global->cur_lambda_cost = lambda; encoder_state->global->cur_lambda_cost_sqrt = sqrt(lambda); } diff --git a/src/search.c b/src/search.c index 822b1563..4ee39d43 100644 --- a/src/search.c +++ b/src/search.c @@ -47,13 +47,20 @@ && (x) + (block_width) <= (width) \ && (y) + (block_height) <= (height)) -#ifndef CU_SPLIT_COST -# define CU_SPLIT_COST 9 +#ifndef CUSPL +# define CUSPL 9 #endif #ifndef FULL_CU_SPLIT_SEARCH # define FULL_CU_SPLIT_SEARCH false #endif +#ifndef LMUL +# define LMUL 1.0 +#endif +#ifndef CMUL +# define CMUL 1.0 +#endif + /** * This is used in the hexagon_search to select 3 points to search. * @@ -831,7 +838,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, } double bits = tr_tree_bits + coeff_bits; - return ssd + bits * encoder_state->global->cur_lambda_cost; + return (double)ssd * LMUL + bits * encoder_state->global->cur_lambda_cost; } @@ -921,7 +928,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, } double bits = tr_tree_bits + coeff_bits; - return ssd + bits * encoder_state->global->cur_lambda_cost; + return (double)ssd * CMUL + bits * encoder_state->global->cur_lambda_cost; } @@ -1507,7 +1514,7 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) { int half_cu = cu_width / 2; // Using Cost = lambda * 9 to compensate on the price of the split - double split_cost = encoder_state->global->cur_lambda_cost * CU_SPLIT_COST; + double split_cost = encoder_state->global->cur_lambda_cost * CUSPL; int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); if (depth < MAX_DEPTH) { From 
28d1532578796e62b4cb0e43a5de1363fd76a1e4 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 8 Oct 2014 12:50:03 +0300 Subject: [PATCH 18/28] Make rd=1 use cabac for coeff cost estimation. --- src/search.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/src/search.c b/src/search.c index 4ee39d43..47112206 100644 --- a/src/search.c +++ b/src/search.c @@ -818,17 +818,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, } } - if (rdo == 1) { - int coeff_abs = 0; - - // Estimate coding cost to be 1.5 * summ of abs coeffs. - for (int y = y_px; y < y_px + width; ++y) { - for (int x = x_px; x < x_px + width; ++x) { - coeff_abs += abs((int)lcu->coeff.y[y * LCU_WIDTH + x]); - } - } - coeff_bits += 1.5 * coeff_abs; - } else if (rdo >= 2) { + if (rdo >= 1) { coefficient coeff_temp[32 * 32]; int8_t luma_scan_mode = get_scan_order(pred_cu->type, pred_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth); @@ -902,19 +892,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, } } - if (rdo == 1) { - int coeff_abs = 0; - - // Estimate coding cost to be 1.5 * summ of abs coeffs. - for (int y = lcu_px.y; y < lcu_px.y + width; ++y) { - for (int x = lcu_px.x; x < lcu_px.x + width; ++x) { - coeff_abs += abs((int)lcu->coeff.u[y * (LCU_WIDTH_C)+x]); - coeff_abs += abs((int)lcu->coeff.v[y * (LCU_WIDTH_C)+x]); - } - } - - coeff_bits = 1.5 * coeff_abs; - } else if (rdo >= 2) { + if (rdo >= 1) { coefficient coeff_temp[16 * 16]; int8_t scan_order = get_scan_order(pred_cu->type, pred_cu->intra[0].mode_chroma, depth); From f164a5ba794ffa262082529a7b0ee86b852fe6eb Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 8 Oct 2014 17:39:55 +0300 Subject: [PATCH 19/28] Add fast transform skip estimation to rough intra search. 
--- src/search.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/search.c b/src/search.c index 47112206..0fe9a505 100644 --- a/src/search.c +++ b/src/search.c @@ -60,6 +60,9 @@ #ifndef CMUL # define CMUL 1.0 #endif +#ifndef MN // fast tr_skip Magic Number +# define MN 0.0 +#endif /** * This is used in the hexagon_search to select 3 points to search. @@ -1070,13 +1073,31 @@ static void sort_modes(int8_t *modes, double *costs, int length) } } + +static unsigned get_cost(pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width) +{ + unsigned cost = satd_func(pred, orig_block); + if (MN != 0 && width == 4) { + // If the mode looks better with SAD than SATD it might be a good + // candidate for transform skip. How much better SAD has to be is + // controlled by MN. + unsigned sad_cost = MN * sad_func(pred, orig_block); + if (sad_cost < cost) { + cost = sad_cost; + } + } + return cost; +} + + static int8_t search_intra_rough(encoder_state * const encoder_state, pixel *orig, int32_t origstride, pixel *rec, int16_t recstride, int width, int8_t *intra_preds, int8_t modes[35], double costs[35]) { - cost_pixel_nxn_func *cost_func = pixels_get_sad_func(width); + cost_pixel_nxn_func *satd_func = pixels_get_satd_func(width); + cost_pixel_nxn_func *sad_func = pixels_get_sad_func(width); // Temporary block arrays pixel _pred[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT]; @@ -1099,7 +1120,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, int16_t x, y; for (y = -1; y < recstride; y++) { ref[1][y*recstride - 1] = rec[y*recstride - 1]; - } + } for (x = 0; x < recstride; x++) { ref[1][x - recstride] = rec[x - recstride]; } @@ -1127,7 +1148,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, // the recursive search. 
for (int mode = 2; mode <= 34; mode += offset) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = cost_func(pred, orig_block); + costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; min_cost = MIN(min_cost, costs[modes_selected]); @@ -1147,7 +1168,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, int8_t mode = modes[0] - offset; if (mode >= 2) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = cost_func(pred, orig_block); + costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; } @@ -1155,7 +1176,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, mode = modes[0] + offset; if (mode <= 34) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = cost_func(pred, orig_block); + costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; } @@ -1178,7 +1199,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, if (!has_mode) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = cost_func(pred, orig_block); + costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; } From 7a5cf5d8651986c7f692677b2618b74c78d389e4 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Thu, 9 Oct 2014 19:08:47 +0300 Subject: [PATCH 20/28] Add trskip mode cost to fast trskip mode decision. 
--- src/search.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/search.c b/src/search.c index 0fe9a505..88e1ca9d 100644 --- a/src/search.c +++ b/src/search.c @@ -1074,19 +1074,21 @@ static void sort_modes(int8_t *modes, double *costs, int length) } -static unsigned get_cost(pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width) +static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width) { - unsigned cost = satd_func(pred, orig_block); + double satd_cost = satd_func(pred, orig_block); if (MN != 0 && width == 4) { // If the mode looks better with SAD than SATD it might be a good // candidate for transform skip. How much better SAD has to be is // controlled by MN. - unsigned sad_cost = MN * sad_func(pred, orig_block); - if (sad_cost < cost) { - cost = sad_cost; + const cabac_ctx *ctx = &encoder_state->cabac.ctx.transform_skip_model_luma; + double trskip_cost = encoder_state->global->cur_lambda_cost_sqrt * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); + double sad_cost = MN * sad_func(pred, orig_block) + trskip_cost; + if (sad_cost < satd_cost) { + return sad_cost; } } - return cost; + return satd_cost; } @@ -1148,7 +1150,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, // the recursive search. 
for (int mode = 2; mode <= 34; mode += offset) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); + costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; min_cost = MIN(min_cost, costs[modes_selected]); @@ -1168,7 +1170,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, int8_t mode = modes[0] - offset; if (mode >= 2) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); + costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; } @@ -1176,7 +1178,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, mode = modes[0] + offset; if (mode <= 34) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); + costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; } @@ -1199,7 +1201,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, if (!has_mode) { intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, mode, 0); - costs[modes_selected] = get_cost(pred, orig_block, satd_func, sad_func, width); + costs[modes_selected] = get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); modes[modes_selected] = mode; ++modes_selected; } From a469c059a59b3674d1532fd7bf716d35e9b1671a Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Mon, 13 Oct 2014 10:48:39 +0300 Subject: [PATCH 21/28] Take chroma tr-skip bits into account. 
--- src/search.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 88e1ca9d..e459bad8 100644 --- a/src/search.c +++ b/src/search.c @@ -1082,8 +1082,10 @@ static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel * // candidate for transform skip. How much better SAD has to be is // controlled by MN. const cabac_ctx *ctx = &encoder_state->cabac.ctx.transform_skip_model_luma; - double trskip_cost = encoder_state->global->cur_lambda_cost_sqrt * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); - double sad_cost = MN * sad_func(pred, orig_block) + trskip_cost; + double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0); + ctx = &encoder_state->cabac.ctx.transform_skip_model_chroma; + trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); + double sad_cost = MN * sad_func(pred, orig_block) + encoder_state->global->cur_lambda_cost_sqrt * trskip_bits; if (sad_cost < satd_cost) { return sad_cost; } From c9e212ba92d0c79f2de5d3d60ba040d7562c41b4 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 15 Oct 2014 16:01:58 +0300 Subject: [PATCH 22/28] Add intra chroma mode search. - Based on full chroma reconstruction so enabled only for --rd=2. 
--- src/intra.c | 4 +-- src/search.c | 90 +++++++++++++++++++++++++++++++++++++++++++++---- src/transform.c | 3 ++ 3 files changed, 89 insertions(+), 8 deletions(-) diff --git a/src/intra.c b/src/intra.c index b50eb457..ce4b0f1d 100644 --- a/src/intra.c +++ b/src/intra.c @@ -775,8 +775,8 @@ void intra_recon_lcu_chroma(encoder_state * const encoder_state, int x, int y, i rec_stride >> 1, intra_mode, 2); - } - quantize_lcu_chroma_residual(encoder_state, x, y, depth, cur_cu, lcu); + quantize_lcu_chroma_residual(encoder_state, x, y, depth, cur_cu, lcu); + } } } diff --git a/src/search.c b/src/search.c index e459bad8..b02db394 100644 --- a/src/search.c +++ b/src/search.c @@ -1060,6 +1060,68 @@ static double search_intra_trdepth(encoder_state * const encoder_state, } +static double chroma_mode_bits(const encoder_state *encoder_state, int8_t chroma_mode, int8_t luma_mode) +{ + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.chroma_pred_model[0]); + double mode_bits; + if (chroma_mode == luma_mode) { + mode_bits = CTX_ENTROPY_FBITS(ctx, 0); + } else { + mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1); + } + + return mode_bits; +} + + +static int8_t search_intra_chroma(encoder_state * const encoder_state, + int x_px, int y_px, int depth, + int8_t intra_mode, + lcu_t *const lcu) +{ + const bool reconstruct_chroma = !(x_px & 4 || y_px & 4); + + if (reconstruct_chroma) { + const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f }; + cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; + + int8_t chroma_modes[5] = { 0, 26, 10, 1, intra_mode }; + const int8_t num_chroma_modes = 5; + + if (intra_mode == 0 || intra_mode == 26 || intra_mode == 10 || intra_mode == 1) { + chroma_modes[4] = 34; + } + + struct { + double cost; + int8_t mode; + } chroma, best_chroma; + + best_chroma.mode = 0; + best_chroma.cost = MAX_INT; + + for (int8_t chroma_mode_i = 0; chroma_mode_i < num_chroma_modes; ++chroma_mode_i) { + chroma.mode = 
chroma_modes[chroma_mode_i]; + + intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, chroma.mode, NULL, lcu); + chroma.cost = cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); + + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.chroma_pred_model[0]); + double mode_bits = chroma_mode_bits(encoder_state, chroma.mode, intra_mode); + chroma.cost += mode_bits * encoder_state->global->cur_lambda_cost; + + if (chroma.cost < best_chroma.cost) { + best_chroma = chroma; + } + } + + return best_chroma.mode; + } + + return 100; +} + + static void sort_modes(int8_t *modes, double *costs, int length) { int i, j; @@ -1406,8 +1468,6 @@ static double search_cu_intra(encoder_state * const encoder_state, cur_cu->intra[pu_index].mode = modes[0]; cur_cu->intra[pu_index].cost = costs[0]; cur_cu->intra[pu_index].bitcost = intra_pred_ratecost(modes[0], candidate_modes); - - cur_cu->intra[0].mode_chroma = cur_cu->intra[0].mode; } return cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].cost; @@ -1480,13 +1540,22 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d // mode search of adjacent CUs. 
if (cur_cu->type == CU_INTRA) { int8_t intra_mode = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode; - int8_t intra_mode_chroma = cur_cu->intra[0].mode_chroma; lcu_set_intra_mode(&work_tree[depth], x, y, depth, intra_mode, - intra_mode_chroma, + 100, cur_cu->part_size); intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]); - intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]); + + if (PU_INDEX(x >> 2, y >> 2) == 0) { + int8_t intra_mode_chroma = intra_mode; + if (encoder_state->encoder_control->rdo >= 2) { + intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); + } + lcu_set_intra_mode(&work_tree[depth], x, y, depth, + intra_mode, intra_mode_chroma, + cur_cu->part_size); + intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode_chroma, NULL, &work_tree[depth]); + } } else if (cur_cu->type == CU_INTER) { int cbf; inter_recon_lcu(encoder_state, encoder_state->global->ref->images[cur_cu->inter.mv_ref], x, y, LCU_WIDTH>>depth, cur_cu->inter.mv, &work_tree[depth]); @@ -1509,8 +1578,17 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d cost = cu_rd_cost_luma(encoder_state, x_local, y_local, depth, cur_cu, &work_tree[depth]); cost += cu_rd_cost_chroma(encoder_state, x_local, y_local, depth, cur_cu, &work_tree[depth]); + double mode_bits; // Bitcost - cost += (cur_cu->type == CU_INTER ? 
cur_cu->inter.bitcost : cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost) * (int32_t)(encoder_state->global->cur_lambda_cost+0.5); + if (cur_cu->type == CU_INTER) { + mode_bits = cur_cu->inter.bitcost; + } else { + mode_bits = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost; + if (PU_INDEX(x >> 2, y >> 2) == 0) { + mode_bits += chroma_mode_bits(encoder_state, cur_cu->intra[0].mode_chroma, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode); + } + } + cost += mode_bits * encoder_state->global->cur_lambda_cost; } // Recursively split all the way to max search depth. diff --git a/src/transform.c b/src/transform.c index ee16d7b2..5c10e401 100644 --- a/src/transform.c +++ b/src/transform.c @@ -650,6 +650,9 @@ void quantize_lcu_chroma_residual(encoder_state * const encoder_state, int32_t x // If luma is 4x4, do chroma for the 8x8 luma area when handling the top // left PU because the coordinates are correct. if (depth <= MAX_DEPTH || pu_index == 0) { + cbf_clear(&cur_cu->cbf.u, depth); + cbf_clear(&cur_cu->cbf.v, depth); + const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH_C; pixel *recbase_u = &lcu->rec.u[chroma_offset]; pixel *recbase_v = &lcu->rec.v[chroma_offset]; From 8a407b0313ee6d2b1475d4e17458478d29608a4d Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 15 Oct 2014 14:15:10 +0300 Subject: [PATCH 23/28] Estimate luma and chroma intra mode bits separately. - Remove cu_info.intra[].cost and bitcost as unnecessary. - Add luma_mode_bits to complement chroma_mode_bits and remove intra_pred_ratecost as unnecessary. Difference is that intra_pred_ratecost was more coarse and included chroma mode with the assumption that it would be the same as luma. 
--- src/cu.h | 2 -- src/rdo.c | 20 -------------------- src/rdo.h | 1 - src/search.c | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/cu.h b/src/cu.h index 2acdf163..77eff46a 100644 --- a/src/cu.h +++ b/src/cu.h @@ -46,8 +46,6 @@ typedef struct { */ typedef struct { - double cost; - uint32_t bitcost; int8_t mode; int8_t mode_chroma; int8_t tr_skip; //!< \brief transform skip flag diff --git a/src/rdo.c b/src/rdo.c index d3635f37..ef54d22c 100644 --- a/src/rdo.c +++ b/src/rdo.c @@ -127,26 +127,6 @@ const float f_entropy_bits[128] = }; -/** - * \brief Helper function to find intra merge costs - * \returns intra mode coding cost in bits - */ -uint32_t intra_pred_ratecost(int16_t mode, int8_t *intra_preds) -{ - // merge mode -1 means they are not used -> cost 0 - if(intra_preds[0] == -1) return 0; - - // First candidate needs only one bit and two other need two - if(intra_preds[0] == mode) { - return 1; - } else if(intra_preds[1] == mode || intra_preds[2] == mode) { - return 2; - } - // Without merging the cost is 5 bits - return 5; -} - - /** * \brief Function to compare RDO costs * \param rdo_costs array of current costs diff --git a/src/rdo.h b/src/rdo.h index 889f10d7..33954cf1 100644 --- a/src/rdo.h +++ b/src/rdo.h @@ -42,7 +42,6 @@ typedef struct extern const uint32_t g_go_rice_range[5]; extern const uint32_t g_go_rice_prefix_len[5]; -uint32_t intra_pred_ratecost(int16_t mode, int8_t *intra_preds); int intra_rdo_cost_compare(uint32_t *rdo_costs,int8_t rdo_modes_to_check, uint32_t cost); void rdoq(encoder_state *encoder_state, coefficient *coef, coefficient *dest_coeff, int32_t width, diff --git a/src/search.c b/src/search.c index b02db394..8941ec2c 100644 --- a/src/search.c +++ b/src/search.c @@ -1048,7 +1048,7 @@ static double search_intra_trdepth(encoder_state * const encoder_state, pred_cu->cbf = nosplit_cbf; // We only restore the pixel data and not coefficients or cbf data. 
- // The only thing we really need are the border pixels. + // The only thing we really need are the border pixels.intra_get_dir_luma_predictor pixels_blit(nosplit_pixels.y, lcu->rec.y, width, width, width, LCU_WIDTH); if (reconstruct_chroma) { pixels_blit(nosplit_pixels.u, lcu->rec.u, width_c, width_c, width_c, LCU_WIDTH_C); @@ -1060,6 +1060,30 @@ static double search_intra_trdepth(encoder_state * const encoder_state, } +static double luma_mode_bits(const encoder_state *encoder_state, int8_t luma_mode, const int8_t *intra_preds) +{ + double mode_bits; + + bool mode_in_preds = false; + for (int i = 0; i < 3; ++i) { + if (luma_mode == intra_preds[i]) { + mode_in_preds = true; + } + } + + const cabac_ctx *ctx = &(encoder_state->cabac.ctx.intra_mode_model); + mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds); + + if (mode_in_preds) { + mode_bits += ((luma_mode == intra_preds[0]) ? 1 : 2); + } else { + mode_bits += 5; + } + + return mode_bits; +} + + static double chroma_mode_bits(const encoder_state *encoder_state, int8_t chroma_mode, int8_t luma_mode) { const cabac_ctx *ctx = &(encoder_state->cabac.ctx.chroma_pred_model[0]); @@ -1275,7 +1299,7 @@ static int8_t search_intra_rough(encoder_state * const encoder_state, // affecting the halving search. 
int lambda_cost = (int)(encoder_state->global->cur_lambda_cost_sqrt + 0.5); for (int mode_i = 0; mode_i < modes_selected; ++mode_i) { - costs[mode_i] += lambda_cost * intra_pred_ratecost(modes[mode_i], intra_preds); + costs[mode_i] += lambda_cost * luma_mode_bits(encoder_state, modes[mode_i], intra_preds); } sort_modes(modes, costs, modes_selected); @@ -1334,7 +1358,7 @@ static void search_intra_rdo(encoder_state * const encoder_state, } for(rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) { - int rdo_bitcost = intra_pred_ratecost(modes[rdo_mode], intra_preds); + int rdo_bitcost = luma_mode_bits(encoder_state, modes[rdo_mode], intra_preds); costs[rdo_mode] = rdo_bitcost * (int)(encoder_state->global->cur_lambda_cost + 0.5); if (0 && tr_depth == depth) { @@ -1424,13 +1448,14 @@ static double search_cu_intra(encoder_state * const encoder_state, lcu); } + int8_t modes[35]; + double costs[35]; + // Find best intra mode for 2Nx2N. { pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; unsigned pu_index = PU_INDEX(x_px >> 2, y_px >> 2); - int8_t modes[35]; - double costs[35]; int8_t number_of_modes; bool skip_rough_search = (depth == 0 || encoder_state->encoder_control->rdo >= 3); if (!skip_rough_search) { @@ -1466,11 +1491,9 @@ static double search_cu_intra(encoder_state * const encoder_state, } cur_cu->intra[pu_index].mode = modes[0]; - cur_cu->intra[pu_index].cost = costs[0]; - cur_cu->intra[pu_index].bitcost = intra_pred_ratecost(modes[0], candidate_modes); } - return cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].cost; + return costs[0]; } @@ -1583,7 +1606,17 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d if (cur_cu->type == CU_INTER) { mode_bits = cur_cu->inter.bitcost; } else { - mode_bits = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].bitcost; + int8_t candidate_modes[3]; + { + lcu_t *lcu = &work_tree[depth]; + const vector2d lcu_px = { x & 0x3f, y & 0x3f }; + const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 
}; + const cu_info *left_cu = ((x >> 3) ? &cur_cu[-1] : NULL); + const cu_info *above_cu = ((lcu_cu.y) ? &cur_cu[-LCU_T_CU_WIDTH] : NULL); + intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu); + } + + mode_bits = luma_mode_bits(encoder_state, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode, candidate_modes); if (PU_INDEX(x >> 2, y >> 2) == 0) { mode_bits += chroma_mode_bits(encoder_state, cur_cu->intra[0].mode_chroma, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode); } From 3e6023dfb54dc377997e6366283bd26a6ea11960 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 15 Oct 2014 20:17:38 +0300 Subject: [PATCH 24/28] Rename search constants and set sane defaults. --- src/global.h | 2 +- src/search.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/global.h b/src/global.h index df74c657..22a2ecb6 100644 --- a/src/global.h +++ b/src/global.h @@ -71,7 +71,7 @@ typedef int16_t coefficient; # define MAX_INTRA_SEARCH_DEPTH 4 #endif #ifndef MIN_INTRA_SEARCH_DEPTH -# define MIN_INTRA_SEARCH_DEPTH 0 +# define MIN_INTRA_SEARCH_DEPTH 1 #endif // Maximum CU depth when descending form LCU level. diff --git a/src/search.c b/src/search.c index 8941ec2c..70b13d76 100644 --- a/src/search.c +++ b/src/search.c @@ -47,21 +47,28 @@ && (x) + (block_width) <= (width) \ && (y) + (block_height) <= (height)) -#ifndef CUSPL -# define CUSPL 9 +// Extra cost for CU split. +// Compensates for missing or incorrect bit costs. Must be recalculated if +// bits are added or removed from cu-tree search. +#ifndef CU_COST +# define CU_COST 3 #endif +// Disable early cu-split pruning. #ifndef FULL_CU_SPLIT_SEARCH # define FULL_CU_SPLIT_SEARCH false #endif - -#ifndef LMUL -# define LMUL 1.0 +// Modify weight of luma SSD. +#ifndef LUMA_MULT +# define LUMA_MULT 0.8 #endif -#ifndef CMUL -# define CMUL 1.0 +// Modify weight of chroma SSD. 
+#ifndef CHROMA_MULT +# define CHROMA_MULT 1.5 #endif -#ifndef MN // fast tr_skip Magic Number -# define MN 0.0 +// Normalize SAD for comparison against SATD to estimate transform skip +// for 4x4 blocks. +#ifndef TRSKIP_RATIO +# define TRSKIP_RATIO 1.7 #endif /** @@ -831,7 +838,7 @@ static double cu_rd_cost_luma(const encoder_state *const encoder_state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * LMUL + bits * encoder_state->global->cur_lambda_cost; + return (double)ssd * LUMA_MULT + bits * encoder_state->global->cur_lambda_cost; } @@ -909,7 +916,7 @@ static double cu_rd_cost_chroma(const encoder_state *const encoder_state, } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * CMUL + bits * encoder_state->global->cur_lambda_cost; + return (double)ssd * CHROMA_MULT + bits * encoder_state->global->cur_lambda_cost; } @@ -1163,15 +1170,15 @@ static void sort_modes(int8_t *modes, double *costs, int length) static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel *orig_block, cost_pixel_nxn_func *satd_func, cost_pixel_nxn_func *sad_func, int width) { double satd_cost = satd_func(pred, orig_block); - if (MN != 0 && width == 4) { + if (TRSKIP_RATIO != 0 && width == 4) { // If the mode looks better with SAD than SATD it might be a good // candidate for transform skip. How much better SAD has to be is - // controlled by MN. + // controlled by TRSKIP_RATIO. 
const cabac_ctx *ctx = &encoder_state->cabac.ctx.transform_skip_model_luma; double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0); ctx = &encoder_state->cabac.ctx.transform_skip_model_chroma; trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0)); - double sad_cost = MN * sad_func(pred, orig_block) + encoder_state->global->cur_lambda_cost_sqrt * trskip_bits; + double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + encoder_state->global->cur_lambda_cost_sqrt * trskip_bits; if (sad_cost < satd_cost) { return sad_cost; } @@ -1628,7 +1635,7 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d if (depth < MAX_INTRA_SEARCH_DEPTH || (depth < MAX_INTER_SEARCH_DEPTH && encoder_state->global->slicetype != SLICE_I)) { int half_cu = cu_width / 2; // Using Cost = lambda * 9 to compensate on the price of the split - double split_cost = encoder_state->global->cur_lambda_cost * CUSPL; + double split_cost = encoder_state->global->cur_lambda_cost * CU_COST; int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); if (depth < MAX_DEPTH) { From d12dbd4aa0db85ae663325fa7a725df1e04afd48 Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 15 Oct 2014 22:11:45 +0300 Subject: [PATCH 25/28] Add fast intra chroma mode search. 
--- src/search.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 3 deletions(-) diff --git a/src/search.c b/src/search.c index 70b13d76..3dade9d3 100644 --- a/src/search.c +++ b/src/search.c @@ -1187,6 +1187,64 @@ static double get_cost(encoder_state * const encoder_state, pixel *pred, pixel * } + +static void search_intra_chroma_rough(encoder_state * const encoder_state, + int x_px, int y_px, int depth, + const pixel *orig_u, const pixel *orig_v, int16_t origstride, + const pixel *rec_u, const pixel *rec_v, int16_t recstride, + int8_t luma_mode, + int8_t modes[5], double costs[5]) +{ + const bool reconstruct_chroma = !(x_px & 4 || y_px & 4); + if (!reconstruct_chroma) return; + + const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); + const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f }; + + modes[0] = 0; + modes[1] = 26; + modes[2] = 10; + modes[3] = 1; + if (luma_mode == 0 || luma_mode == 26 || luma_mode == 10 || luma_mode == 1) { + modes[4] = 34; + } else { + modes[4] = luma_mode; + } + + cost_pixel_nxn_func *const satd_func = pixels_get_satd_func(width); + //cost_pixel_nxn_func *const sad_func = pixels_get_sad_func(width); + + pixel _pred[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT]; + pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT); + + pixel _orig_block[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT]; + pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); + + for (int i = 0; i < 5; ++i) { + costs[i] = encoder_state->global->cur_lambda_cost_sqrt * chroma_mode_bits(encoder_state, modes[i], luma_mode); + } + + // Chroma doesn't use filtered pixels, so filtered pixels pointer is NULL. 
+ const pixel *ref[2] = { rec_u, NULL }; + pixels_blit(orig_u, orig_block, width, width, origstride, width); + for (int i = 0; i < 5; ++i) { + intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 1); + //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); + costs[i] += satd_func(pred, orig_block); + } + + ref[0] = rec_v; + pixels_blit(orig_v, orig_block, width, width, origstride, width); + for (int i = 0; i < 5; ++i) { + intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 2); + //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); + costs[i] += satd_func(pred, orig_block); + } + + sort_modes(modes, costs, 5); +} + + static int8_t search_intra_rough(encoder_state * const encoder_state, pixel *orig, int32_t origstride, pixel *rec, int16_t recstride, @@ -1521,6 +1579,11 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; cu_info *cur_cu; + + const vector2d lcu_px = { x & 0x3f, y & 0x3f }; + const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 }; + lcu_t *const lcu = &work_tree[depth]; + int x_local = (x&0x3f), y_local = (y&0x3f); #ifdef _DEBUG int debug_split = 0; @@ -1578,6 +1641,42 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d if (PU_INDEX(x >> 2, y >> 2) == 0) { int8_t intra_mode_chroma = intra_mode; + + if (encoder_state->encoder_control->rdo >= 1) { + const videoframe * const frame = encoder_state->tile->frame; + + int8_t modes[5]; + double costs[5]; + + pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)]; + pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)]; + + const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); + const int16_t rec_stride = width_c * 2 + 8; + const int16_t out_stride = rec_stride; + + intra_build_reference_border(encoder_state->encoder_control, + x, y, 
out_stride, + rec_u, rec_stride, COLOR_U, + frame->width / 2, frame->height / 2, + lcu); + intra_build_reference_border(encoder_state->encoder_control, + x, y, out_stride, + rec_v, rec_stride, COLOR_V, + frame->width / 2, frame->height / 2, + lcu); + + vector2d lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 }; + pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + pixel *ref_v = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + + search_intra_chroma_rough(encoder_state, x, y, depth, + ref_u, ref_v, LCU_WIDTH_C, + &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride, + intra_mode, modes, costs); + intra_mode_chroma = modes[0]; + } + if (encoder_state->encoder_control->rdo >= 2) { intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); } @@ -1615,9 +1714,6 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d } else { int8_t candidate_modes[3]; { - lcu_t *lcu = &work_tree[depth]; - const vector2d lcu_px = { x & 0x3f, y & 0x3f }; - const vector2d lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 }; const cu_info *left_cu = ((x >> 3) ? &cur_cu[-1] : NULL); const cu_info *above_cu = ((lcu_cu.y) ? &cur_cu[-LCU_T_CU_WIDTH] : NULL); intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu); From 3cf5e422e8c67bccbdd999592fc1706c0399cf6a Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Wed, 15 Oct 2014 23:07:28 +0300 Subject: [PATCH 26/28] Make fast chroma mode search select modes for slower chroma search. 
--- src/search.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/search.c b/src/search.c index 3dade9d3..26bd65f4 100644 --- a/src/search.c +++ b/src/search.c @@ -1108,6 +1108,7 @@ static double chroma_mode_bits(const encoder_state *encoder_state, int8_t chroma static int8_t search_intra_chroma(encoder_state * const encoder_state, int x_px, int y_px, int depth, int8_t intra_mode, + int8_t modes[5], int8_t num_modes, lcu_t *const lcu) { const bool reconstruct_chroma = !(x_px & 4 || y_px & 4); @@ -1116,13 +1117,6 @@ static int8_t search_intra_chroma(encoder_state * const encoder_state, const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f }; cu_info *const tr_cu = &lcu->cu[LCU_CU_OFFSET + (lcu_px.x >> 3) + (lcu_px.y >> 3)*LCU_T_CU_WIDTH]; - int8_t chroma_modes[5] = { 0, 26, 10, 1, intra_mode }; - const int8_t num_chroma_modes = 5; - - if (intra_mode == 0 || intra_mode == 26 || intra_mode == 10 || intra_mode == 1) { - chroma_modes[4] = 34; - } - struct { double cost; int8_t mode; @@ -1131,8 +1125,8 @@ static int8_t search_intra_chroma(encoder_state * const encoder_state, best_chroma.mode = 0; best_chroma.cost = MAX_INT; - for (int8_t chroma_mode_i = 0; chroma_mode_i < num_chroma_modes; ++chroma_mode_i) { - chroma.mode = chroma_modes[chroma_mode_i]; + for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) { + chroma.mode = modes[chroma_mode_i]; intra_recon_lcu_chroma(encoder_state, x_px, y_px, depth, chroma.mode, NULL, lcu); chroma.cost = cu_rd_cost_chroma(encoder_state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu); @@ -1193,7 +1187,7 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state, const pixel *orig_u, const pixel *orig_v, int16_t origstride, const pixel *rec_u, const pixel *rec_v, int16_t recstride, int8_t luma_mode, - int8_t modes[5], double costs[5]) + int8_t modes[5], double costs[5], int num_modes) { const bool reconstruct_chroma = !(x_px & 4 || 
y_px & 4); if (!reconstruct_chroma) return; @@ -1211,6 +1205,14 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state, modes[4] = luma_mode; } + for (int i = 0; i < 5; ++i) { + costs[i] = 0; + } + + // If the number of modes is all of them, skip ordering them. + if (num_modes == 5) return; + + cost_pixel_nxn_func *const satd_func = pixels_get_satd_func(width); //cost_pixel_nxn_func *const sad_func = pixels_get_sad_func(width); @@ -1220,14 +1222,12 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state, pixel _orig_block[LCU_WIDTH * LCU_WIDTH + 1 + SIMD_ALIGNMENT]; pixel *orig_block = ALIGNED_POINTER(_orig_block, SIMD_ALIGNMENT); - for (int i = 0; i < 5; ++i) { - costs[i] = encoder_state->global->cur_lambda_cost_sqrt * chroma_mode_bits(encoder_state, modes[i], luma_mode); - } // Chroma doesn't use filtered pixels, so filtered pixels pointer is NULL. const pixel *ref[2] = { rec_u, NULL }; pixels_blit(orig_u, orig_block, width, width, origstride, width); for (int i = 0; i < 5; ++i) { + if (modes[i] == luma_mode) continue; intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 1); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); costs[i] += satd_func(pred, orig_block); @@ -1236,6 +1236,7 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state, ref[0] = rec_v; pixels_blit(orig_v, orig_block, width, width, origstride, width); for (int i = 0; i < 5; ++i) { + if (modes[i] == luma_mode) continue; intra_get_pred(encoder_state->encoder_control, ref, recstride, pred, width, modes[i], 2); //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width); costs[i] += satd_func(pred, orig_block); @@ -1641,8 +1642,12 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d if (PU_INDEX(x >> 2, y >> 2) == 0) { int8_t intra_mode_chroma = intra_mode; - - if (encoder_state->encoder_control->rdo >= 1) { + + // 
There is almost no benefit to doing the chroma mode search for + // rd2. Possibly because the luma mode search already takes chroma + // into account, so there is less of a chance of luma mode being + // really bad for chroma. + if (encoder_state->encoder_control->rdo < 2) { const videoframe * const frame = encoder_state->tile->frame; int8_t modes[5]; @@ -1670,16 +1675,22 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; pixel *ref_v = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + // The number of modes to select for slower chroma search. Luma mode + // is always one of the modes, so 2 means the final decision is made + // between luma mode and one other mode that looks the best + // according to search_intra_chroma_rough. + // When tested 2 modes is around -0.5% bdrate compared to 0 and 5 modes + // is around -0.8. + int num_modes = 2; + search_intra_chroma_rough(encoder_state, x, y, depth, ref_u, ref_v, LCU_WIDTH_C, &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride, - intra_mode, modes, costs); - intra_mode_chroma = modes[0]; + intra_mode, modes, costs, num_modes); + + intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, modes, num_modes, &work_tree[depth]); } - if (encoder_state->encoder_control->rdo >= 2) { - intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, &work_tree[depth]); - } lcu_set_intra_mode(&work_tree[depth], x, y, depth, intra_mode, intra_mode_chroma, cur_cu->part_size); From 02ec26fceaac09093fc9b5c0e54b5646ba52c93f Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Thu, 16 Oct 2014 00:42:22 +0300 Subject: [PATCH 27/28] Try different number of chroma intra modes for different depths. - And avoid doing extra work if no extra modes are tested for certain depths.
--- src/search.c | 92 ++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/src/search.c b/src/search.c index 26bd65f4..552b5bd1 100644 --- a/src/search.c +++ b/src/search.c @@ -1187,7 +1187,7 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state, const pixel *orig_u, const pixel *orig_v, int16_t origstride, const pixel *rec_u, const pixel *rec_v, int16_t recstride, int8_t luma_mode, - int8_t modes[5], double costs[5], int num_modes) + int8_t modes[5], double costs[5]) { const bool reconstruct_chroma = !(x_px & 4 || y_px & 4); if (!reconstruct_chroma) return; @@ -1195,24 +1195,10 @@ static void search_intra_chroma_rough(encoder_state * const encoder_state, const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); const vector2d lcu_px = { x_px & 0x3f, y_px & 0x3f }; - modes[0] = 0; - modes[1] = 26; - modes[2] = 10; - modes[3] = 1; - if (luma_mode == 0 || luma_mode == 26 || luma_mode == 10 || luma_mode == 1) { - modes[4] = 34; - } else { - modes[4] = luma_mode; - } - for (int i = 0; i < 5; ++i) { costs[i] = 0; } - // If the number of modes is all of them, skip ordering them. 
- if (num_modes == 5) return; - - cost_pixel_nxn_func *const satd_func = pixels_get_satd_func(width); //cost_pixel_nxn_func *const sad_func = pixels_get_sad_func(width); @@ -1636,7 +1622,7 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d int8_t intra_mode = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode; lcu_set_intra_mode(&work_tree[depth], x, y, depth, intra_mode, - 100, + intra_mode, cur_cu->part_size); intra_recon_lcu_luma(encoder_state, x, y, depth, intra_mode, NULL, &work_tree[depth]); @@ -1650,50 +1636,56 @@ static double search_cu(encoder_state * const encoder_state, int x, int y, int d if (encoder_state->encoder_control->rdo < 2) { const videoframe * const frame = encoder_state->tile->frame; - int8_t modes[5]; double costs[5]; - - pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)]; - pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)]; - - const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); - const int16_t rec_stride = width_c * 2 + 8; - const int16_t out_stride = rec_stride; - - intra_build_reference_border(encoder_state->encoder_control, - x, y, out_stride, - rec_u, rec_stride, COLOR_U, - frame->width / 2, frame->height / 2, - lcu); - intra_build_reference_border(encoder_state->encoder_control, - x, y, out_stride, - rec_v, rec_stride, COLOR_V, - frame->width / 2, frame->height / 2, - lcu); - - vector2d lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 }; - pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; - pixel *ref_v = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + int8_t modes[5] = { 0, 26, 10, 1, 34 }; + if (intra_mode != 0 && intra_mode != 26 && intra_mode != 10 && intra_mode != 1) { + modes[4] = intra_mode; + } // The number of modes to select for slower chroma search. Luma mode // is always one of the modes, so 2 means the final decision is made // between luma mode and one other mode that looks the best // according to search_intra_chroma_rough. 
- // When tested 2 modes is around -0.5% bdrate compared to 0 and 5 modes - // is around -0.8. - int num_modes = 2; + const int8_t modes_in_depth[5] = { 1, 1, 1, 1, 2 }; + int num_modes = modes_in_depth[depth]; - search_intra_chroma_rough(encoder_state, x, y, depth, - ref_u, ref_v, LCU_WIDTH_C, - &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride, - intra_mode, modes, costs, num_modes); + if (num_modes != 1 && num_modes != 5) { + pixel rec_u[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)]; + pixel rec_v[(LCU_WIDTH_C * 2 + 8) * (LCU_WIDTH_C * 2 + 8)]; - intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, modes, num_modes, &work_tree[depth]); + const int16_t width_c = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH); + const int16_t rec_stride = width_c * 2 + 8; + const int16_t out_stride = rec_stride; + + intra_build_reference_border(encoder_state->encoder_control, + x, y, out_stride, + rec_u, rec_stride, COLOR_U, + frame->width / 2, frame->height / 2, + lcu); + intra_build_reference_border(encoder_state->encoder_control, + x, y, out_stride, + rec_v, rec_stride, COLOR_V, + frame->width / 2, frame->height / 2, + lcu); + + vector2d lcu_cpx = { lcu_px.x / 2, lcu_px.y / 2 }; + pixel *ref_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + pixel *ref_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C]; + + search_intra_chroma_rough(encoder_state, x, y, depth, + ref_u, ref_v, LCU_WIDTH_C, + &rec_u[rec_stride + 1], &rec_v[rec_stride + 1], rec_stride, + intra_mode, modes, costs); + } + + if (num_modes > 1) { + intra_mode_chroma = search_intra_chroma(encoder_state, x, y, depth, intra_mode, modes, num_modes, &work_tree[depth]); + lcu_set_intra_mode(&work_tree[depth], x, y, depth, + intra_mode, intra_mode_chroma, + cur_cu->part_size); + } } - lcu_set_intra_mode(&work_tree[depth], x, y, depth, - intra_mode, intra_mode_chroma, - cur_cu->part_size); intra_recon_lcu_chroma(encoder_state, x, y, depth, intra_mode_chroma, NULL, &work_tree[depth]); } }
else if (cur_cu->type == CU_INTER) { From afb9e8c3f45297a430c8eae03e735349892a004e Mon Sep 17 00:00:00 2001 From: Ari Koivula Date: Thu, 16 Oct 2014 03:23:33 +0300 Subject: [PATCH 28/28] Remove extra parameter sets. --- src/encoder_state-bitstream.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 0157ce08..3f2d2383 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -689,24 +689,26 @@ static void encoder_state_write_bitstream_main(encoder_state * const main_state) } if (main_state->global->is_radl_frame) { - // Access Unit Delimiter (AUD) - if (encoder->aud_enable) - encoder_state_write_bitstream_aud(main_state); + if (main_state->global->frame == 0) { + // Access Unit Delimiter (AUD) + if (encoder->aud_enable) + encoder_state_write_bitstream_aud(main_state); - // Video Parameter Set (VPS) - nal_write(stream, NAL_VPS_NUT, 0, 1); - encoder_state_write_bitstream_vid_parameter_set(main_state); - bitstream_align(stream); + // Video Parameter Set (VPS) + nal_write(stream, NAL_VPS_NUT, 0, 1); + encoder_state_write_bitstream_vid_parameter_set(main_state); + bitstream_align(stream); - // Sequence Parameter Set (SPS) - nal_write(stream, NAL_SPS_NUT, 0, 1); - encoder_state_write_bitstream_seq_parameter_set(main_state); - bitstream_align(stream); + // Sequence Parameter Set (SPS) + nal_write(stream, NAL_SPS_NUT, 0, 1); + encoder_state_write_bitstream_seq_parameter_set(main_state); + bitstream_align(stream); - // Picture Parameter Set (PPS) - nal_write(stream, NAL_PPS_NUT, 0, 1); - encoder_state_write_bitstream_pic_parameter_set(main_state); - bitstream_align(stream); + // Picture Parameter Set (PPS) + nal_write(stream, NAL_PPS_NUT, 0, 1); + encoder_state_write_bitstream_pic_parameter_set(main_state); + bitstream_align(stream); + } if (main_state->global->frame == 0) { // Prefix SEI